Sort by index, sort by value, and select the n largest values

In [15]:
import pandas as pd
pd.options.display.width = 1000
import numpy as np

# Seed a local generator so the demo frame is identical on every
# Restart & Run All (the unseeded global np.random gave new values each run).
SEED = 42
rng = np.random.default_rng(SEED)

# The index labels are deliberately out of order so sort_index() has a visible effect.
df = pd.DataFrame(rng.standard_normal((4, 2)), index=[3, 1, 0, 2], columns=['c1', 'c2'])
print("The original DataFrame is not sorted")
print(df)
print("-------------------------------------")

# sort_index() returns a new frame ordered by index label; df is left as it was.
sorted_df = df.sort_index()
print("The original DataFrame is sorted by index")
print(sorted_df)
print("-------------------------------------")

print("The original DataFrame is sorted descending by index")
sorted_df = df.sort_index(ascending=False)
print(sorted_df)

# access first row: .iloc[:1] selects by position (not by label) and
# yields a one-row DataFrame rather than a Series
first_row = df.iloc[:1]
print("Access first row")
print(first_row)
print(type(first_row))
# access index value of first row
print("Access index value of first row")
print(first_row.index.values)

# access c1 column of first row (a one-element Series named c1)
print("Access c1 column of first row")
print(first_row['c1'])

The original DataFrame is not sorted
         c1        c2
3 -0.402331 -0.706287
1  1.224897  2.752433
0  1.432855  1.173118
2  0.035276 -1.570964
-------------------------------------
The original DataFrame is sorted by index
         c1        c2
0  1.432855  1.173118
1  1.224897  2.752433
2  0.035276 -1.570964
3 -0.402331 -0.706287
-------------------------------------
The original DataFrame is sorted descending by index
         c1        c2
3 -0.402331 -0.706287
2  0.035276 -1.570964
1  1.224897  2.752433
0  1.432855  1.173118
Access first row
         c1        c2
3 -0.402331 -0.706287
<class 'pandas.core.frame.DataFrame'>
Access index value of first row
[3]
Access c1 column of first row
3   -0.402331
Name: c1, dtype: float64


In [2]:
# Sort on c1 first; c2 only decides the order of rows whose c1 values are equal.
print("The original DataFrame is sorted by multiple columns")
sorted_df = df.sort_values(['c1', 'c2'])
print(sorted_df)

The original DataFrame is sorted by multiple columns
         c1        c2
2 -2.683913 -0.322671
1 -1.263823 -0.751876
0  0.906307 -0.811958
3  2.256119 -0.896606


In [3]:
# Show the whole frame, then keep only the 3 rows whose c1 values are largest.
print(df)
print("The 3 largest values in column c1")
sorted_df = df.nlargest(n=3, columns='c1')
print(sorted_df)

         c1        c2
3  2.256119 -0.896606
1 -1.263823 -0.751876
0  0.906307 -0.811958
2 -2.683913 -0.322671
The 3 largest values in column c1
         c1        c2
3  2.256119 -0.896606
0  0.906307 -0.811958
1 -1.263823 -0.751876


Apply

In [4]:
# Small integer frame used to demonstrate element-wise Series.apply.
d = {
    "a": [1, 2, 3, 4],
    "b": [2, 3, 4, 5],
    "c": [3, 4, 5, 6],
}
df = pd.DataFrame(d)
print("The original DataFrame")
print(df)
print("--------------------------------------")
# Series.apply calls the lambda once for each element of column "a".
df["a_times_two"] = df["a"].apply(lambda value: value * 2)
print('A new column "a_times_two" is added')
print(df)

The original DataFrame
   a  b  c
0  1  2  3
1  2  3  4
2  3  4  5
3  4  5  6
--------------------------------------
A new column "a_times_two" is added
   a  b  c  a_times_two
0  1  2  3            2
1  2  3  4            4
2  3  4  5            6
3  4  5  6            8


In [6]:
# apply using function
def times_two(s):
    """Return ``s * 2``, i.e. the argument multiplied by two."""
    doubled = s * 2
    return doubled
# Same result as the lambda version: times_two is called once per element of "a".
df["a_times_two"] = df["a"].apply(func=times_two)
print('A new column "a_times_two" is added')
print(df)

A new column "a_times_two" is added
   a  b  c  a_times_two
0  1  2  3            2
1  2  3  4            4
2  3  4  5            6
3  4  5  6            8


To apply a function across multiple columns, pass axis=1 so that each entire row is sent to the function.

In [7]:
# Start again from a fresh three-column integer frame.
d = {
    "a": [1, 2, 3, 4],
    "b": [2, 3, 4, 5],
    "c": [3, 4, 5, 6],
}
df = pd.DataFrame(d)
print("The original DataFrame")
print(df)
print("--------------------------------------")
print("Column a and b can be accessed as attributes")
# With axis=1 the lambda receives one row at a time, so the row's
# column values can be read as attributes (r.a, r.b).
df["a_plus_b"] = df.apply(lambda r: r.a + r.b, axis=1)
print('A new column "a_plus_b" is added')
print(df)

The original DataFrame
   a  b  c
0  1  2  3
1  2  3  4
2  3  4  5
3  4  5  6
--------------------------------------
Column a and b can be accessed as attributes
A new column "a_plus_b" is added
   a  b  c  a_plus_b
0  1  2  3         3
1  2  3  4         5
2  3  4  5         7
3  4  5  6         9


In [8]:
# apply using function
def add_a_b(row):
    """Return the sum of the ``a`` and ``b`` attributes of ``row``."""
    first, second = row.a, row.b
    return first + second
# axis=1 makes DataFrame.apply pass one row at a time to add_a_b.
df["a_plus_b"] = df.apply(func=add_a_b, axis=1)
print('A new column "a_plus_b" is added')
print(df)

A new column "a_plus_b" is added
   a  b  c  a_plus_b
0  1  2  3         3
1  2  3  4         5
2  3  4  5         7
3  4  5  6         9


In [10]:
# Preferred approach: add the two columns directly as whole Series,
# with no per-row or per-element Python function call.
df["a_plus_b"] = df["a"] + df["b"]
print('A new column "a_plus_b" is added')
print(df)


A new column "a_plus_b" is added
   a  b  c  a_plus_b
0  1  2  3         3
1  2  3  4         5
2  3  4  5         7
3  4  5  6         9
