In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# A DataFrame is essentially a collection of Series that share the same index

In [4]:
# Creating a small demo DataFrame filled with standard-normal random data

# Seed NumPy's global generator so that every Restart & Run All yields the same numbers.
# (Tables saved in this notebook may predate the seed; re-run the notebook to refresh them.)
SEED = 42
np.random.seed(SEED)

data = np.random.randn(5, 4)   # 5 rows x 4 columns
cols = ['A', 'B', 'C', 'D']

# Create the DataFrame (df) by passing the data and the column names separately;
# a DataFrame can also be created from a Python dictionary
df = pd.DataFrame(data=data, columns=cols)

In [5]:
# Columns A, B, C and D are each a Series object, and they all share the same index
# (the default integer index, because none was supplied when df was created)
df

Unnamed: 0,A,B,C,D
0,-0.023375,0.343985,-0.346258,-0.069926
1,0.246381,2.9905,-0.373869,-0.157011
2,-0.584769,0.463351,-2.462991,1.291161
3,-2.465684,-0.213666,0.407437,-1.96202
4,-0.340432,0.012877,-0.235633,-0.833351


In [6]:
# Replace the default integer index with custom row labels

index = ['U', 'V', 'W', 'X', 'Y']   # one label per row of df
df.index = index

In [7]:
# Display df again: the row labels are now U..Y instead of 0..4
df

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
V,0.246381,2.9905,-0.373869,-0.157011
W,-0.584769,0.463351,-2.462991,1.291161
X,-2.465684,-0.213666,0.407437,-1.96202
Y,-0.340432,0.012877,-0.235633,-0.833351


In [8]:
# Accessing a single column:
# index the DataFrame with the column label, just like looking up a key in a Python dictionary.
# The result is a Series.
df['A']

U   -0.023375
V    0.246381
W   -0.584769
X   -2.465684
Y   -0.340432
Name: A, dtype: float64

In [9]:
# Accessing multiple columns:
# pass a list of column labels; the result is a DataFrame holding only those columns

df[ ['A', 'B', 'D'] ]

Unnamed: 0,A,B,D
U,-0.023375,0.343985,-0.069926
V,0.246381,2.9905,-0.157011
W,-0.584769,0.463351,1.291161
X,-2.465684,-0.213666,-1.96202
Y,-0.340432,0.012877,-0.833351


In [10]:
# Creating a new column

# Assigning to a label that does not exist yet creates column E,
# filled with the element-wise sum of columns A and B
df['E'] = df['A'] + df['B']

In [11]:
# Display df to confirm that column E was appended
df

Unnamed: 0,A,B,C,D,E
U,-0.023375,0.343985,-0.346258,-0.069926,0.32061
V,0.246381,2.9905,-0.373869,-0.157011,3.236881
W,-0.584769,0.463351,-2.462991,1.291161,-0.121418
X,-2.465684,-0.213666,0.407437,-1.96202,-2.67935
Y,-0.340432,0.012877,-0.235633,-0.833351,-0.327554


In [12]:
# Deleting a column

# drop() removes rows by default (axis=0); naming the target through `columns=`
# makes it explicit that a column is being removed.
# drop() returns a new DataFrame with column E removed, so the result is assigned back
# to df (inplace=True would instead modify the existing object, which is discouraged).
df = df.drop(columns='E')

In [16]:
# Display df after dropping column E.
# NOTE: this cell's execution count (16) is higher than that of the row-drop cell below (14),
# so the saved output was captured after row X had already been removed. On a fresh
# top-to-bottom run, row X would still be present at this point.
df

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
V,0.246381,2.9905,-0.373869,-0.157011
W,-0.584769,0.463351,-2.462991,1.291161
Y,-0.340432,0.012877,-0.235633,-0.833351


In [14]:
# Deleting a particular row, here the one labelled 'X'

# `index=` targets row labels (the same thing the default axis=0 does);
# drop() returns a new DataFrame, so assign it back to df to keep the change
df = df.drop(index='X')

In [15]:
# Display df to confirm that row X is gone
df

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
V,0.246381,2.9905,-0.373869,-0.157011
W,-0.584769,0.463351,-2.462991,1.291161
Y,-0.340432,0.012877,-0.235633,-0.833351


In [17]:
# So far: accessing a column, adding a column, dropping a column and dropping a row

# Accessing a particular row by its index label with .loc; the row comes back as a Series
df.loc['U']

A   -0.023375
B    0.343985
C   -0.346258
D   -0.069926
Name: U, dtype: float64

In [18]:
# Accessing multiple rows: pass a list of index labels to .loc to get a DataFrame
df.loc[ ['U', 'Y'] ]

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
Y,-0.340432,0.012877,-0.235633,-0.833351


In [19]:
# Rows can also be accessed by integer position instead of by index label

# iloc means integer-based location;
# this returns the rows at positions 0 and 2 (labels U and W here)
df.iloc[ [0, 2] ]

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
W,-0.584769,0.463351,-2.462991,1.291161


In [20]:
# Accessing a subset of the data:
# selecting columns A, B and rows U, W

# Two-step selection: .loc first picks rows U and W and returns a DataFrame,
# then the second pair of brackets picks columns A and B from that intermediate result
df.loc[ ['U', 'W'] ][ ['A', 'B'] ]

Unnamed: 0,A,B
U,-0.023375,0.343985
W,-0.584769,0.463351


In [21]:
# Another way of doing the same selection

# NumPy-like indexing in a single .loc call: row labels first, then column labels
df.loc[ ['U', 'W'], ['A', 'B'] ]

Unnamed: 0,A,B
U,-0.023375,0.343985
W,-0.584769,0.463351


In [22]:
# Applying conditions to a DataFrame

# Comparing df with a scalar returns a DataFrame of booleans with the same shape
df > 0

Unnamed: 0,A,B,C,D
U,False,True,False,False
V,True,True,False,False
W,False,True,False,True
Y,False,True,False,False


In [23]:
# Using that boolean DataFrame to pick values

# Keeps the values where df > 0 (the scalar 0 is broadcast against every element);
# every entry that fails the condition shows up as NaN
df[ df > 0 ]

Unnamed: 0,A,B,C,D
U,,0.343985,,
V,0.246381,2.9905,,
W,,0.463351,,1.291161
Y,,0.012877,,


In [24]:
# Applying a condition to a specific column, which is the most common use case
# (the result is a boolean Series with one True/False value per row)

df['A'] > 0

U    False
V     True
W    False
Y    False
Name: A, dtype: bool

In [25]:
# Selecting all the rows where the value in column A is > 0 (boolean-mask row filtering)
df[ df['A'] > 0 ]

Unnamed: 0,A,B,C,D
V,0.246381,2.9905,-0.373869,-0.157011


In [26]:
# Selecting particular columns from the filtered rows

# A single .loc call takes the boolean row mask and the column list together,
# which avoids chained indexing through an intermediate DataFrame
df.loc[df['A'] > 0, ['A', 'C']]

Unnamed: 0,A,C
V,0.246381,-0.373869


In [30]:
# Applying multiple conditions to a DataFrame
# pandas requires each condition to be wrapped in its own parentheses, because & binds
# more tightly than the comparison operators
# df[ df['A'] < 0 & df['C'] < 0 ] -> this raises an error because the parentheses are missing

# selecting all rows where A < 0 and C < 0
df[ (df['A'] < 0) & (df['C'] < 0) ]

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
W,-0.584769,0.463351,-2.462991,1.291161
Y,-0.340432,0.012877,-0.235633,-0.833351


In [33]:
# selecting all rows where A < 0 or C < 0

df[ (df['A'] < 0) | (df['C'] < 0) ]

Unnamed: 0,A,B,C,D
U,-0.023375,0.343985,-0.346258,-0.069926
V,0.246381,2.9905,-0.373869,-0.157011
W,-0.584769,0.463351,-2.462991,1.291161
Y,-0.340432,0.012877,-0.235633,-0.833351


In [39]:
# Turning the current index into a regular column.
# reset_index() names that new column "index" by default and restores 0..n-1 row labels;
# it returns a new DataFrame, so the result is assigned back to df.
df = df.reset_index()

In [40]:
# Display df: the old row labels now live in a column named "index"
df

Unnamed: 0,index,A,B,C,D
0,U,-0.023375,0.343985,-0.346258,-0.069926
1,V,0.246381,2.9905,-0.373869,-0.157011
2,W,-0.584769,0.463351,-2.462991,1.291161
3,Y,-0.340432,0.012877,-0.235633,-0.833351


In [41]:
# To use an existing column as the index, pass the column's label to set_index().
# drop=False keeps "index" as a regular column as well (by default set_index() would
# remove it); the returned DataFrame is assigned back to df.
df = df.set_index('index', drop=False)

In [42]:
# Display df: "index" is now both the row index and a regular column
df

Unnamed: 0_level_0,index,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U,U,-0.023375,0.343985,-0.346258,-0.069926
V,V,0.246381,2.9905,-0.373869,-0.157011
W,W,-0.584769,0.463351,-2.462991,1.291161
Y,Y,-0.340432,0.012877,-0.235633,-0.833351


In [43]:
# Remove the now-redundant "index" column (its values already serve as the row index);
# drop() returns a new DataFrame, so assign it back to df
df = df.drop(columns='index')

In [44]:
# Display df to confirm that the duplicate "index" column is gone
df

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U,-0.023375,0.343985,-0.346258,-0.069926
V,0.246381,2.9905,-0.373869,-0.157011
W,-0.584769,0.463351,-2.462991,1.291161
Y,-0.340432,0.012877,-0.235633,-0.833351


In [45]:
# Getting summary information about the DataFrame:
# index range, column names, non-null counts, dtypes and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Y
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       4 non-null      float64
 2   C       4 non-null      float64
 3   D       4 non-null      float64
dtypes: float64(4)
memory usage: 160.0+ bytes


In [46]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,A,B,C,D
count,4.0,4.0,4.0,4.0
mean,-0.175549,0.952678,-0.854688,0.057718
std,0.363239,1.371847,1.073865,0.890278
min,-0.584769,0.012877,-2.462991,-0.833351
25%,-0.401516,0.261208,-0.896149,-0.326096
50%,-0.181903,0.403668,-0.360063,-0.113469
75%,0.044064,1.095138,-0.318602,0.270345
max,0.246381,2.9905,-0.235633,1.291161


In [47]:
# Adding one categorical column to show some more useful and commonly used methods

# The first row gets 'first_category' and every remaining row gets 'second_category'.
# The label list is sized from len(df) rather than hard-coded, so the assignment keeps
# working if the number of rows changes (a length mismatch would raise a ValueError).
n_rows = len(df)
labels = ['first_category'] * 1 + ['second_category'] * (n_rows - 1)
df['categorical'] = labels[:n_rows]   # the slice also covers an empty frame

In [48]:
# Display df with the new categorical column
df

Unnamed: 0_level_0,A,B,C,D,categorical
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U,-0.023375,0.343985,-0.346258,-0.069926,first_category
V,0.246381,2.9905,-0.373869,-0.157011,second_category
W,-0.584769,0.463351,-2.462991,1.291161,second_category
Y,-0.340432,0.012877,-0.235633,-0.833351,second_category


In [49]:
# Counting how many rows fall into each category (most frequent category first)
df['categorical'].value_counts()

second_category    3
first_category     1
Name: categorical, dtype: int64