In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# A DataFrame is essentially a collection of Series objects that share the same index

In [4]:
# Create a small DataFrame filled with random standard-normal data.

# Seed NumPy's global RNG first so that "Restart Kernel -> Run All" regenerates
# exactly the same numbers every time.
SEED = 42
np.random.seed(SEED)

data = np.random.randn(5, 4)
cols = ['A', 'B', 'C', 'D']

# Build the DataFrame (df) by passing the data and the column names separately;
# a DataFrame can also be created from a Python dictionary.
df = pd.DataFrame(data=data, columns=cols)

In [5]:
# Columns A, B, C and D are each a Series object, and they all share the same index
df

Unnamed: 0,A,B,C,D
0,0.026068,-0.295724,-0.104821,-0.564814
1,-0.175219,1.408621,-0.097051,-0.661102
2,1.031701,1.296165,-1.1285,-0.489374
3,-2.745024,-0.233757,0.477045,0.776651
4,-0.756652,0.806423,0.579384,0.177297


In [6]:
# Replace the default integer index with custom row labels

index = list('UVWXY')
df.index = index

In [7]:
# Display df again: the rows are now labelled U..Y instead of 0..4
df

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
V,-0.175219,1.408621,-0.097051,-0.661102
W,1.031701,1.296165,-1.1285,-0.489374
X,-2.745024,-0.233757,0.477045,0.776651
Y,-0.756652,0.806423,0.579384,0.177297


In [8]:
# Access a single column with dictionary-style bracket notation;
# the result is a Series (note the `Name: A, dtype: float64` footer in the output)
df['A']

U    0.026068
V   -0.175219
W    1.031701
X   -2.745024
Y   -0.756652
Name: A, dtype: float64

In [9]:
# Select several columns at once by passing a list of column names
df[['A', 'B', 'D']]

Unnamed: 0,A,B,D
U,0.026068,-0.295724,-0.564814
V,-0.175219,1.408621,-0.661102
W,1.031701,1.296165,-0.489374
X,-2.745024,-0.233757,0.776651
Y,-0.756652,0.806423,0.177297


In [10]:
# Create a new column by assigning to a column name that does not exist yet.
# Column E holds the element-wise sum of columns A and B.
df['E'] = df['A'].add(df['B'])

In [11]:
# Display df to check the newly added column E
df

Unnamed: 0,A,B,C,D,E
U,0.026068,-0.295724,-0.104821,-0.564814,-0.269656
V,-0.175219,1.408621,-0.097051,-0.661102,1.233403
W,1.031701,1.296165,-1.1285,-0.489374,2.327866
X,-2.745024,-0.233757,0.477045,0.776651,-2.978781
Y,-0.756652,0.806423,0.579384,0.177297,0.049771


In [12]:
# Delete a column.
# drop() removes rows by default (axis=0); `columns=` targets columns instead (same as axis=1).
# drop() normally returns a new DataFrame without column E; inplace=True modifies df itself.
df.drop(columns='E', inplace=True)

In [13]:
# Display df to confirm that column E is gone
df

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
V,-0.175219,1.408621,-0.097051,-0.661102
W,1.031701,1.296165,-1.1285,-0.489374
X,-2.745024,-0.233757,0.477045,0.776651
Y,-0.756652,0.806423,0.579384,0.177297


In [14]:
# Delete a single row, here the one labelled 'X'.
# `index=` targets row labels, which is also what drop() does by default (axis=0).
df.drop(index='X', inplace=True)

In [15]:
# Display df to confirm that row X is gone
df

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
V,-0.175219,1.408621,-0.097051,-0.661102
W,1.031701,1.296165,-1.1285,-0.489374
Y,-0.756652,0.806423,0.579384,0.177297


In [16]:
# So far: accessing a column, adding a column, dropping a column and dropping a row

# Access a single row by its index label with .loc; the row comes back as a Series
df.loc['U']

A    0.026068
B   -0.295724
C   -0.104821
D   -0.564814
Name: U, dtype: float64

In [17]:
# Select several rows by label: a list of labels for the rows, ':' for all columns
df.loc[['U', 'Y'], :]

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
Y,-0.756652,0.806423,0.579384,0.177297


In [18]:
# Rows can also be selected by integer position instead of by label.

# iloc = integer-based location; this picks the rows at positions 0 and 2 (all columns)
df.iloc[[0, 2], :]

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
W,1.031701,1.296165,-1.1285,-0.489374


In [19]:
# Select a subset of the data: rows U and W, columns A and B.

# Step 1: pick rows U and W, which yields an intermediate DataFrame
rows_uw = df.loc[['U', 'W']]
# Step 2: pick columns A and B from that intermediate result
rows_uw[['A', 'B']]

Unnamed: 0,A,B
U,0.026068,-0.295724
W,1.031701,1.296165


In [20]:
# Another way of doing this, in a single indexing step

# NumPy-like indexing: row labels first, then column labels
df.loc[ ['U', 'W'], ['A', 'B'] ]

Unnamed: 0,A,B
U,0.026068,-0.295724
W,1.031701,1.296165


In [21]:
# Apply a condition to the whole DataFrame

# The comparison is evaluated element-wise and returns a DataFrame of booleans
df > 0

Unnamed: 0,A,B,C,D
U,True,False,False,False
V,False,True,False,False
W,True,True,False,False
Y,False,True,True,True


In [22]:
# Use the boolean DataFrame as a mask.

# Entries where the condition holds are kept; every other entry becomes NaN
positive_mask = df > 0
df[positive_mask]

Unnamed: 0,A,B,C,D
U,0.026068,,,
V,,1.408621,,
W,1.031701,1.296165,,
Y,,0.806423,0.579384,0.177297


In [23]:
# Apply a condition to a specific column (the most common case in practice :D);
# the result is a boolean Series with one entry per row

df['A'] > 0

U     True
V    False
W     True
Y    False
Name: A, dtype: bool

In [24]:
# Keep only the rows whose value in column A is positive
a_is_positive = df['A'] > 0
df[a_is_positive]

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
W,1.031701,1.296165,-1.1285,-0.489374


In [25]:
# Filter rows and pick specific columns in one step.
# A single .loc call (row mask first, then column labels) avoids chained indexing
# such as df[mask][cols], which builds an intermediate DataFrame before selecting columns.
df.loc[df['A'] > 0, ['A', 'C']]

Unnamed: 0,A,C
U,0.026068,-0.104821
W,1.031701,-1.1285


In [26]:
# Combine multiple conditions.
# Each comparison must be wrapped in parentheses, because & binds more tightly than <:
# df[ df['A'] < 0 & df['C'] < 0 ] -> raises an error since the parentheses are missing

# Select all rows where A < 0 and C < 0
a_negative = df['A'] < 0
c_negative = df['C'] < 0
df[a_negative & c_negative]

Unnamed: 0,A,B,C,D
V,-0.175219,1.408621,-0.097051,-0.661102


In [27]:
# Select all rows where A < 0 or C < 0
a_negative = df['A'] < 0
c_negative = df['C'] < 0
df[a_negative | c_negative]

Unnamed: 0,A,B,C,D
U,0.026068,-0.295724,-0.104821,-0.564814
V,-0.175219,1.408621,-0.097051,-0.661102
W,1.031701,1.296165,-1.1285,-0.489374
Y,-0.756652,0.806423,0.579384,0.177297


In [28]:
# Turn the current index into an ordinary column.
# drop=False (the default) keeps the old index as a new column, which is named 'index'
df.reset_index(drop=False, inplace=True)

In [29]:
# Display df: the old labels now live in the 'index' column and the rows are numbered 0..3
df

Unnamed: 0,index,A,B,C,D
0,U,0.026068,-0.295724,-0.104821,-0.564814
1,V,-0.175219,1.408621,-0.097051,-0.661102
2,W,1.031701,1.296165,-1.1285,-0.489374
3,Y,-0.756652,0.806423,0.579384,0.177297


In [30]:
# Promote a column to be the index.
# drop=False leaves 'index' in place as a regular column in addition to using it as the index
df.set_index('index', drop=False, inplace=True)

In [31]:
# Display df: 'index' now appears both as the index and as a regular column
df

Unnamed: 0_level_0,index,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U,U,0.026068,-0.295724,-0.104821,-0.564814
V,V,-0.175219,1.408621,-0.097051,-0.661102
W,W,1.031701,1.296165,-1.1285,-0.489374
Y,Y,-0.756652,0.806423,0.579384,0.177297


In [32]:
# Remove the leftover 'index' column; the same labels remain available as the index
df.drop(columns='index', inplace=True)

In [33]:
# Display df: the labels U, V, W, Y are the index and only columns A-D remain
df

Unnamed: 0_level_0,A,B,C,D
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U,0.026068,-0.295724,-0.104821,-0.564814
V,-0.175219,1.408621,-0.097051,-0.661102
W,1.031701,1.296165,-1.1285,-0.489374
Y,-0.756652,0.806423,0.579384,0.177297


In [34]:
# Print a concise summary of the DataFrame: index, columns, non-null counts, dtypes and memory usage

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, U to Y
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       4 non-null      float64
 2   C       4 non-null      float64
 3   D       4 non-null      float64
dtypes: float64(4)
memory usage: 160.0+ bytes


In [35]:
# Descriptive statistics for each column: count, mean, std, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,4.0,4.0,4.0,4.0
mean,0.031475,0.803871,-0.187747,-0.384498
std,0.744838,0.778287,0.704417,0.381067
min,-0.756652,-0.295724,-1.1285,-0.661102
25%,-0.320577,0.530886,-0.360741,-0.588886
50%,-0.074575,1.051294,-0.100936,-0.527094
75%,0.277476,1.324279,0.072058,-0.322706
max,1.031701,1.408621,0.579384,0.177297


In [36]:
# Add one categorical column to demonstrate some more useful and commonly used methods.

# The first row is labelled 'first_category' and every remaining row 'second_category'.
# The second count is derived from len(df) so the list always matches the number of rows
# instead of assuming exactly 4 (assumes df has at least one row).
df['categorical'] = ['first_category'] * 1 + ['second_category'] * (len(df) - 1)

In [37]:
# Display df to check the newly added 'categorical' column
df

Unnamed: 0_level_0,A,B,C,D,categorical
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
U,0.026068,-0.295724,-0.104821,-0.564814,first_category
V,-0.175219,1.408621,-0.097051,-0.661102,second_category
W,1.031701,1.296165,-1.1285,-0.489374,second_category
Y,-0.756652,0.806423,0.579384,0.177297,second_category


In [38]:
# Count how many rows fall into each category
df['categorical'].value_counts()

second_category    3
first_category     1
Name: categorical, dtype: int64

In [39]:
# Dealing with missing data.
# There are 3 ways to handle missing values:
# 1) keep the missing data as it is, if the forecasting algorithm can handle it
# 2) drop the entire sample, including the timestamp
# 3) replace it with the best estimate (depends on the problem: the mean, the median,
#    or any other method you are comfortable with)

# Fortunately, most time series datasets do not contain missing values,
# because the data is timestamped as it is entered.

col_a = [1, 2, np.nan, 4]
col_b = [5, np.nan, 7, np.nan]
nan_df = pd.DataFrame({'A': col_a, 'B': col_b})
nan_df

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
2,,7.0
3,4.0,


In [40]:
# Drop every row that contains at least one NaN (axis=0 and how='any' are the defaults)
nan_df.dropna(axis=0, how='any')

Unnamed: 0,A,B
0,1.0,5.0


In [42]:
# Drop every column that contains a NaN; both columns of this frame do, so none are left :|
nan_df.dropna(axis='columns')

0
1
2
3


In [43]:
# thresh=k keeps a row only if it has at least k non-NaN values (rows with fewer are dropped)
# here k = 2 and the frame has 2 columns, so only rows without any NaN are kept
nan_df.dropna(thresh=2)

Unnamed: 0,A,B
0,1.0,5.0


In [41]:
# Fill each NaN with an estimate, here the mean of the column it belongs to
column_means = nan_df.mean()
nan_df.fillna(value=column_means)

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,6.0
2,2.333333,7.0
3,4.0,6.0
