In [1]:
import numpy as np
np.random.seed(42)

# Pandas Basics

In [2]:
import pandas as pd

> Pandas allows for fast analysis, data cleaning and preparation.
> Pandas excels in performance and productivity
* conda install pandas
_OR_
* pip install pandas 

## Series

In [3]:
# A Series is a one-dimensional labelled array built on top of a NumPy array

labels = ['a', 'b', 'c']
my_data = [1, 2, 3]

pd.Series(my_data, index=labels)

a    1
b    2
c    3
dtype: int64

In [4]:
# We can also pass Dictionary as data to Series
# For a dictionary, Pandas automatically uses each key as the index label and
# each value as the corresponding Series value

dic = { "First Name": "Rakshith", "Last Name" : "Ramagiri"}

In [5]:
dic

{'First Name': 'Rakshith', 'Last Name': 'Ramagiri'}

In [6]:
pd.Series(data=dic) # NOTE: Pass only the dictionary.

First Name    Rakshith
Last Name     Ramagiri
dtype: object

In [7]:
# we can also pass built-in function as data references to pd.Series

pd.Series(data=[sum, max, min])

0    <built-in function sum>
1    <built-in function max>
2    <built-in function min>
dtype: object

In [8]:
# Vectorized operations between Series align the elements by their index labels before operating

In [9]:
ser1 = pd.Series(data=[5,10,16], index=['a','b','c'])

In [10]:
ser2 = pd.Series(data=[3,7,9], index=['b','c','d'])

In [11]:
ser1 + ser2 # will add all matching index_label values, 
            # if no matching index_label is found, then it returns 'NaN'

a     NaN
b    13.0
c    23.0
d     NaN
dtype: float64

In [12]:
# NOTE: the result is 'float64' because NaN is a floating-point value; the integers are upcast only because missing values (NaN) were introduced

## DataFrames

In [13]:
from numpy.random import randn

In [14]:
# Build a 5x4 DataFrame of standard-normal samples with labelled rows and columns
df = pd.DataFrame(
    data=randn(5, 4),
    index=['Row A', 'Row B', 'Row C', 'Row D', 'Row E'],
    columns=['Col 1', 'Col 2', 'Col 3', 'Col 4'],
)

In [15]:
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435
Row C,-0.469474,0.54256,-0.463418,-0.46573
Row D,0.241962,-1.91328,-1.724918,-0.562288
Row E,-1.012831,0.314247,-0.908024,-1.412304


In [16]:
# DataFrame is simply a bunch of Series that share the same indices

In [17]:
df['Col 1']

Row A    0.496714
Row B   -0.234153
Row C   -0.469474
Row D    0.241962
Row E   -1.012831
Name: Col 1, dtype: float64

In [18]:
type(df['Col 1']) # we can confirm the above statement using 'type()'

pandas.core.series.Series

In [19]:
type(df)

pandas.core.frame.DataFrame

In [20]:
# Selecting only a few columns
df[['Col 2', 'Col 4']]

Unnamed: 0,Col 2,Col 4
Row A,-0.138264,1.52303
Row B,-0.234137,0.767435
Row C,0.54256,-0.46573
Row D,-1.91328,-0.562288
Row E,0.314247,-1.412304


In [21]:
# Creating new columns inside an existing DataFrame.

df['new'] = df['Col 3'] + df['Col 4'] # similar to creating a new 'Key-Value' pair in dictionary

In [22]:
df

Unnamed: 0,Col 1,Col 2,Col 3,Col 4,new
Row A,0.496714,-0.138264,0.647689,1.52303,2.170718
Row B,-0.234153,-0.234137,1.579213,0.767435,2.346648
Row C,-0.469474,0.54256,-0.463418,-0.46573,-0.929147
Row D,0.241962,-1.91328,-1.724918,-0.562288,-2.287205
Row E,-1.012831,0.314247,-0.908024,-1.412304,-2.320328


In [23]:
# We can also DROP existing columns from a DataFrame.

# Uncomment the line below to see the error (a KeyError, because no ROW is labelled 'new').

#df.drop('new') # axis=0 by default, so 'drop' looks for a ROW label;
               # pass axis=1 (or columns='new') to delete a COLUMN instead

In [24]:
df.drop('new', axis=1) # this doesn't actually affect the DataFrame. Because 'drop' is not 'inplace' by default.

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435
Row C,-0.469474,0.54256,-0.463418,-0.46573
Row D,0.241962,-1.91328,-1.724918,-0.562288
Row E,-1.012831,0.314247,-0.908024,-1.412304


In [25]:
df.drop('new', axis=1, inplace=True) # this permanently modifies the DataFrame (re-running this cell raises a KeyError because 'new' is already gone).

In [26]:
df # changes permanent

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435
Row C,-0.469474,0.54256,-0.463418,-0.46573
Row D,0.241962,-1.91328,-1.724918,-0.562288
Row E,-1.012831,0.314247,-0.908024,-1.412304


In [27]:
df.shape # to show DataFrame dimensions

(5, 4)

In [28]:
# as a result 'AXIS = 0' refers to 'ROWS' and 'AXIS = 1' refers to 'COLUMNS'

### Selecting ROWS in DataFrame.

In [29]:
# Label-based ROW selection

df.loc['Row A'] # 'loc' is an indexer attribute rather than a method, which is why it takes square brackets '[]'

Col 1    0.496714
Col 2   -0.138264
Col 3    0.647689
Col 4    1.523030
Name: Row A, dtype: float64

In [30]:
# as we can observe, the returned item is also a pd.Series object

In [31]:
# Integer-position-based ROW selection (position 0 is the first row)

df.iloc[0]

Col 1    0.496714
Col 2   -0.138264
Col 3    0.647689
Col 4    1.523030
Name: Row A, dtype: float64

### Selecting Single Values in DataFrame

In [32]:
# Obtaining a single value.

df.loc['Row A', 'Col 1']

0.4967141530112327

In [33]:
# Obtaining sub-DataFrame from a DataFrame.

df.loc[ ['Row A', 'Row D'] , ['Col 2', 'Col 3']] # loc[ [ROW_SELECTION] , [COLUMN_SELECTION] ]

Unnamed: 0,Col 2,Col 3
Row A,-0.138264,0.647689
Row D,-1.91328,-1.724918


## Conditional Selection in Pandas

In [34]:
df > 0  # We get a boolean value for each cell, filled after each respective conditional check

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,True,False,True,True
Row B,False,False,True,True
Row C,False,True,False,False
Row D,True,False,False,False
Row E,False,True,False,False


In [35]:
df[df > 0] # passing a boolean DataFrame will result in 'NaN' values in place of 'False' values (during comparisons)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,,0.647689,1.52303
Row B,,,1.579213,0.767435
Row C,,0.54256,,
Row D,0.241962,,,
Row E,,0.314247,,


In [36]:
# To obtain a Series object after comparison

df['Col 4'] > 0

Row A     True
Row B     True
Row C    False
Row D    False
Row E    False
Name: Col 4, dtype: bool

In [39]:
# We generally apply a conditional selection on a particular Column
# to keep only the Rows of the DataFrame that satisfy the condition.

df[ df['Col 4'] > 0] # a boolean Series selects ROWS: the result is a new DataFrame with only the rows where the condition is True (all columns are kept)

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435


In [40]:
# In the above result, 'Row C', 'Row D' and 'Row E' have been eliminated since their 'Col 4' values were not greater than 0

In [41]:
# Filter rows and pick columns in a single '.loc' call instead of chaining two '[]' lookups:
# rows where 'Col 4' > 0, and only the columns 'Col 1' and 'Col 2'.
df.loc[df['Col 4'] > 0, ['Col 1', 'Col 2']]

Unnamed: 0,Col 1,Col 2
Row A,0.496714,-0.138264
Row B,-0.234153,-0.234137


## Multiple Conditional Selections on DataFrame

In [42]:
# The following operation raises a ValueError. It is wrapped in try/except so the
# message is still displayed without interrupting "Restart & Run All".

try:
    df[ (df['Col 4'] > 0) and (df['Col 3'] > 0) ]
except ValueError as err:
    # Show the exception type and message exactly as the traceback's last line would
    print(f"{type(err).__name__}: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [43]:
# This happens because Python's 'and' operator works on single Boolean values, not element-wise,
# i.e.,

True and False

False

In [44]:
# But in ** df[ (df['Col 4'] > 0) and (df['Col 3'] > 0) ] **
# each operand of 'and' is an entire boolean Series (one value per row), which has no single truth value, hence the ValueError

In [45]:
# Instead we can use '&' (ampersand) for an element-wise 'AND' to join multiple conditions,
# and '|' (pipe) for an element-wise 'OR'.
# Each condition must be wrapped in parentheses, because '&' and '|' bind more tightly than comparisons such as '>'.

df[ (df['Col 4'] > 0) & (df['Col 3'] > 0) ] # joining with 'AND' operator '&'

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435


In [46]:
df[ (df['Col 4'] > 0) | (df['Col 3'] > 0) ] # joining with 'OR' operator '|'

Unnamed: 0,Col 1,Col 2,Col 3,Col 4
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435


## Setting and Resetting Index

In [47]:
df.reset_index() # to reset ROW index

Unnamed: 0,index,Col 1,Col 2,Col 3,Col 4
0,Row A,0.496714,-0.138264,0.647689,1.52303
1,Row B,-0.234153,-0.234137,1.579213,0.767435
2,Row C,-0.469474,0.54256,-0.463418,-0.46573
3,Row D,0.241962,-1.91328,-1.724918,-0.562288
4,Row E,-1.012831,0.314247,-0.908024,-1.412304


In [48]:
# like most DataFrame operations, 'reset_index()' doesn't affect the original DataFrame.
# SET 'inplace' parameter to 'True' to affect the DataFrame permanently.

In [49]:
# As seen above, reset_index() adds a new 'index' column which stores the old index labels

In [50]:
df_newindex = df.reset_index()

In [51]:
df_newindex

Unnamed: 0,index,Col 1,Col 2,Col 3,Col 4
0,Row A,0.496714,-0.138264,0.647689,1.52303
1,Row B,-0.234153,-0.234137,1.579213,0.767435
2,Row C,-0.469474,0.54256,-0.463418,-0.46573
3,Row D,0.241962,-1.91328,-1.724918,-0.562288
4,Row E,-1.012831,0.314247,-0.908024,-1.412304


In [52]:
# Setting a new index

df_newindex.set_index('index')

Unnamed: 0_level_0,Col 1,Col 2,Col 3,Col 4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Row A,0.496714,-0.138264,0.647689,1.52303
Row B,-0.234153,-0.234137,1.579213,0.767435
Row C,-0.469474,0.54256,-0.463418,-0.46573
Row D,0.241962,-1.91328,-1.724918,-0.562288
Row E,-1.012831,0.314247,-0.908024,-1.412304


In [None]:
# set_index() replaces the existing index of the DataFrame; the old index is discarded unless it was first saved as a column (e.g. with reset_index())