In [215]:
import numpy as np
import pandas as pd

# Series

In [216]:
# Series - Series is similar to numpy arrays except that we can label the index in series.

In [217]:
mydata = [10,20,30,40,50]
mylabels = ['L1','L2','L3','L4','L5']

In [219]:
mylabels

['L1', 'L2', 'L3', 'L4', 'L5']

In [130]:
mydata

[10, 20, 30, 40, 50]

In [131]:
mylabels

['L1', 'L2', 'L3', 'L4', 'L5']

In [132]:
# Creating a series using list

In [133]:
myser = pd.Series(mydata)

In [134]:
myser

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [135]:
# Creating a series with labeled index

In [136]:
myser = pd.Series(mydata, index=mylabels )

In [137]:
myser

L1    10
L2    20
L3    30
L4    40
L5    50
dtype: int64

In [138]:
# Create a series with 
# data = 1000, 750, 400, 1200, 2500
# labels = mobilephone, headphone, earphone, hometheatre, laptop

In [139]:
# Creating a series using numpy array

In [140]:
myarr = np.array(mydata)

In [141]:
myarr

array([10, 20, 30, 40, 50])

In [142]:
myser = pd.Series(myarr)

In [143]:
myser

0    10
1    20
2    30
3    40
4    50
dtype: int32

In [144]:
myser = pd.Series(myarr, mylabels)

In [145]:
myser

L1    10
L2    20
L3    30
L4    40
L5    50
dtype: int32

In [146]:
myser

L1    10
L2    20
L3    30
L4    40
L5    50
dtype: int32

In [147]:
myser[['L1','L4']]

L1    10
L4    40
dtype: int32

In [148]:
myser > 30

L1    False
L2    False
L3    False
L4     True
L5     True
dtype: bool

In [149]:
myser[myser>30]

L4    40
L5    50
dtype: int32

In [150]:
type(myser)

pandas.core.series.Series

In [151]:
myser.dtype

dtype('int32')

In [152]:
# Creating Series using Dictionaries - System assign keys as labels automatically

In [153]:
price_dict = {
'laptop':1000,
'mobile':700,
'headphone':300
}

In [154]:
price_dict

{'laptop': 1000, 'mobile': 700, 'headphone': 300}

In [155]:
myser = pd.Series(price_dict)

In [156]:
myser

laptop       1000
mobile        700
headphone     300
dtype: int64

In [157]:
book_store1 = pd.Series([5,10,0],['GOT','Harry Potter','Da Vinci Code'])
book_store2 = pd.Series([15,7,4],['GOT','Kite Runner','Da Vinci Code'])

In [158]:
book_store1 + book_store2

Da Vinci Code     4.0
GOT              20.0
Harry Potter      NaN
Kite Runner       NaN
dtype: float64

# DataFrames

In [159]:
mydata = np.array([[10,20,30],[40,50,60],[70,80,90]])
myrows = ['row1','row2','row3']
mycols = ['col1','col2','col3']

In [160]:
mydata

array([[10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [161]:
df = pd.DataFrame(mydata,index=myrows,columns = mycols)

In [162]:
df

Unnamed: 0,col1,col2,col3
row1,10,20,30
row2,40,50,60
row3,70,80,90


In [163]:
# create a car dataset

In [164]:
pwd

'D:\\Users\\nitin'

In [165]:
import os

In [166]:
os.chdir('D:\\Users\\nitin\\datasets')

In [167]:
pwd

'D:\\Users\\nitin\\datasets'

In [168]:
os.listdir()

['cars.csv', 'Tele-Cust-Churn.csv']

In [169]:
os.getcwd()

'D:\\Users\\nitin\\datasets'

In [170]:
cardf = pd.read_csv('D:\\Users\\nitin\\datasets\\cars.csv',index_col=0)

In [171]:
os.chdir('D:\\Users\\nitin')

In [172]:
pwd

'D:\\Users\\nitin'

In [173]:
df

Unnamed: 0,col1,col2,col3
row1,10,20,30
row2,40,50,60
row3,70,80,90


In [174]:
df['col1']

row1    10
row2    40
row3    70
Name: col1, dtype: int32

In [175]:
type(df['col1'])

pandas.core.series.Series

In [176]:
type(df)

pandas.core.frame.DataFrame

In [177]:
df[['col1','col3']]

Unnamed: 0,col1,col3
row1,10,30
row2,40,60
row3,70,90


In [178]:
# creating a new column

In [179]:
df['new_col'] = [10,20,30]

In [180]:
df

Unnamed: 0,col1,col2,col3,new_col
row1,10,20,30,10
row2,40,50,60,20
row3,70,80,90,30


In [181]:
# in df.shape index 0 is 3 that is rows & index 1 is 4 that is columns 
df.shape

(3, 4)

In [182]:
# 0 = index, 1 = columns default is 0
df.drop('new_col',axis=1)

Unnamed: 0,col1,col2,col3
row1,10,20,30
row2,40,50,60
row3,70,80,90


In [183]:
df

Unnamed: 0,col1,col2,col3,new_col
row1,10,20,30,10
row2,40,50,60,20
row3,70,80,90,30


In [184]:

df.drop('new_col',axis=1, inplace=True)

In [185]:
df['newcol']=df['col1']*10 +df['col2']

In [186]:
df

Unnamed: 0,col1,col2,col3,newcol
row1,10,20,30,120
row2,40,50,60,450
row3,70,80,90,780


In [187]:
df.drop('newcol', axis = 1, inplace = True)

In [188]:
df

Unnamed: 0,col1,col2,col3
row1,10,20,30
row2,40,50,60
row3,70,80,90


In [189]:
# Selecting rows
# 1 by location loc
# 2 by index iloc

In [190]:
df.loc['row1']

col1    10
col2    20
col3    30
Name: row1, dtype: int32

In [191]:
df.loc[['row1','row3']]

Unnamed: 0,col1,col2,col3
row1,10,20,30
row3,70,80,90


In [192]:
df.iloc[['0','2']]

Unnamed: 0,col1,col2,col3
row1,10,20,30
row3,70,80,90


In [193]:
cardf

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [194]:
cardf.dtypes

mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [195]:
type(cardf)

pandas.core.frame.DataFrame

In [196]:
df

Unnamed: 0,col1,col2,col3
row1,10,20,30
row2,40,50,60
row3,70,80,90


In [197]:
# selecting 50
df.loc['row2']['col2']

50

In [198]:
df[['col2','col3']]

Unnamed: 0,col2,col3
row1,20,30
row2,50,60
row3,80,90


In [199]:
df.loc[['row2','row3']]

Unnamed: 0,col1,col2,col3
row2,40,50,60
row3,70,80,90


In [200]:
df.loc[['row2','row3'],['col2','col3']]

Unnamed: 0,col2,col3
row2,50,60
row3,80,90


In [201]:
# Conditional Selection

In [202]:
cardf

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [203]:
cardf.iloc[0:5][['mpg','cyl']]

Unnamed: 0,mpg,cyl
Mazda RX4,21.0,6
Mazda RX4 Wag,21.0,6
Datsun 710,22.8,4
Hornet 4 Drive,21.4,6
Hornet Sportabout,18.7,8


In [204]:
cardf.iloc[0:5][['mpg','hp']]

Unnamed: 0,mpg,hp
Mazda RX4,21.0,110
Mazda RX4 Wag,21.0,110
Datsun 710,22.8,93
Hornet 4 Drive,21.4,110
Hornet Sportabout,18.7,175


In [205]:
cardf.iloc[0:5][['mpg','hp']]

Unnamed: 0,mpg,hp
Mazda RX4,21.0,110
Mazda RX4 Wag,21.0,110
Datsun 710,22.8,93
Hornet 4 Drive,21.4,110
Hornet Sportabout,18.7,175


In [206]:
cardf.iloc[0:5]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [207]:
cardf[cardf.columns[1:5]]

Unnamed: 0,cyl,disp,hp,drat
Mazda RX4,6,160.0,110,3.9
Mazda RX4 Wag,6,160.0,110,3.9
Datsun 710,4,108.0,93,3.85
Hornet 4 Drive,6,258.0,110,3.08
Hornet Sportabout,8,360.0,175,3.15
Valiant,6,225.0,105,2.76
Duster 360,8,360.0,245,3.21
Merc 240D,4,146.7,62,3.69
Merc 230,4,140.8,95,3.92
Merc 280,6,167.6,123,3.92


In [208]:
cardf.iloc[[1,2,4,6,-1]][cardf.columns[0:5]]

Unnamed: 0,mpg,cyl,disp,hp,drat
Mazda RX4 Wag,21.0,6,160.0,110,3.9
Datsun 710,22.8,4,108.0,93,3.85
Hornet Sportabout,18.7,8,360.0,175,3.15
Duster 360,14.3,8,360.0,245,3.21
Volvo 142E,21.4,4,121.0,109,4.11


In [209]:
cardf.iloc[0:10]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [210]:
car = cardf.iloc[0:10]

In [211]:
# Look for all values greater than zero
car>0

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,True,True,True,True,True,True,True,False,True,True,True
Mazda RX4 Wag,True,True,True,True,True,True,True,False,True,True,True
Datsun 710,True,True,True,True,True,True,True,True,True,True,True
Hornet 4 Drive,True,True,True,True,True,True,True,True,False,True,True
Hornet Sportabout,True,True,True,True,True,True,True,False,False,True,True
Valiant,True,True,True,True,True,True,True,True,False,True,True
Duster 360,True,True,True,True,True,True,True,False,False,True,True
Merc 240D,True,True,True,True,True,True,True,True,False,True,True
Merc 230,True,True,True,True,True,True,True,True,False,True,True
Merc 280,True,True,True,True,True,True,True,True,False,True,True


In [212]:
# Will give NaN for which values are less than or equal to zero
car[car>0]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,,1.0,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,,1.0,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1.0,1.0,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1.0,,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,,,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1.0,,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,,,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1.0,,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1.0,,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1.0,,4,4


In [213]:
# selecting only cars that have milage greater than 20 
# car['mpg']
# car['mpg']>20
car[car['mpg']>20]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2


In [214]:
# Select only those cars that have am equal to zero

In [215]:
car[car['am']==0]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [216]:
df = car[car['am']==0]

In [217]:
type(df)

pandas.core.frame.DataFrame

In [218]:
# as df is again a dataframe we can apply same methods on it

In [219]:
df[df['mpg']>20]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2


In [220]:
# doing same thing in one line
# very complicated
car[car['am']==0][car[car['am']==0]['mpg']>20]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2


In [221]:
# Multiple conditions
(car['mpg']>20) & (car['am']==1)

Mazda RX4             True
Mazda RX4 Wag         True
Datsun 710            True
Hornet 4 Drive       False
Hornet Sportabout    False
Valiant              False
Duster 360           False
Merc 240D            False
Merc 230             False
Merc 280             False
dtype: bool

In [222]:
car[(car['mpg']>20) & (car['am']==1)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1


In [223]:
# for or use pipe operator
car[(car['mpg']>20) | (car['am']==1)]

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2


In [224]:
# Index Reset - Will reset the index from 0 onwards and present index will be  a column

In [225]:
car.reset_index()

Unnamed: 0,index,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
5,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
6,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
7,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
8,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
9,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [226]:
car.reset_index()[['index','mpg']]

Unnamed: 0,index,mpg
0,Mazda RX4,21.0
1,Mazda RX4 Wag,21.0
2,Datsun 710,22.8
3,Hornet 4 Drive,21.4
4,Hornet Sportabout,18.7
5,Valiant,18.1
6,Duster 360,14.3
7,Merc 240D,24.4
8,Merc 230,22.8
9,Merc 280,19.2


In [227]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [228]:
caralpha = ['a','b','c','d','e','f','g']

In [229]:
#'a b c d e f g'.split()

In [230]:
caralpha

['a', 'b', 'c', 'd', 'e', 'f', 'g']

In [231]:
# adding new columns

In [232]:
pd.options.mode.chained_assignment = None  # default='warn'

In [233]:
df['altname'] = caralpha

In [234]:
newcol = ['a','b','c','d','e','f','g']

In [235]:
df.loc[:,'newname'] = newcol

In [236]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,altname,newname
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,a,a
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,b,b
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1,c,c
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4,d,d
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2,e,e
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2,f,f
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4,g,g


In [237]:
df.drop('altname',axis = 1, inplace = True)

In [238]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,newname
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,a
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,b
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1,c
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4,d
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2,e
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2,f
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4,g


In [239]:
df.set_index('newname')

Unnamed: 0_level_0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
newname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
a,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
b,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
c,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
d,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
e,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
f,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
g,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [240]:
df

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb,newname
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1,a
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2,b
Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1,c
Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4,d
Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2,e
Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2,f
Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4,g


# Multi Index & Index Hierarchy 

In [241]:

myindex = [('ClassA', 2000), ('ClassA', 2020),
         ('ClassB', 2000), ('ClassB', 2020),
         ('ClassC', 2000), ('ClassC', 2020)]

In [242]:
myindex

[('ClassA', 2000),
 ('ClassA', 2020),
 ('ClassB', 2000),
 ('ClassB', 2020),
 ('ClassC', 2000),
 ('ClassC', 2020)]

In [243]:
students = [10, 30,15, 5,20, 50]

In [244]:
# Creating the MultiIndex 
no_class = pd.Series(students, index=myindex)

In [245]:
print(no_class)

(ClassA, 2000)    10
(ClassA, 2020)    30
(ClassB, 2000)    15
(ClassB, 2020)     5
(ClassC, 2000)    20
(ClassC, 2020)    50
dtype: int64


In [246]:
# better way of creating multiindiex 

In [247]:
index = pd.MultiIndex.from_tuples(myindex)
index

MultiIndex([('ClassA', 2000),
            ('ClassA', 2020),
            ('ClassB', 2000),
            ('ClassB', 2020),
            ('ClassC', 2000),
            ('ClassC', 2020)],
           )

In [248]:
# Creating the MultiIndex 
newclass = pd.Series(students, index=index)

In [249]:
# 1st two shows the index and last one the data
newclass

ClassA  2000    10
        2020    30
ClassB  2000    15
        2020     5
ClassC  2000    20
        2020    50
dtype: int64

In [250]:
newclass['ClassB']

2000    15
2020     5
dtype: int64

In [251]:
no_class2

NameError: name 'no_class2' is not defined

In [None]:
new_class = no_class.reindex(index)

In [None]:
new_class

In [None]:
outter_index = ['G1','G1','G1','G2','G2','G2']
inner_index = [1,2,3,1,2,3]

In [None]:
hier_index = list(zip(outter_index,inner_index))

In [None]:
hier_index

In [None]:
hier_index_new = pd.MultiIndex.from_tuples(hier_index)

In [None]:
hier_index_new

In [None]:
from numpy.random import randn
df = pd.DataFrame(randn(6,2),hier_index_new,['A','B'])

In [None]:
df

In [None]:
myindex = [('ClassA', 2000), ('ClassA', 2020),
         ('ClassB', 2000), ('ClassB', 2020),
         ('ClassC', 2000), ('ClassC', 2020)]

In [None]:
students = [10, 30,15, 5,20, 50]

In [None]:
mult_index = pd.MultiIndex.from_tuples(myindex)

In [None]:
mult_index

In [None]:
students= [10, 100,15, 250,20, 150,30, 300,50, 150,20, 500]

In [None]:

students_arr = np.array(students).reshape(6,2)

In [None]:
students_arr

In [None]:
dfsch = pd.DataFrame(students_arr, mult_index, ['dps','mps'])

In [None]:
dfsch

In [None]:
# Selecting data from classA

In [None]:
dfsch.loc['ClassA']

In [None]:
dfsch.loc['ClassA'].loc[2020]

In [None]:
# Naming the indexs

In [None]:
dfsch.index.names=['Class','Year']

In [None]:
dfsch

In [None]:
# Selecting students

In [None]:
dfsch.loc['ClassB']

In [None]:
dfsch.loc['ClassB'].loc[2000]

In [None]:
dfsch.loc['ClassB'].loc[2000]['mps']

In [None]:
# Useful for stock index data
# CrossSection - 

In [None]:
dfsch

In [None]:
# selecting data for year 2000 for all classes and all schools

In [None]:
dfsch.xs(2000,level = 'Year')

# Pandas - Missing Values

In [3]:
my_class = {
    'Bob':[90,95,np.nan],
    'Robin':[80,33,20],
    'Bill':[70,np.nan,np.nan]
}

In [4]:
my_class

{'Bob': [90, 95, nan], 'Robin': [80, 33, 20], 'Bill': [70, nan, nan]}

In [5]:
myclass = pd.DataFrame(my_class)

In [6]:
myclass

Unnamed: 0,Bob,Robin,Bill
0,90.0,80,70.0
1,95.0,33,
2,,20,


In [8]:
# checking out nulls
myclass.isna()

Unnamed: 0,Bob,Robin,Bill
0,False,False,False
1,False,False,True
2,True,False,True


In [9]:
myclass.isna().sum()

Bob      1
Robin    0
Bill     2
dtype: int64

In [14]:
#myclass.count()

In [15]:
#len(myclass)

In [16]:
#len(myclass)-myclass.count()

In [17]:
myclass.isnull()

Unnamed: 0,Bob,Robin,Bill
0,False,False,False
1,False,False,True
2,True,False,True


In [18]:
myclass.isnull().sum()

Bob      1
Robin    0
Bill     2
dtype: int64

In [19]:
myclass.isnull().sum().sum()

3

In [24]:
myclass.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Bob      2 non-null float64
Robin    3 non-null int64
Bill     1 non-null float64
dtypes: float64(2), int64(1)
memory usage: 200.0 bytes


In [29]:
myclass['Bill'].value_counts(dropna=False)

NaN     2
70.0    1
Name: Bill, dtype: int64

In [31]:
myclass['Bill'].isnull().value_counts()

True     2
False    1
Name: Bill, dtype: int64

In [33]:
# default axis is 0 that is rows.
# will drop rows with null values
myclass.dropna()

Unnamed: 0,Bob,Robin,Bill
0,90.0,80,70.0


In [35]:
# Axis = 1 means columns
# Will drop columns with null values
myclass.dropna(axis=1)

Unnamed: 0,Robin
0,80
1,33
2,20


In [38]:
# Giving thresh - requires those many non na values
# than thresh
myclass.dropna(thresh=2)

Unnamed: 0,Bob,Robin,Bill
0,90.0,80,70.0
1,95.0,33,


In [39]:
myclass.dropna(axis=1,thresh=2)

Unnamed: 0,Bob,Robin
0,90.0,80
1,95.0,33
2,,20


In [40]:
myclass.dropna(axis=0,thresh=2)

Unnamed: 0,Bob,Robin,Bill
0,90.0,80,70.0
1,95.0,33,


In [42]:
# IF we want we can fillin missing values
myclass.fillna(value='No Marks')

Unnamed: 0,Bob,Robin,Bill
0,90,80,70
1,95,33,No Marks
2,No Marks,20,No Marks


In [None]:
# If we want to fillin with mean values
df['Bob'].fillna(value=df['Bob'].mean())

In [44]:
myclass['Bob'].mean()

92.5

In [45]:
# Skip na default is True
myclass['Bob'].mean(skipna=False)

nan

In [46]:
myclass['Bob'].sum(skipna=False)

nan

In [None]:
d = {
    'A':[1,2,np.nan],'B': [5,np.nan,np.nan],'C' : [1,2,3]
}

In [None]:
d

In [None]:
df = pd.DataFrame(d)

In [None]:
# droping rows that has null values

In [None]:
df

In [None]:
df.dropna()

In [None]:
# Default axis is 0 means across the rows.

In [None]:
df.dropna(axis = 1)

In [None]:
# threshhold - how many non na values to not to be droped

In [None]:
df.dropna(thresh = 2)

In [None]:
# filling null values
df.fillna(value = 'noval')

In [None]:
df

In [None]:
# filling values with the mean of the columns

In [None]:
df['A'].fillna(value = df['A'].mean())

In [None]:
df.drop('newname', axis = 1,inplace = True )

In [None]:
df

# Group By 
GroupBy - Function is used to split the data into groups based on some criteria & apply some aggregate funciton . Pandas objects can be split on any of their axes.

In [None]:
df = pd.read_csv('D:\\Users\\nitin\\datasets\Tele-Cust-Churn.csv',index_col=0)

In [None]:
df

In [None]:
df.head()

In [None]:
'''Groupby - We can group together rows based of a column & 
perform aggregate function'''

In [254]:
data = {
    'Company':['SamPhone','SamPhone','MIPhone','MIPhone','iPhone','iPhone'],
    'Person':['Jack','Tim','Jenifer','Amy','Sarah','Bob'],
    'Sales':[2000, 3000,2500,1004,9000,2500],
    'Qty':[10,16,12,30,10,12]
}

In [255]:
df = pd.DataFrame(data)

In [256]:
df

Unnamed: 0,Company,Person,Sales,Qty
0,SamPhone,Jack,2000,10
1,SamPhone,Tim,3000,16
2,MIPhone,Jenifer,2500,12
3,MIPhone,Amy,1004,30
4,iPhone,Sarah,9000,10
5,iPhone,Bob,2500,12


In [257]:
# I want to groupby on Company  - I want to do some aggregate fucntion on sales columns
df.groupby('Company')
# This will give us a groupby object in memory

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001E020BBF148>

In [258]:
grppby = df.groupby('Company')

In [259]:
# Now I can do aggregate function on this
grppby.mean()
# it will give me mean of sales column only as other column is a string

Unnamed: 0_level_0,Sales,Qty
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
MIPhone,1752,21
SamPhone,2500,13
iPhone,5750,11


In [260]:
# sum 
grppby.sum()

Unnamed: 0_level_0,Sales,Qty
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
MIPhone,3504,42
SamPhone,5000,26
iPhone,11500,22


In [261]:
grppby.std()

Unnamed: 0_level_0,Sales,Qty
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
MIPhone,1057.831745,12.727922
SamPhone,707.106781,4.242641
iPhone,4596.194078,1.414214


In [262]:
# Selecting any row
grppby.sum().loc['MIPhone']

Sales    3504
Qty        42
Name: MIPhone, dtype: int64

In [263]:
# Doing all the operations in one line
df.groupby('Company').sum()

Unnamed: 0_level_0,Sales,Qty
Company,Unnamed: 1_level_1,Unnamed: 2_level_1
MIPhone,3504,42
SamPhone,5000,26
iPhone,11500,22


In [264]:
df.groupby('Company').sum().loc['MIPhone']

Sales    3504
Qty        42
Name: MIPhone, dtype: int64

In [265]:
grppby.get_group('Bob')

KeyError: 'Bob'

In [266]:
df.groupby('Company').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Qty,Qty,Qty,Qty,Qty,Qty,Qty,Qty
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
MIPhone,2.0,1752.0,1057.831745,1004.0,1378.0,1752.0,2126.0,2500.0,2.0,21.0,12.727922,12.0,16.5,21.0,25.5,30.0
SamPhone,2.0,2500.0,707.106781,2000.0,2250.0,2500.0,2750.0,3000.0,2.0,13.0,4.242641,10.0,11.5,13.0,14.5,16.0
iPhone,2.0,5750.0,4596.194078,2500.0,4125.0,5750.0,7375.0,9000.0,2.0,11.0,1.414214,10.0,10.5,11.0,11.5,12.0


In [267]:
df.groupby('Company').describe().transpose()

Unnamed: 0,Company,MIPhone,SamPhone,iPhone
Sales,count,2.0,2.0,2.0
Sales,mean,1752.0,2500.0,5750.0
Sales,std,1057.831745,707.106781,4596.194078
Sales,min,1004.0,2000.0,2500.0
Sales,25%,1378.0,2250.0,4125.0
Sales,50%,1752.0,2500.0,5750.0
Sales,75%,2126.0,2750.0,7375.0
Sales,max,2500.0,3000.0,9000.0
Qty,count,2.0,2.0,2.0
Qty,mean,21.0,13.0,11.0


In [2]:
import pandas as pd

In [6]:
df = pd.read_csv('Tele-Cust-Churn.csv',index_col=0)

In [9]:
df["gender"].unique()

array(['Female', 'Male'], dtype=object)

In [10]:
df["gender"].nunique()

2

In [12]:
df.gender.value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [7]:
df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [270]:
df.tail()

Unnamed: 0_level_0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,No,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [271]:
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [272]:
df.shape

(7043, 20)

In [273]:
#Let me select few columns

In [280]:
df =  df[['gender', 'SeniorCitizen','tenure','InternetService','StreamingMovies','PaymentMethod','Contract','MonthlyCharges','Churn']]

In [291]:

df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7590-VHVEG,Female,0,1,DSL,No,Electronic check,Month-to-month,29.85,0
5575-GNVDE,Male,0,34,DSL,No,Mailed check,One year,56.95,0
3668-QPYBK,Male,0,2,DSL,No,Mailed check,Month-to-month,53.85,1
7795-CFOCW,Male,0,45,DSL,No,Bank transfer (automatic),One year,42.3,0
9237-HQITU,Female,0,2,Fiber optic,No,Electronic check,Month-to-month,70.7,1


In [288]:
#np.where(sample.housing.values == 'yes', 1, 0
df.Churn = np.where(df.Churn.values =='Yes',1,0)

In [292]:
df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7590-VHVEG,Female,0,1,DSL,No,Electronic check,Month-to-month,29.85,0
5575-GNVDE,Male,0,34,DSL,No,Mailed check,One year,56.95,0
3668-QPYBK,Male,0,2,DSL,No,Mailed check,Month-to-month,53.85,1
7795-CFOCW,Male,0,45,DSL,No,Bank transfer (automatic),One year,42.3,0
9237-HQITU,Female,0,2,Fiber optic,No,Electronic check,Month-to-month,70.7,1


In [293]:
# Finding out how much gender plays a role in churn
df.groupby('gender').mean()

Unnamed: 0_level_0,SeniorCitizen,tenure,MonthlyCharges,Churn
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0.162844,32.244553,65.204243,0.269209
Male,0.161463,32.495359,64.327482,0.261603


In [299]:
df[['gender','Churn']].groupby('gender').mean()

Unnamed: 0_level_0,Churn
gender,Unnamed: 1_level_1
Female,0.269209
Male,0.261603


In [296]:
df.groupby(['gender','Churn']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,SeniorCitizen,tenure,MonthlyCharges
gender,Churn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Female,0,0.128678,37.858768,61.664908
Female,1,0.255591,17.00426,74.812087
Male,0,0.128762,37.289524,60.876914
Male,1,0.253763,18.963441,74.066989


In [308]:
# Looking at the churn by internet service
df[['InternetService','Churn']].groupby('InternetService').mean()

Unnamed: 0_level_0,Churn
InternetService,Unnamed: 1_level_1
DSL,0.189591
Fiber optic,0.418928
No,0.07405


In [310]:
# Lets sort the values
df[['InternetService','Churn']].groupby('InternetService').mean().sort_values('Churn')

Unnamed: 0_level_0,Churn
InternetService,Unnamed: 1_level_1
No,0.07405
DSL,0.189591
Fiber optic,0.418928


In [311]:
df[['InternetService','Churn']].groupby('InternetService').mean().sort_values('Churn', ascending = False)

Unnamed: 0_level_0,Churn
InternetService,Unnamed: 1_level_1
Fiber optic,0.418928
DSL,0.189591
No,0.07405


In [313]:
# We can groupby based on multiple columns
df[['Contract','SeniorCitizen','Churn']].groupby(['Contract','SeniorCitizen']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Churn
Contract,SeniorCitizen,Unnamed: 2_level_1
Month-to-month,0,0.395698
Month-to-month,1,0.546468
One year,0,0.106781
One year,1,0.152632
Two year,0,0.027097
Two year,1,0.041379


In [315]:
df[['gender','Contract','SeniorCitizen','Churn']].groupby(['gender','Contract','SeniorCitizen']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Churn
gender,Contract,SeniorCitizen,Unnamed: 3_level_1
Female,Month-to-month,0,0.406946
Female,Month-to-month,1,0.553885
Female,One year,0,0.095624
Female,One year,1,0.158416
Female,Two year,0,0.024453
Female,Two year,1,0.044118
Male,Month-to-month,0,0.384565
Male,Month-to-month,1,0.539216
Male,One year,0,0.117117
Male,One year,1,0.146067


In [318]:
df.groupby('gender')['Churn'].mean()

gender
Female    0.269209
Male      0.261603
Name: Churn, dtype: float64

In [319]:
df.groupby('gender')['Churn'].count()

gender
Female    3488
Male      3555
Name: Churn, dtype: int64

In [320]:
df.groupby('gender').count()

Unnamed: 0_level_0,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,3488,3488,3488,3488,3488,3488,3488,3488
Male,3555,3555,3555,3555,3555,3555,3555,3555


In [321]:
df.groupby('gender').first()

Unnamed: 0_level_0,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Female,0,1,DSL,No,Electronic check,Month-to-month,29.85,0
Male,0,34,DSL,No,Mailed check,One year,56.95,0


In [329]:
df.groupby('gender').get_group('Male')

Unnamed: 0_level_0,gender,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
5575-GNVDE,Male,0,34,DSL,No,Mailed check,One year,56.95,0
3668-QPYBK,Male,0,2,DSL,No,Mailed check,Month-to-month,53.85,1
7795-CFOCW,Male,0,45,DSL,No,Bank transfer (automatic),One year,42.30,0
1452-KIOVK,Male,0,22,Fiber optic,No,Credit card (automatic),Month-to-month,89.10,0
6388-TABGU,Male,0,62,DSL,No,Bank transfer (automatic),One year,56.15,0
...,...,...,...,...,...,...,...,...,...
9767-FFLEM,Male,0,38,Fiber optic,No,Credit card (automatic),Month-to-month,69.50,0
8456-QDAVC,Male,0,19,Fiber optic,No,Bank transfer (automatic),Month-to-month,78.70,0
6840-RESVB,Male,0,24,DSL,Yes,Mailed check,One year,84.80,0
8361-LTMKD,Male,1,4,Fiber optic,No,Mailed check,Month-to-month,74.40,1


In [325]:
df.head()

Unnamed: 0_level_0,gender,SeniorCitizen,tenure,InternetService,StreamingMovies,PaymentMethod,Contract,MonthlyCharges,Churn
customerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7590-VHVEG,Female,0,1,DSL,No,Electronic check,Month-to-month,29.85,0
5575-GNVDE,Male,0,34,DSL,No,Mailed check,One year,56.95,0
3668-QPYBK,Male,0,2,DSL,No,Mailed check,Month-to-month,53.85,1
7795-CFOCW,Male,0,45,DSL,No,Bank transfer (automatic),One year,42.3,0
9237-HQITU,Female,0,2,Fiber optic,No,Electronic check,Month-to-month,70.7,1


In [331]:
#  If the class distribution is not balanced, only checking the mean may cause false assumptions. 
# You can apply multiple aggregate functions on the result of groupby. They are not limited to 
# the only count and mean, you can pass the name of the functions as an argument to agg() function.
# Means for each group

In [334]:
df[['InternetService','tenure']].groupby('InternetService').mean()

Unnamed: 0_level_0,tenure
InternetService,Unnamed: 1_level_1
DSL,32.821561
Fiber optic,32.917959
No,30.547182


In [17]:
# Want both mean & count 
df[['InternetService','tenure']].groupby('InternetService').agg(['count','mean'])

Unnamed: 0_level_0,tenure,tenure
Unnamed: 0_level_1,count,mean
InternetService,Unnamed: 1_level_2,Unnamed: 2_level_2
DSL,2421,32.821561
Fiber optic,3096,32.917959
No,1526,30.547182


In [15]:
# as_index - True  - keys in groupby become index
df[['InternetService','tenure']].groupby('InternetService', as_index = False).agg(['count','mean'])

Unnamed: 0_level_0,tenure,tenure
Unnamed: 0_level_1,count,mean
InternetService,Unnamed: 1_level_2,Unnamed: 2_level_2
DSL,2421,32.821561
Fiber optic,3096,32.917959
No,1526,30.547182


In [14]:
# as_index - True  - keys in groupby become index
df[['InternetService','tenure']].groupby('InternetService', as_index = True).agg(['count','mean'])

Unnamed: 0_level_0,tenure,tenure
Unnamed: 0_level_1,count,mean
InternetService,Unnamed: 1_level_2,Unnamed: 2_level_2
DSL,2421,32.821561
Fiber optic,3096,32.917959
No,1526,30.547182


In [341]:
df[['gender','SeniorCitizen','tenure']].groupby(['gender','SeniorCitizen'], as_index = False).agg(['count','mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,tenure,tenure
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean
gender,SeniorCitizen,Unnamed: 2_level_2,Unnamed: 3_level_2
Female,0,2920,32.171233
Female,1,568,32.621479
Male,0,2981,32.21268
Male,1,574,33.963415


In [342]:
df[['gender','SeniorCitizen','tenure']].groupby(['gender','SeniorCitizen'], as_index = False).mean()

Unnamed: 0,gender,SeniorCitizen,tenure
0,Female,0,32.171233
1,Female,1,32.621479
2,Male,0,32.21268
3,Male,1,33.963415


In [16]:
df[['gender','SeniorCitizen','tenure']].groupby(['gender','SeniorCitizen'], as_index=True).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,tenure
gender,SeniorCitizen,Unnamed: 2_level_1
Female,0,32.171233
Female,1,32.621479
Male,0,32.21268
Male,1,33.963415


In [347]:
# Finding out the monthly charges for each contract type
df[['Contract','MonthlyCharges']].groupby('Contract').mean()
# long term contract have less monthly charges

Unnamed: 0_level_0,MonthlyCharges
Contract,Unnamed: 1_level_1
Month-to-month,66.39849
One year,65.048608
Two year,60.770413


In [350]:
# Finding out the monthly charges for each contract type
df[['Contract','MonthlyCharges','gender']].groupby(['Contract','gender']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,MonthlyCharges
Contract,gender,Unnamed: 2_level_1
Month-to-month,Female,66.652623
Month-to-month,Male,66.147615
One year,Female,66.841643
One year,Male,63.343444
Two year,Female,60.513373
Two year,Male,61.025941


In [352]:
# Looking at the distribution of contract type for different internet services
df.groupby(['InternetService','Contract']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,gender,SeniorCitizen,tenure,StreamingMovies,PaymentMethod,MonthlyCharges,Churn
InternetService,Contract,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
DSL,Month-to-month,1223,1223,1223,1223,1223,1223,1223
DSL,One year,570,570,570,570,570,570,570
DSL,Two year,628,628,628,628,628,628,628
Fiber optic,Month-to-month,2128,2128,2128,2128,2128,2128,2128
Fiber optic,One year,539,539,539,539,539,539,539
Fiber optic,Two year,429,429,429,429,429,429,429
No,Month-to-month,524,524,524,524,524,524,524
No,One year,364,364,364,364,364,364,364
No,Two year,638,638,638,638,638,638,638


# Merging, Joining & Concatenating

In [47]:
cars = pd.read_csv('D:\\Users\\nitin\\datasets\cars-grp.csv')

In [55]:
cars['carmodel']

0              Ford Mondeo LX
1              Ford Galaxy LX
2           Ford Fiesta 1.25i
3         Ford Ka 1.3 HCS EFi
4              Opel Astra 1.6
5             Opel Vectra 2.0
6              Opel Omega MV6
7     Volkswagen Golf GLS 2.0
8       Volkswagen Passat GLX
9      Volkswagen Beetle 2.0L
10          Daewoo Leganza SX
11           Daewoo Cielo GLE
12          Daewoo Nubira CDS
13            Daewoo Matiz SE
14     Mercedes M-Class ML320
15             Volvo V70 2.3L
16          BMW 3-series 330i
17           Renault Clio RXE
18             Renault Megane
19         Renault Scenic Rx4
20              Fiat Punto 75
21           Fiat Bravo SX-GT
22      Fiat Multipla 1.6 ELX
23             Dacia Nova GTi
24    Dacia SupeRNova Confort
25      Mercedes C-Class C320
26    Mercedes CL-Class CL500
27      Mercedes E-Class E320
28      Mercedes S-Class S500
29             Volvo C70 2.3L
30          BMW 5-series 540i
31          BMW 7-series 740i
Name: carmodel, dtype: object

In [56]:
mystr = 'Nitin monga'

In [57]:
# mystr.find(' ')

5

In [66]:
# mystr[:mystr.find(' ')]

'Nitin'

In [82]:
mystr.split(' ')[0]

'Nitin'

In [89]:
cars['comp'] = cars['carmodel'].str.split(' ').str[0]

In [98]:
mycols = cars.columns

In [100]:
print(mycols)

Index(['carmodel', 'symbol', 'price', 'engine', 'power', 'fuelcons', 'speed',
       'comp'],
      dtype='object')


In [101]:
mycols = ['comp','carmodel', 'symbol', 'price', 'engine', 'power', 'fuelcons', 'speed']
       

In [103]:
cars = cars[mycols]

In [104]:
cars

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,Ford,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
1,Ford,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
2,Ford,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
3,Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
4,Opel,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0
5,Opel,Opel Vectra 2.0,OPV,19946.0,1998,136.0,13.9,212.0
6,Opel,Opel Omega MV6,OPG,,2962,211.0,16.8,238.0
7,Volkswagen,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
8,Volkswagen,Volkswagen Passat GLX,VWP,28750.0,2771,190.0,13.7,227.0
9,Volkswagen,Volkswagen Beetle 2.0L,VWB,20243.0,1984,,9.8,177.0


In [105]:
cars.isnull()

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False
2,False,False,False,True,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False
6,False,False,False,True,False,False,False,False
7,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False
9,False,False,False,False,False,True,False,False


In [107]:
cars.isnull().sum()

comp        0
carmodel    0
symbol      0
price       7
engine      0
power       3
fuelcons    2
speed       5
dtype: int64

In [111]:
cars.isnull().sum(axis =1)

0     1
1     0
2     1
3     0
4     0
5     0
6     1
7     0
8     0
9     1
10    1
11    0
12    0
13    0
14    0
15    1
16    1
17    0
18    1
19    0
20    0
21    1
22    0
23    0
24    2
25    2
26    0
27    2
28    1
29    0
30    1
31    0
dtype: int64

In [115]:
cars.set_index('carmodel').isnull().sum(axis = 1)

carmodel
Ford Mondeo LX             1
Ford Galaxy LX             0
Ford Fiesta 1.25i          1
Ford Ka 1.3 HCS EFi        0
Opel Astra 1.6             0
Opel Vectra 2.0            0
Opel Omega MV6             1
Volkswagen Golf GLS 2.0    0
Volkswagen Passat GLX      0
Volkswagen Beetle 2.0L     1
Daewoo Leganza SX          1
Daewoo Cielo GLE           0
Daewoo Nubira CDS          0
Daewoo Matiz SE            0
Mercedes M-Class ML320     0
Volvo V70 2.3L             1
BMW 3-series 330i          1
Renault Clio RXE           0
Renault Megane             1
Renault Scenic Rx4         0
Fiat Punto 75              0
Fiat Bravo SX-GT           1
Fiat Multipla 1.6 ELX      0
Dacia Nova GTi             0
Dacia SupeRNova Confort    2
Mercedes C-Class C320      2
Mercedes CL-Class CL500    0
Mercedes E-Class E320      2
Mercedes S-Class S500      1
Volvo C70 2.3L             0
BMW 5-series 540i          1
BMW 7-series 740i          0
dtype: int64

In [117]:
cars.isnull().sum().sum()

17

In [120]:
cars['comp'].unique()

array(['Ford', 'Opel', 'Volkswagen', 'Daewoo', 'Mercedes', 'Volvo', 'BMW',
       'Renault', 'Fiat', 'Dacia'], dtype=object)

In [121]:
# selecting cars models - 
# Ford, Opel, Volkswagen
# # Daewoo, Mdercedes, Volvo
# BMW, Renault, Fiat, Dacia

In [160]:
df = cars.set_index('comp')

In [161]:
df

Unnamed: 0_level_0,carmodel,symbol,price,engine,power,fuelcons,speed
comp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Ford,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
Ford,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
Ford,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
Opel,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0
Opel,Opel Vectra 2.0,OPV,19946.0,1998,136.0,13.9,212.0
Opel,Opel Omega MV6,OPG,,2962,211.0,16.8,238.0
Volkswagen,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
Volkswagen,Volkswagen Passat GLX,VWP,28750.0,2771,190.0,13.7,227.0
Volkswagen,Volkswagen Beetle 2.0L,VWB,20243.0,1984,,9.8,177.0


In [162]:
#df1 = df.loc[['Ford','Opel','Volkswagen']]

In [163]:
#df1

In [164]:
#df2 = df.loc[['Daewoo', 'Mercedes', 'Volvo']]

In [165]:
#df2

In [167]:
#df3 = df.loc[['BMW', 'Renault', 'Fiat', 'Dacia']]

In [168]:
#df3

In [171]:
# df = df.sort_values(by = ['comp'])
df.sort_index(inplace=True)


In [173]:
df.reset_index(inplace=True)

In [175]:
df

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,BMW,BMW 7-series 740i,BM7,62900.0,5379,326.0,18.1,206.0
1,BMW,BMW 3-series 330i,BM3,,2979,225.0,11.8,206.0
2,BMW,BMW 5-series 540i,BM5,58985.0,4398,282.0,,249.0
3,Dacia,Dacia SupeRNova Confort,DAS,,1390,74.0,,162.0
4,Dacia,Dacia Nova GTi,DAN,5000.0,1557,72.0,9.6,161.0
5,Daewoo,Daewoo Matiz SE,DWM,4792.0,796,51.0,7.4,144.0
6,Daewoo,Daewoo Nubira CDS,DWN,9222.0,1598,78.0,11.3,185.0
7,Daewoo,Daewoo Leganza SX,DWL,,1998,133.0,10.2,206.0
8,Daewoo,Daewoo Cielo GLE,DWI,6860.0,1498,80.0,10.5,170.0
9,Fiat,Fiat Multipla 1.6 ELX,FTM,18870.0,1581,103.0,9.4,170.0


In [179]:
df1  = df.iloc[0:9]

In [180]:
df1

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,BMW,BMW 7-series 740i,BM7,62900.0,5379,326.0,18.1,206.0
1,BMW,BMW 3-series 330i,BM3,,2979,225.0,11.8,206.0
2,BMW,BMW 5-series 540i,BM5,58985.0,4398,282.0,,249.0
3,Dacia,Dacia SupeRNova Confort,DAS,,1390,74.0,,162.0
4,Dacia,Dacia Nova GTi,DAN,5000.0,1557,72.0,9.6,161.0
5,Daewoo,Daewoo Matiz SE,DWM,4792.0,796,51.0,7.4,144.0
6,Daewoo,Daewoo Nubira CDS,DWN,9222.0,1598,78.0,11.3,185.0
7,Daewoo,Daewoo Leganza SX,DWL,,1998,133.0,10.2,206.0
8,Daewoo,Daewoo Cielo GLE,DWI,6860.0,1498,80.0,10.5,170.0


In [181]:
df2 = df.iloc[9:21]

In [182]:
df2

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
9,Fiat,Fiat Multipla 1.6 ELX,FTM,18870.0,1581,103.0,9.4,170.0
10,Fiat,Fiat Bravo SX-GT,FTB,15830.0,1581,,10.6,184.0
11,Fiat,Fiat Punto 75,FTP,14370.0,1242,73.0,7.8,170.0
12,Ford,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
13,Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
14,Ford,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
15,Ford,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
16,Mercedes,Mercedes M-Class ML320,MBM,33950.0,3199,233.0,18.2,182.0
17,Mercedes,Mercedes S-Class S500,MBS,85250.0,4966,,19.4,204.0
18,Mercedes,Mercedes E-Class E320,MBE,,3199,221.0,11.1,


In [183]:
df3 = df.loc[21:]

In [184]:
df3

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
21,Opel,Opel Omega MV6,OPG,,2962,211.0,16.8,238.0
22,Opel,Opel Vectra 2.0,OPV,19946.0,1998,136.0,13.9,212.0
23,Opel,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0
24,Renault,Renault Clio RXE,RCT,12542.0,1390,98.0,8.8,186.0
25,Renault,Renault Megane,RTM,17370.0,2965,250.0,9.2,
26,Renault,Renault Scenic Rx4,RTS,22000.0,1998,140.0,7.4,196.0
27,Volkswagen,Volkswagen Passat GLX,VWP,28750.0,2771,190.0,13.7,227.0
28,Volkswagen,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
29,Volkswagen,Volkswagen Beetle 2.0L,VWB,20243.0,1984,,9.8,177.0
30,Volvo,Volvo C70 2.3L,VLC,50700.0,2319,247.0,11.2,249.0


# Concatenation
Simply concats dataframes together - columns has to be same and in same order

In [185]:
newdf = pd.concat([df1,df2,df3])

In [186]:
newdf

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,BMW,BMW 7-series 740i,BM7,62900.0,5379,326.0,18.1,206.0
1,BMW,BMW 3-series 330i,BM3,,2979,225.0,11.8,206.0
2,BMW,BMW 5-series 540i,BM5,58985.0,4398,282.0,,249.0
3,Dacia,Dacia SupeRNova Confort,DAS,,1390,74.0,,162.0
4,Dacia,Dacia Nova GTi,DAN,5000.0,1557,72.0,9.6,161.0
5,Daewoo,Daewoo Matiz SE,DWM,4792.0,796,51.0,7.4,144.0
6,Daewoo,Daewoo Nubira CDS,DWN,9222.0,1598,78.0,11.3,185.0
7,Daewoo,Daewoo Leganza SX,DWL,,1998,133.0,10.2,206.0
8,Daewoo,Daewoo Cielo GLE,DWI,6860.0,1498,80.0,10.5,170.0
9,Fiat,Fiat Multipla 1.6 ELX,FTM,18870.0,1581,103.0,9.4,170.0


In [196]:
dfc1_left = df[['comp','carmodel','symbol','price']]
dfc2_right = df[['carmodel','engine','power','fuelcons','speed']]

In [197]:
dfc1_left.head()

Unnamed: 0,comp,carmodel,symbol,price
0,BMW,BMW 7-series 740i,BM7,62900.0
1,BMW,BMW 3-series 330i,BM3,
2,BMW,BMW 5-series 540i,BM5,58985.0
3,Dacia,Dacia SupeRNova Confort,DAS,
4,Dacia,Dacia Nova GTi,DAN,5000.0


In [198]:
dfc2_right.head()

Unnamed: 0,carmodel,engine,power,fuelcons,speed
0,BMW 7-series 740i,5379,326.0,18.1,206.0
1,BMW 3-series 330i,2979,225.0,11.8,206.0
2,BMW 5-series 540i,4398,282.0,,249.0
3,Dacia SupeRNova Confort,1390,74.0,,162.0
4,Dacia Nova GTi,1557,72.0,9.6,161.0


In [199]:
# concatenating columns 
pd.concat([dfc1_left,dfc2_right], axis=1)

Unnamed: 0,comp,carmodel,symbol,price,carmodel.1,engine,power,fuelcons,speed
0,BMW,BMW 7-series 740i,BM7,62900.0,BMW 7-series 740i,5379,326.0,18.1,206.0
1,BMW,BMW 3-series 330i,BM3,,BMW 3-series 330i,2979,225.0,11.8,206.0
2,BMW,BMW 5-series 540i,BM5,58985.0,BMW 5-series 540i,4398,282.0,,249.0
3,Dacia,Dacia SupeRNova Confort,DAS,,Dacia SupeRNova Confort,1390,74.0,,162.0
4,Dacia,Dacia Nova GTi,DAN,5000.0,Dacia Nova GTi,1557,72.0,9.6,161.0
5,Daewoo,Daewoo Matiz SE,DWM,4792.0,Daewoo Matiz SE,796,51.0,7.4,144.0
6,Daewoo,Daewoo Nubira CDS,DWN,9222.0,Daewoo Nubira CDS,1598,78.0,11.3,185.0
7,Daewoo,Daewoo Leganza SX,DWL,,Daewoo Leganza SX,1998,133.0,10.2,206.0
8,Daewoo,Daewoo Cielo GLE,DWI,6860.0,Daewoo Cielo GLE,1498,80.0,10.5,170.0
9,Fiat,Fiat Multipla 1.6 ELX,FTM,18870.0,Fiat Multipla 1.6 ELX,1581,103.0,9.4,170.0


In [200]:
pd.concat([dfc1_left,dfc2_right], axis=1).head()

Unnamed: 0,comp,carmodel,symbol,price,carmodel.1,engine,power,fuelcons,speed
0,BMW,BMW 7-series 740i,BM7,62900.0,BMW 7-series 740i,5379,326.0,18.1,206.0
1,BMW,BMW 3-series 330i,BM3,,BMW 3-series 330i,2979,225.0,11.8,206.0
2,BMW,BMW 5-series 540i,BM5,58985.0,BMW 5-series 540i,4398,282.0,,249.0
3,Dacia,Dacia SupeRNova Confort,DAS,,Dacia SupeRNova Confort,1390,74.0,,162.0
4,Dacia,Dacia Nova GTi,DAN,5000.0,Dacia Nova GTi,1557,72.0,9.6,161.0


# Inner Join
Inner join is the most common type of join you’ll be working with. It returns a dataframe with only 
those rows that have common characteristics.

<img src="jyimages/inner.png" width = 300 height = 300 align="left">


In [207]:
dummy_data1 = {
        'id': ['1', '2', '3', '4', '5'],
        'Feature1': ['A', 'C', 'E', 'G', 'I'],
        'Feature2': ['B', 'D', 'F', 'H', 'J']}

In [223]:
df1 = pd.DataFrame(dummy_data1, columns = ['id', 'Feature1', 'Feature2'])

df1

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J


In [224]:
dummy_data2 = {
        'id': ['1', '2', '6', '7', '8'],
        'Feature1': ['K', 'M', 'O', 'Q', 'S'],
        'Feature2': ['L', 'N', 'P', 'R', 'T']}

In [225]:
df2 = pd.DataFrame(dummy_data2, columns = ['id', 'Feature1', 'Feature2'])

df2

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [226]:
df1

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J


In [227]:
df2

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [228]:
dummy_data3 = {
        'id': ['1', '2', '3', '4', '5', '7', '8', '9', '10', '11'],
        'Feature3': [12, 13, 14, 15, 16, 17, 15, 12, 13, 23]}

In [229]:
df3 = pd.DataFrame(dummy_data3, columns = ['id', 'Feature3'])

df3

Unnamed: 0,id,Feature3
0,1,12
1,2,13
2,3,14
3,4,15
4,5,16
5,7,17
6,8,15
7,9,12
8,10,13
9,11,23


In [231]:
df1

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J


In [232]:
df2

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [233]:
df3

Unnamed: 0,id,Feature3
0,1,12
1,2,13
2,3,14
3,4,15
4,5,16
5,7,17
6,8,15
7,9,12
8,10,13
9,11,23


In [234]:
# Concatenating DataFrames df1 & df2

In [236]:
# By default axis is 0 that means concatenate dataframes along the rows
pd.concat([df1,df2])

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [238]:
# However, the row labels seem to be wrong! If you want the row labels to adjust 
# automatically according to the join, you will have to set the argument ignore_index as True 
# look at the index
pd.concat([df1,df2], ignore_index=True)

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J
5,1,K,L
6,2,M,N
7,6,O,P
8,7,Q,R
9,8,S,T


In [240]:
# We can also label the DataFrames, after the concatenation, 
# with a key so that you may know which data came from which DataFrame
myconc = pd.concat([df1,df2],keys=['k1','k2'])


In [241]:
myconc

Unnamed: 0,Unnamed: 1,id,Feature1,Feature2
k1,0,1,A,B
k1,1,2,C,D
k1,2,3,E,F
k1,3,4,G,H
k1,4,5,I,J
k2,0,1,K,L
k2,1,2,M,N
k2,2,6,O,P
k2,3,7,Q,R
k2,4,8,S,T


In [246]:
# selecting data using keys
myconc.loc['k1'].iloc[2]

id          3
Feature1    E
Feature2    F
Name: 2, dtype: object

In [247]:
# selecting data using keys
myconc.loc['k1']['id']

0    1
1    2
2    3
3    4
4    5
Name: id, dtype: object

In [248]:
# concatenating using axis = 1
pd.concat([df1,df2], axis = 1)

Unnamed: 0,id,Feature1,Feature2,id.1,Feature1.1,Feature2.1
0,1,A,B,1,K,L
1,2,C,D,2,M,N
2,3,E,F,6,O,P
3,4,G,H,7,Q,R
4,5,I,J,8,S,T


# Merging the data frames

In [249]:
df_row = pd.concat([df1,df2])

In [250]:
df_row

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [251]:
df3

Unnamed: 0,id,Feature3
0,1,12
1,2,13
2,3,14
3,4,15
4,5,16
5,7,17
6,8,15
7,9,12
8,10,13
9,11,23


In [252]:
# Merging on the common column

In [253]:
df_merg_col= pd.merge(df_row, df3, on = 'id')

In [254]:
df_merg_col

Unnamed: 0,id,Feature1,Feature2,Feature3
0,1,A,B,12
1,1,K,L,12
2,2,C,D,13
3,2,M,N,13
4,3,E,F,14
5,4,G,H,15
6,5,I,J,16
7,7,Q,R,17
8,8,S,T,15


In [255]:
# If the columns have different names on which we are merging then we use left_on & right_on arguments
pd.merge(df_row, df3, left_on='id', right_on='id')

Unnamed: 0,id,Feature1,Feature2,Feature3
0,1,A,B,12
1,1,K,L,12
2,2,C,D,13
3,2,M,N,13
4,3,E,F,14
5,4,G,H,15
6,5,I,J,16
7,7,Q,R,17
8,8,S,T,15


In [256]:
# We can add or append rows to a DataFrame by passing a Series or dict

In [257]:
df_merg_col

Unnamed: 0,id,Feature1,Feature2,Feature3
0,1,A,B,12
1,1,K,L,12
2,2,C,D,13
3,2,M,N,13
4,3,E,F,14
5,4,G,H,15
6,5,I,J,16
7,7,Q,R,17
8,8,S,T,15


In [258]:
add_row = pd.Series(['10', 'X1', 'X2', 'X3'],
                    index=['id','Feature1', 'Feature2', 'Feature3'])


In [260]:
add_row

id          10
Feature1    X1
Feature2    X2
Feature3    X3
dtype: object

In [262]:

df_add_row = df_merg_col.append(add_row, ignore_index=True)

In [263]:
df_add_row

Unnamed: 0,id,Feature1,Feature2,Feature3
0,1,A,B,12
1,1,K,L,12
2,2,C,D,13
3,2,M,N,13
4,3,E,F,14
5,4,G,H,15
6,5,I,J,16
7,7,Q,R,17
8,8,S,T,15
9,10,X1,X2,X3


In [264]:
# Joining DataFrames

In [265]:
# Full Outer Join
# joined DataFrame will contain all records from both the DataFrames and fill in NaNs for missing matches on either side

<img src="jyimages/joins.png" width = 300 height = 300 align="left">

In [267]:
df_outer = pd.merge(df1, df2, on='id', how='outer')

In [269]:
df_outer
# can notice that the resulting DataFrame had all the entries from both the tables with NaN
# values for missing matches on either side. However, one more thing to notice is the suffix 
# which got appended to the column names to show which column came from which DataFrame. 
# The default suffixes are x and y, however, you can modify them by specifying the suffixes

Unnamed: 0,id,Feature1_x,Feature2_x,Feature1_y,Feature2_y
0,1,A,B,K,L
1,2,C,D,M,N
2,3,E,F,,
3,4,G,H,,
4,5,I,J,,
5,6,,,O,P
6,7,,,Q,R
7,8,,,S,T


In [271]:
df_outer = pd.merge(df1, df2, left_on='id',right_on='id',how='outer',suffixes=('_left','_right'))


In [272]:
df_outer

Unnamed: 0,id,Feature1_left,Feature2_left,Feature1_right,Feature2_right
0,1,A,B,K,L
1,2,C,D,M,N
2,3,E,F,,
3,4,G,H,,
4,5,I,J,,
5,6,,,O,P
6,7,,,Q,R
7,8,,,S,T


In [273]:
# The INNER JOIN produces only the set of records that match in both DataFrame A and DataFrame B. 

In [274]:
df_inner = pd.merge(df1, df2, on='id', how='inner')

In [275]:
df_inner

Unnamed: 0,id,Feature1_x,Feature2_x,Feature1_y,Feature2_y
0,1,A,B,K,L
1,2,C,D,M,N


In [281]:
# Right Join - 
# The RIGHT JOIN produces a complete set of records from DataFrame B (right DataFrame),
# with the matching records (where available) in DataFrame A (left DataFrame).
# If there is no match, the right side will contain null. 
# All elements of right dataframe

In [277]:
df_right = pd.merge(df1, df2, on='id', how='right')

In [278]:
df1

Unnamed: 0,id,Feature1,Feature2
0,1,A,B
1,2,C,D
2,3,E,F
3,4,G,H
4,5,I,J


In [279]:
df2

Unnamed: 0,id,Feature1,Feature2
0,1,K,L
1,2,M,N
2,6,O,P
3,7,Q,R
4,8,S,T


In [280]:
df_right

Unnamed: 0,id,Feature1_x,Feature2_x,Feature1_y,Feature2_y
0,1,A,B,K,L
1,2,C,D,M,N
2,6,,,O,P
3,7,,,Q,R
4,8,,,S,T


In [282]:
# Left Join
# complete set of records from DataFrame A (left DataFrame), with the matching records (where available) 
# in DataFrame B (right DataFrame). If there is no match, the left side will contain null. 

In [283]:
df_left = pd.merge(df1,df2, on ='id', how = 'left')

In [284]:
df_left

Unnamed: 0,id,Feature1_x,Feature2_x,Feature1_y,Feature2_y
0,1,A,B,K,L
1,2,C,D,M,N
2,3,E,F,,
3,4,G,H,,
4,5,I,J,,


In [285]:
# joining on indexes# 
# to perform the join on the indexes or the row labels. To do so, you have to specify right_index
# (for the indexes of the right DataFrame) and left_index (for the indexes of the left DataFrame) as True

In [287]:
df_index = pd.merge(df1, df2, right_index=True, left_index=True)

In [288]:
df_index

Unnamed: 0,id_x,Feature1_x,Feature2_x,id_y,Feature1_y,Feature2_y
0,1,A,B,1,K,L
1,2,C,D,2,M,N
2,3,E,F,6,O,P
3,4,G,H,7,Q,R
4,5,I,J,8,S,T


In [291]:
# Operations on Data Frames

In [292]:
cars

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,Ford,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
1,Ford,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
2,Ford,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
3,Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
4,Opel,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0
5,Opel,Opel Vectra 2.0,OPV,19946.0,1998,136.0,13.9,212.0
6,Opel,Opel Omega MV6,OPG,,2962,211.0,16.8,238.0
7,Volkswagen,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
8,Volkswagen,Volkswagen Passat GLX,VWP,28750.0,2771,190.0,13.7,227.0
9,Volkswagen,Volkswagen Beetle 2.0L,VWB,20243.0,1984,,9.8,177.0


In [293]:
# Finding unique values 

In [295]:
cars['comp'].unique()

array(['Ford', 'Opel', 'Volkswagen', 'Daewoo', 'Mercedes', 'Volvo', 'BMW',
       'Renault', 'Fiat', 'Dacia'], dtype=object)

In [296]:
# Finding how many unique values
cars['comp'].nunique()

10

In [297]:
# Finding how many times value are repeated
cars['comp'].value_counts()

Mercedes      5
Daewoo        4
Ford          4
Opel          3
Renault       3
Fiat          3
BMW           3
Volkswagen    3
Volvo         2
Dacia         2
Name: comp, dtype: int64

In [298]:
# Apply Function
mystr = 'Nitin Monga'

In [299]:
mystr.split(' ')[0]

'Nitin'

In [301]:
def mysplit(x):
    myval = x.split(' ')[0]
    return(myval)

In [306]:
cars['carmodel'].apply(mysplit)

0           Ford
1           Ford
2           Ford
3           Ford
4           Opel
5           Opel
6           Opel
7     Volkswagen
8     Volkswagen
9     Volkswagen
10        Daewoo
11        Daewoo
12        Daewoo
13        Daewoo
14      Mercedes
15         Volvo
16           BMW
17       Renault
18       Renault
19       Renault
20          Fiat
21          Fiat
22          Fiat
23         Dacia
24         Dacia
25      Mercedes
26      Mercedes
27      Mercedes
28      Mercedes
29         Volvo
30           BMW
31           BMW
Name: carmodel, dtype: object

In [311]:
cars.drop('new_mod', axis = 1, inplace=True)

In [312]:
# Finding out hte length 
cars['comp'].apply(len)

0      4
1      4
2      4
3      4
4      4
5      4
6      4
7     10
8     10
9     10
10     6
11     6
12     6
13     6
14     8
15     5
16     3
17     7
18     7
19     7
20     4
21     4
22     4
23     5
24     5
25     8
26     8
27     8
28     8
29     5
30     3
31     3
Name: comp, dtype: int64

In [315]:
# Very common using lambda expression

In [320]:
cars['comp'].apply(lambda x :x.upper())

0           FORD
1           FORD
2           FORD
3           FORD
4           OPEL
5           OPEL
6           OPEL
7     VOLKSWAGEN
8     VOLKSWAGEN
9     VOLKSWAGEN
10        DAEWOO
11        DAEWOO
12        DAEWOO
13        DAEWOO
14      MERCEDES
15         VOLVO
16           BMW
17       RENAULT
18       RENAULT
19       RENAULT
20          FIAT
21          FIAT
22          FIAT
23         DACIA
24         DACIA
25      MERCEDES
26      MERCEDES
27      MERCEDES
28      MERCEDES
29         VOLVO
30           BMW
31           BMW
Name: comp, dtype: object

In [322]:
# removing columns
cars.drop('comp', axis = 1)

Unnamed: 0,carmodel,symbol,price,engine,power,fuelcons,speed
0,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
1,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
2,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
3,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
4,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0
5,Opel Vectra 2.0,OPV,19946.0,1998,136.0,13.9,212.0
6,Opel Omega MV6,OPG,,2962,211.0,16.8,238.0
7,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
8,Volkswagen Passat GLX,VWP,28750.0,2771,190.0,13.7,227.0
9,Volkswagen Beetle 2.0L,VWB,20243.0,1984,,9.8,177.0


In [323]:
mynewcar = cars.set_index('carmodel')

In [325]:
# Will give the index
mynewcar.index

Index(['Ford Mondeo LX', 'Ford Galaxy LX', 'Ford Fiesta 1.25i',
       'Ford Ka 1.3 HCS EFi', 'Opel Astra 1.6', 'Opel Vectra 2.0',
       'Opel Omega MV6', 'Volkswagen Golf GLS 2.0', 'Volkswagen Passat GLX',
       'Volkswagen Beetle 2.0L', 'Daewoo Leganza SX', 'Daewoo Cielo GLE',
       'Daewoo Nubira CDS', 'Daewoo Matiz SE', 'Mercedes M-Class ML320',
       'Volvo V70 2.3L', 'BMW 3-series 330i', 'Renault Clio RXE',
       'Renault Megane', 'Renault Scenic Rx4', 'Fiat Punto 75',
       'Fiat Bravo SX-GT', 'Fiat Multipla 1.6 ELX', 'Dacia Nova GTi',
       'Dacia SupeRNova Confort', 'Mercedes C-Class C320',
       'Mercedes CL-Class CL500', 'Mercedes E-Class E320',
       'Mercedes S-Class S500', 'Volvo C70 2.3L', 'BMW 5-series 540i',
       'BMW 7-series 740i'],
      dtype='object', name='carmodel')

In [326]:
cars.head()

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
0,Ford,Ford Mondeo LX,FDM,22170.0,1989,144.0,11.3,
1,Ford,Ford Galaxy LX,FDG,27516.0,2792,201.0,14.8,219.0
2,Ford,Ford Fiesta 1.25i,FDF,,1242,74.0,8.7,167.0
3,Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
4,Opel,Opel Astra 1.6,OPA,18105.0,1598,63.0,11.5,178.0


In [327]:
# sorting on price
cars.sort_values('price')

Unnamed: 0,comp,carmodel,symbol,price,engine,power,fuelcons,speed
13,Daewoo,Daewoo Matiz SE,DWM,4792.0,796,51.0,7.4,144.0
23,Dacia,Dacia Nova GTi,DAN,5000.0,1557,72.0,9.6,161.0
11,Daewoo,Daewoo Cielo GLE,DWI,6860.0,1498,80.0,10.5,170.0
12,Daewoo,Daewoo Nubira CDS,DWN,9222.0,1598,78.0,11.3,185.0
17,Renault,Renault Clio RXE,RCT,12542.0,1390,98.0,8.8,186.0
3,Ford,Ford Ka 1.3 HCS EFi,FDK,13085.0,1299,59.0,8.3,155.0
20,Fiat,Fiat Punto 75,FTP,14370.0,1242,73.0,7.8,170.0
21,Fiat,Fiat Bravo SX-GT,FTB,15830.0,1581,,10.6,184.0
7,Volkswagen,Volkswagen Golf GLS 2.0,VWG,16350.0,1984,115.0,11.9,195.0
18,Renault,Renault Megane,RTM,17370.0,2965,250.0,9.2,


In [330]:
cars.isnull().sum().sum()

17