# Lecture -1

In [1]:
import pandas as pd
import numpy as np

## Introduction to Pandas Data Structures

### Series

In [2]:
obj= pd.Series([4,7,-8,-3])
obj

0    4
1    7
2   -8
3   -3
dtype: int64

In [3]:
#We will now check the values and indexes
print(obj.values)
print(obj.index) #This will return the range(Start,Stop,Step)

[ 4  7 -8 -3]
RangeIndex(start=0, stop=4, step=1)


In [4]:
#Now we will create another series but this time with out own index values. 
obj2 = pd.Series([5,7,1,0],index=['a','b','c','d'])
obj2

a    5
b    7
c    1
d    0
dtype: int64

In [5]:
print(obj2.values)
print(obj2.index) #This time it wont repeat the range value.

[5 7 1 0]
Index(['a', 'b', 'c', 'd'], dtype='object')


In [6]:
type(obj2.index) 

pandas.core.indexes.base.Index

In [7]:
#Now we will see the elements of the series according to their index
obj2['a']


5

In [8]:
#If we want to see the element at more than one index we need to pass list of those indexes inside a list
#obj2['b','c'] - >if we perform then it will give the error
obj2[['b','d']]

b    7
d    0
dtype: int64

In [9]:
#Now we will try to change the value of the element present at index 'a'
obj2['a'] = 25
obj2

a    25
b     7
c     1
d     0
dtype: int64

In [10]:
#Now we want to check how many elements have value greater than 1
[obj2>1]


[a     True
 b     True
 c    False
 d    False
 dtype: bool]

In [11]:
#if we want to check only those values
obj2[obj2>1]

a    25
b     7
dtype: int64

In [12]:
#We can do some artihmetic operations 
obj2*2

a    50
b    14
c     2
d     0
dtype: int64

In [13]:
obj2-2

a    23
b     5
c    -1
d    -2
dtype: int64

In [14]:
obj2/2

a    12.5
b     3.5
c     0.5
d     0.0
dtype: float64

In [15]:
obj2+1

a    26
b     8
c     2
d     1
dtype: int64

In [16]:
#IF we want to check whether the index is present in the series on not
'b' in obj2

True

In [17]:
'e' in obj2

False

In [18]:
#We will seee another way of creating series
states_data = {'Ohio':35000,'Texas':71000,'Oregon':16000,'Utah':5000}
obj3 = pd.Series(states_data)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [19]:
#We will create another list of index values and pass the same dictionary and see the difference
states = ['California','Ohio','Oregon','Texas']
obj4 = pd.Series(states_data,index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [20]:
#We will check how many value are null
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [21]:
#We will check how many value are null
pd.isna(obj4)
#Or
#obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [22]:
#We will check how many value donot have null values
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [23]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [24]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [25]:
obj4.name = 'population'
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [27]:
#Setting the name of the index values
obj4.index.name='State'
obj4

State
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [28]:
obj

0    4
1    7
2   -8
3   -3
dtype: int64

In [29]:
#We will now change the name of the index
obj.index = ['Bob','Charle','David','Jeff']
obj

Bob       4
Charle    7
David    -8
Jeff     -3
dtype: int64

### DataFrame

In [30]:
#We will create the dataframe

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [35]:
#We will change the order of columns
pd.DataFrame(data,columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [34]:
#We will make another dataframe with few changes
frame2 = pd.DataFrame(data=data, columns=['year','state','pop','debt'],
                     index=['one','two','three','four','five','six'])
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [36]:
#If we want to access any particular columns
#There are two ways
#First is
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [37]:
#Second is 
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [38]:
#If we want to get access to any index value
frame2.loc['one']

year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object

In [39]:
#To check the names of all the columns
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [40]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [42]:
import numpy as np

In [48]:
frame2['debt']=np.arange(1.,12,2)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,1.0
two,2001,Ohio,1.7,3.0
three,2002,Ohio,3.6,5.0
four,2001,Nevada,2.4,7.0
five,2002,Nevada,2.9,9.0
six,2003,Nevada,3.2,11.0


In [50]:
val = pd.Series([-1.2,-1.5,-1.7],index=['two','four','six'])
frame2['debt']= val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,-1.7


The values for 2,4 and 6 index has changed and for rest it is NaN

In [51]:
#We will create a new column eastern and perform conditonal check whether that record belong to Ohio or not
frame2['eastern'] = frame2.state=='Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,-1.7,False


In [52]:
#Now we will delete eastern column
del frame2['eastern']
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,-1.7


In [53]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [54]:
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [55]:
#We will provide a new index value now "2003" 
pd.DataFrame(pop, index=[2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [57]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [59]:
frame3['Ohio'][:-1]

2001    1.7
2002    3.6
Name: Ohio, dtype: float64

In [60]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [64]:
#Provide the name to the columns and index 
frame3.index.name='year'; frame3.columns.name='state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [66]:
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [68]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, -1.7]], dtype=object)

### Index Object

In [71]:
obj = pd.Series(range(3), index=['a','b','c'])
obj

a    0
b    1
c    2
dtype: int64

In [72]:
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [73]:
#We will check the values from index 2 to last
index[1:]

Index(['b', 'c'], dtype='object')

In [75]:
labels= pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [76]:
obj2 = pd.Series([1.5,2,-8],index=labels)
obj2

0    1.5
1    2.0
2   -8.0
dtype: float64

In [77]:
obj2.index is labels

True

In [78]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [79]:
'Ohio' in frame3.columns

True

In [80]:
'state' in frame3.columns

False

In [81]:
2003 in frame3.index

False

In [82]:
dup_labels = pd.Index(['a','b','c'])
dup_labels

Index(['a', 'b', 'c'], dtype='object')

# Essential Functionality

### Reindexing 

In [83]:
obj = pd.Series([4.5,-8.0,9,1.5], index=['d','b','c','a'])
obj

d    4.5
b   -8.0
c    9.0
a    1.5
dtype: float64

In [84]:
obj2 = obj.reindex(['a','b','c','d','e'])
obj2

a    1.5
b   -8.0
c    9.0
d    4.5
e    NaN
dtype: float64

In [85]:
obj3 = pd.Series(['blue','purple','yellow'],index=[0,2,4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [89]:
#Method = ffill tells that fill all the values at every index
obj4 = obj3.reindex(np.arange(7),method='ffill')
obj4

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
dtype: object

In [90]:
frame = pd.DataFrame(np.arange(9).reshape(3,3),index=['a','c','d'],columns=['Ohio','Texas','California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [91]:
#We will reset the index
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [92]:
#We can change the columns also
states = ['Texas','Utah','California']
frame2.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


In [95]:
frame2.loc[['a','b','d'],states] #Instead of states we can write the name of columns also

Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
d,7.0,,8.0


### Dropping Entries from an Axis

In [96]:
obj = pd.Series(np.arange(5.),index=['a','b','c','d','e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [99]:
#We will drop value at index c and save it in new series
new_obj = obj.drop('c') #Since we dont have 2-D here we dont need to specify axis
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [100]:
frame = pd.DataFrame(np.arange(25.).reshape(5,5),index=['Ohio', 'Colorado', 'Utah', 'New York','California'],
                    columns=['one','two','three','four','five'])
frame

Unnamed: 0,one,two,three,four,five
Ohio,0.0,1.0,2.0,3.0,4.0
Colorado,5.0,6.0,7.0,8.0,9.0
Utah,10.0,11.0,12.0,13.0,14.0
New York,15.0,16.0,17.0,18.0,19.0
California,20.0,21.0,22.0,23.0,24.0


In [102]:
frame.drop(['Utah','New York']) #By default axis is 0

Unnamed: 0,one,two,three,four,five
Ohio,0.0,1.0,2.0,3.0,4.0
Colorado,5.0,6.0,7.0,8.0,9.0
California,20.0,21.0,22.0,23.0,24.0


In [103]:
#if we want to remove any column we need to change the axis
frame.drop(['one','four'],axis=1)

Unnamed: 0,two,three,five
Ohio,1.0,2.0,4.0
Colorado,6.0,7.0,9.0
Utah,11.0,12.0,14.0
New York,16.0,17.0,19.0
California,21.0,22.0,24.0


If we want to remove the column permanently we need to use inplace = True

In [105]:
frame.drop(['one','four'],axis=1,inplace=True)

In [106]:
frame

Unnamed: 0,two,three,five
Ohio,1.0,2.0,4.0
Colorado,6.0,7.0,9.0
Utah,11.0,12.0,14.0
New York,16.0,17.0,19.0
California,21.0,22.0,24.0


### Indexing, Selection and Filtering

In [2]:
import numpy as np
obj = pd.Series(np.arange(4.), index=['a','b','c','d'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [3]:
#We can write index number also
obj['b']
#obj[1] 

1.0

In [4]:
#Let us see and compare the result 
obj[1]

1.0

In [5]:
obj[['b','c']]

b    1.0
c    2.0
dtype: float64

In [6]:
obj[1:3]

b    1.0
c    2.0
dtype: float64

In [7]:
obj[[1,2]]

b    1.0
c    2.0
dtype: float64

All three giving same output obj[['b','c']], obj[1:3], obj[[1,2]]

In [8]:
obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [9]:
#Now we will print the subset on the basis of the certain condition
obj[obj>2]

d    3.0
dtype: float64

In [10]:
obj[obj<2]

a    0.0
b    1.0
dtype: float64

In [11]:
obj['b':'d']

b    1.0
c    2.0
d    3.0
dtype: float64

In [12]:
#Now let us try to change the values at index b,c,d
obj['b':'d'] = 8
obj

a    0.0
b    8.0
c    8.0
d    8.0
dtype: float64

In [13]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [14]:
#We will see the dataframe on the basis of some condition
data[data['three']>5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [15]:
data[data<5]

Unnamed: 0,one,two,three,four
Ohio,0.0,1.0,2.0,3.0
Colorado,4.0,,,
Utah,,,,
New York,,,,


We can clearly see the difference between filtering. When we filter on the basis of one of a particular column we have only those records which satisfy that very condition and result are ignored but when we filter for the whole dataframe we get the result from whole dataframe. The values which dont follow that condition will be written as NaN.

In [16]:
#The values which fulfill the given condition will be replaced by 0
data[data<5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


### Selection with loc and iloc

In [17]:
data.loc['Utah']

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [18]:
#accessing one particuar column with two speicific record
data.loc['Utah',['two','three']]

two       9
three    10
Name: Utah, dtype: int64

In [19]:
#accessing one particuar column with three speicific record using index location
data.iloc[2,[3,0,1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

### Arithmetic and Data Alignment

In [20]:
s1 = pd.Series([7.3,5,1.9,-9.1,-8.2],index=['a','b','c','d','e'])
s1

a    7.3
b    5.0
c    1.9
d   -9.1
e   -8.2
dtype: float64

In [21]:
s2 = pd.Series([-7.3,4.6,-3.2,4.3,6.1],index=['a','c','e','f','g'])
s2

a   -7.3
c    4.6
e   -3.2
f    4.3
g    6.1
dtype: float64

In [22]:
s1+s2

a     0.0
b     NaN
c     6.5
d     NaN
e   -11.4
f     NaN
g     NaN
dtype: float64

If we add two series then it will add up where index value matches and put NaN for rest

In [1]:
import numpy as np

In [24]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

In [25]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [26]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [27]:
df1+df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [28]:
df1 = pd.DataFrame({'A':[1,2]})
df2 = pd.DataFrame({'B':[3,4]})

In [29]:
df1

Unnamed: 0,A
0,1
1,2


In [30]:
df2

Unnamed: 0,B
0,3
1,4


In [31]:
df1-df2

Unnamed: 0,A,B
0,,
1,,


Since any of the value doesn't have same column so the arithmetic operation doesnt work

### Arithmetic methods with fill values

In [32]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                   columns=list('abcde'))

In [33]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [34]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [35]:
df2.loc[1,'b'] = np.nan
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [36]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [37]:
df1+df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [38]:
df1.add(df2,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [39]:
df1.reindex(columns=df2.columns)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,
1,4.0,5.0,6.0,7.0,
2,8.0,9.0,10.0,11.0,


In [40]:
df1.reindex(columns=df2.columns,fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


### Operations between DataFrame and Series

In [41]:
arr = np.arange(12.).reshape(3,4)
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [42]:
arr[0]

array([0., 1., 2., 3.])

In [43]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [44]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [45]:
series = frame.iloc[0]
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [46]:
frame-series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [47]:
series2 = pd.Series(range(3),index=['b','e','f'])
series2

b    0
e    1
f    2
dtype: int64

In [48]:
frame+series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [49]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [50]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [51]:
frame.sub(series3,axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


### Function Application and Mapping

In [52]:
frame = pd.DataFrame(np.random.randn(4,3),columns=list('bde'),
                    index=['Utah','Ohio','Texas','Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.354855,-1.353341,-0.840478
Ohio,-2.153233,-1.043858,-0.212221
Texas,-0.150759,0.282252,-0.492615
Oregon,-1.254415,0.862178,0.278508


In [53]:
f = lambda x: x.max() -x.min()
frame.apply(f,axis='columns')

Utah      1.708196
Ohio      1.941012
Texas     0.774867
Oregon    2.116593
dtype: float64

In [54]:
def f(x):
    return pd.Series([x.min(),x.max()], index=['min','max'])

frame.apply(f, axis=1)

Unnamed: 0,min,max
Utah,-1.353341,0.354855
Ohio,-2.153233,-0.212221
Texas,-0.492615,0.282252
Oregon,-1.254415,0.862178


In [55]:
format =  lambda x: '%.2f' %x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.35,-1.35,-0.84
Ohio,-2.15,-1.04,-0.21
Texas,-0.15,0.28,-0.49
Oregon,-1.25,0.86,0.28


In [56]:
frame

Unnamed: 0,b,d,e
Utah,0.354855,-1.353341,-0.840478
Ohio,-2.153233,-1.043858,-0.212221
Texas,-0.150759,0.282252,-0.492615
Oregon,-1.254415,0.862178,0.278508


In [57]:
frame['e'].map(format)

Utah      -0.84
Ohio      -0.21
Texas     -0.49
Oregon     0.28
Name: e, dtype: object

### Sorting and Ranking

In [58]:
obj = pd.Series(range(4), index=['d','a','c','b'])
obj

d    0
a    1
c    2
b    3
dtype: int64

In [59]:
obj.sort_index()

a    1
b    3
c    2
d    0
dtype: int64

In [60]:
obj2 = pd.Series([5,1,6,2], index=['d','a','c','b'])
obj2

d    5
a    1
c    6
b    2
dtype: int64

In [61]:
obj2.sort_values()

a    1
b    2
d    5
c    6
dtype: int64

In [62]:
frame = pd.DataFrame(np.random.rand(2,4), index=['three','one'],columns=['d','a','b','c'])
frame

Unnamed: 0,d,a,b,c
three,0.893877,0.434588,0.647049,0.362254
one,0.221237,0.56711,0.778791,0.421474


In [63]:
#We will perform sorting operation for the indexes. (By default it will work on index)
frame.sort_index()

Unnamed: 0,d,a,b,c
one,0.221237,0.56711,0.778791,0.421474
three,0.893877,0.434588,0.647049,0.362254


In [64]:
#We will perform sorting operation for the columns using axis =1
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,0.434588,0.647049,0.362254,0.893877
one,0.56711,0.778791,0.421474,0.221237


In [65]:
frame.sort_index(axis=1,ascending=False)

Unnamed: 0,d,c,b,a
three,0.893877,0.362254,0.647049,0.434588
one,0.221237,0.421474,0.778791,0.56711


In [67]:
obj3 = pd.Series([np.nan,5,np.nan,6,7,2,3])
print(obj3)
obj3.sort_values()

0    NaN
1    5.0
2    NaN
3    6.0
4    7.0
5    2.0
6    3.0
dtype: float64


5    2.0
6    3.0
1    5.0
3    6.0
4    7.0
0    NaN
2    NaN
dtype: float64

In [72]:
frame = pd.DataFrame({'b':[4,2,0,3],
                      'a':[3.2,1.0,5.1,'a']})
frame

Unnamed: 0,b,a
0,4,3.2
1,2,1
2,0,5.1
3,3,a


In [73]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,0,5.1
1,2,1
3,3,a
0,4,3.2


In [74]:
frame.sort_values(by='a')

TypeError: '<' not supported between instances of 'str' and 'float'

Sorting can not be done for string.

In [75]:
frame = pd.DataFrame({'a':[3,1,0,2],
                     'b':[2.5,4.6,1.2,2.2]})
frame

Unnamed: 0,a,b
0,3,2.5
1,1,4.6
2,0,1.2
3,2,2.2


In [80]:
frame.sort_values(by='a')

Unnamed: 0,a,b
2,0,1.2
1,1,4.6
3,2,2.2
0,3,2.5


In [79]:
frame.sort_values(by=['a','b'])

Unnamed: 0,a,b
2,0,1.2
1,1,4.6
3,2,2.2
0,3,2.5


In [81]:
frame.sort_values(by=['b','a'])

Unnamed: 0,a,b
2,0,1.2
3,2,2.2
0,3,2.5
1,1,4.6


### Axis Indexes with Duplicate Labels

In [83]:
obj = pd.Series(range(5),index=['a','a','b','b','c'])
obj

a    0
a    1
b    2
b    3
c    4
dtype: int64

In [84]:
obj.index.is_unique

False

### Summarizing and Computing Descriptive Statistics

In [86]:
frame= pd.DataFrame([[1.4, np.nan], [7.1, -4.5],
                   [np.nan, np.nan], [0.75, -1.3]],
                  index=['a', 'b', 'c', 'd'],
                  columns=['one', 'two'])
frame

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [87]:
frame.sum() #By default sum is gievn as column

one    9.25
two   -5.80
dtype: float64

In [88]:
frame.sum(axis=1)

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [89]:
frame.mean()

one    3.083333
two   -2.900000
dtype: float64

In [90]:
frame.mean(axis=1)

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64

In [91]:
frame.idxmax()

one    b
two    d
dtype: object

In [92]:
frame

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [93]:
frame.cumsum() #Cumulative sum

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [96]:
frame.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
one,3.0,3.083333,3.493685,0.75,1.075,1.4,4.25,7.1
two,2.0,-2.9,2.262742,-4.5,-3.7,-2.9,-2.1,-1.3


### Unique values, Value Counts and Membership

In [97]:
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [99]:
#We will check all the uniue values
obj.unique()

array(['c', 'a', 'd', 'b'], dtype=object)

In [100]:
objount of every unique value

a    3
c    3
b    2
d    1
dtype: int64

In [104]:
#WE will check the position where b and c are present
mask = obj.isin(['b','c'])
obj[mask]

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [107]:
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [108]:
data.apply(pd.value_counts)

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [3]:
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20 
np.random.seed(12345)
import matplotlib.pyplot as plt

## Handling Missing Data

In [4]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
#We will replace a value with NaN
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [7]:
string_data

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

## Filtering Out Missing Data

In [2]:
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [9]:
data.isnull()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [10]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
#We will see those values which are not null
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
cleaned = data.dropna()
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Any row which is having any null value will be removed.

In [14]:
#Now we will see some changes
cleaned2 = data.dropna(how='all')
cleaned2

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


Now we can see that only one row removed which is having all values as null.

In [16]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [17]:
#We will now make a new column
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [18]:
cleaned3 = data.dropna(axis=1, how='all')
cleaned3

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


The new column has been removed as we want every column to be removed where all the values are NaN

In [5]:
df = pd.DataFrame(np.random.rand(7,3))
df

Unnamed: 0,0,1,2
0,0.282733,0.85066,0.967284
1,0.653733,0.602498,0.961921
2,0.035309,0.194055,0.556738
3,0.59748,0.010613,0.070833
4,0.053385,0.47514,0.072113
5,0.195699,0.428052,0.179134
6,0.994347,0.586946,0.936256


In [6]:
# We will replace the values for one of column till 3rd index with NaN
df.iloc[:4,1] = NA
# We will replace the values for one of column till 2nd index with NaN
df.iloc[:2,2] = NA

In [7]:
df

Unnamed: 0,0,1,2
0,0.282733,,
1,0.653733,,
2,0.035309,,0.556738
3,0.59748,,0.070833
4,0.053385,0.47514,0.072113
5,0.195699,0.428052,0.179134
6,0.994347,0.586946,0.936256


In [8]:
cleaned = df.dropna()
cleaned

Unnamed: 0,0,1,2
4,0.053385,0.47514,0.072113
5,0.195699,0.428052,0.179134
6,0.994347,0.586946,0.936256


Since we have not any condition mentioned so it has removed all the records where there was a single null value

In [9]:
cleaned = df.dropna(thresh=2)
cleaned

Unnamed: 0,0,1,2
2,0.035309,,0.556738
3,0.59748,,0.070833
4,0.053385,0.47514,0.072113
5,0.195699,0.428052,0.179134
6,0.994347,0.586946,0.936256


Now it will not remove where there are atleast two non-null values

## Data Transformation

### Removing Duplicates

In [10]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [11]:
#We will check for the duplcicate data
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [12]:
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two']+['one'],
                     'k2': [1, 1, 2, 3, 3, 4, 4,1]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4
7,one,1


In [14]:
cancel = data.drop_duplicates()

In [15]:
cancel

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [16]:
#We will create a new column 'v1'
data['v1'] = range(8)
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6
7,one,1,7


In [18]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
dtype: bool

Now we don't have any duplicate value.

But wait there are duplicate values in column 'k1'

In [17]:
cancel = data.drop_duplicates(['k1'])
cancel

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [19]:
#We will now drop duplicate value wrt column k2
cancel = data.drop_duplicates(['k2'])
cancel

Unnamed: 0,k1,k2,v1
0,one,1,0
2,one,2,2
3,two,3,3
5,two,4,5


In [22]:
#Now we will remove the duplicate values wrt to 'k1' and 'k2' both
cancel = data.drop_duplicates(['k1','k2'])
cancel

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5


### Transforming data using a Function or Mapping

In [31]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',
                              'Pastrami', 'corned beef', 'Bacon',
                              'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [32]:
meal_to_animal = {
    'bacon': 'pig',
  'pulled pork': 'pig',
  'pastrami': 'cow',
  'corned beef': 'cow',
  'honey ham': 'pig',
  'nova lox': 'salmon'
}
meal_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [26]:
meal_to_animal.keys()

dict_keys(['bacon', 'pulled pork', 'pastrami', 'corned beef', 'honey ham', 'nova lox'])

In [27]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [28]:
data['animal'] = lowercased.map(meal_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [29]:
meal_to_animal

{'bacon': 'pig',
 'pulled pork': 'pig',
 'pastrami': 'cow',
 'corned beef': 'cow',
 'honey ham': 'pig',
 'nova lox': 'salmon'}

In [33]:
data['food'].map(lambda x: meal_to_animal[x.lower()])
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


### Replacing Value

In [2]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [3]:
data.replace(-999.0,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [4]:
data.replace([-999.0,-1000],np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

In [5]:
data.replace([-999.0,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [7]:
data.replace({-999.0:1})

0       1.0
1       1.0
2       2.0
3       1.0
4   -1000.0
5       3.0
dtype: float64

## Renaming Columns and Index

In [14]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [12]:
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [13]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


## Discretization and Bining 

## Lecture - 3

#### Querying a Series

In [2]:
# A pandas series can be queried either by the index position or index label. 
# To query numerical location, starting at zero, we use the iloc attribute. To query by the index label we can use loc attribute

students_classes = {'Alice':'Physics',
                   'Jack':'Chemistry',
                   'Molly':'English',
                   'Sam':'History'}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [3]:
#If we want to see the forth entry we can use iloc attribute with parameter 3
s.iloc[3]

'History'

In [4]:
#If we want to see what subject Molly is having we will use loc attribute
s.loc['Molly']

'English'

In [5]:
#If we pass an integer value, the operator will behave like iloc automatically
s[3]

'History'

In [6]:
#IF we pass in an object it will behave like loc attribute.
s['Molly']

'English'

In [7]:
#But when we have list of integer as our index values then it becomes very confusing so in those cases it is safer to use
#loc and iloc 

#Let's create another series
class_code = {99:'Physics',
             100:'Chemistry',
             101:'History',
             102:'English'}
s = pd.Series(class_code)
s

99       Physics
100    Chemistry
101      History
102      English
dtype: object

In [8]:
#Now if we try to use s[0] we get a key error, because there is no item in the data having 0 index 

#instead we have to use iloc in this case if we want to access first itme of the index
s.iloc[0]

'Physics'

In [9]:
#Let us make a program where we will create a series of grades and then try to find the average grade
grades= pd.Series([40,20,60,70,90,80])
total = 0
for grade in grades:
    total += grade
print(total/len(grades))                  

60.0


In [10]:
#This is a slow and time consuming process, instead of this we will use functons of numpy library
total = np.sum(grades)
print(total/len(grades))

60.0


In [11]:
#The functions which are inside the numpy libraries are much more quicker than the normal loop

In [12]:
#Another feature of the pandas and numpy is we can apply any operation to every value of the series and this function is
#Know as broadcasting. 
#Let us check our grades series
grades

0    40
1    20
2    60
3    70
4    90
5    80
dtype: int64

In [13]:
#Now let the faculty decided togive 10 extra marks to each student 
grades+=10
grades

0     50
1     30
2     70
3     80
4    100
5     90
dtype: int64

In [21]:
#The broadcasting can be performed with the traditional looping also but that will be a long process.

#Pandas does support iterating through a series much like a dictionary, allowing you to unpack the values easily
for key,value in grades.iteritems():
    print(key)
    print(value)
    print("=======")

0
50
1
30
2
70
3
80
4
100
5
90


In [16]:
# #Now we will reduces 5 marks from every student using iteritems function
# for key,item in grades.iteritems():
#     #We will use .at() for this
    
#     grades.set_values(key,item-5)
# #We will now print our grade series and check the result
# grades

AttributeError: 'Series' object has no attribute 'set_values'

In [17]:
#One important thing that .loc attribute not only modify data in place but also add new data as well.
#If the value we pass in .loc is not present in the index then it will create that index value.Indexes can have mixed data types
s = pd.Series([1,2,5,4,6])
s

0    1
1    2
2    5
3    4
4    6
dtype: int64

In [19]:
#Now let us add some index
s.loc['Maths'] = 102

s

0          1
1          2
2          5
3          4
4          6
Maths    102
dtype: int64

In [20]:
#Till now we have seen data where index values were unique but now we will see data where index values will not be unique

student_class = pd.Series({'Ajay':'English','Rohan':'Physics','Sanjay':'Maths',
                          'Ashish':'Biology'})

student_class

Ajay      English
Rohan     Physics
Sanjay      Maths
Ashish    Biology
dtype: object

In [21]:
#Now we will create another series here where the index will not be unique
arpi_class = pd.Series(['Digital Marketing','Supply Chain','English'],index=['Arpita','Arpita','Arpita'])

arpi_class

Arpita    Digital Marketing
Arpita         Supply Chain
Arpita              English
dtype: object

In [22]:
#Now we will join both the series using append function

overall_class = student_class.append(arpi_class)

overall_class

Ajay                English
Rohan               Physics
Sanjay                Maths
Ashish              Biology
Arpita    Digital Marketing
Arpita         Supply Chain
Arpita              English
dtype: object

In [23]:
#There are a couple of important considerations when using .append. First, Pandas will take the series and try to 
#infer the best data types to use. In this example, everything is a string, so there's no problems here. 
#Second, the append method doesn't actually change the underlying series objects. It instead returns a new 
#series which is made up of the two appended together. And this is actually a common pattern in Pandas. 
#By default, returning a new object instead of modifying one in place.

In [24]:
#Finally, we can see that when we query the appended series for Arpita, we don't get a single value, but a series itself.
overall_class.loc['Arpita']

Arpita    Digital Marketing
Arpita         Supply Chain
Arpita              English
dtype: object

## DataFrame Data Structure

In [25]:
## We will create three individual records
record1 = pd.Series({'Name':'Arpita',
                    'Specialisation':'Digital Marketing',
                    'CGPA':9.1})
record2 = pd.Series({'Name':'Dhananjay',
                    'Specialisation':'Data Science',
                    'CGPA':8.5})
record3 = pd.Series({'Name':'Akshay',
                    'Specialisation':'Film',
                    'CGPA':7.3})

#we will print anyone 
record1

Name                         Arpita
Specialisation    Digital Marketing
CGPA                            9.1
dtype: object

In [28]:
#Now we will create a data frame from these series
df = pd.DataFrame([record1,record2,record3], index=['school1','school2','school3'])
df

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school2,Dhananjay,Data Science,8.5
school3,Akshay,Film,7.3


In [2]:
#An alternative way to create a dataframe is we can use a list of dictionaries where each dictionary represents a row or record

students = [{'Name':'Arpita','Specialisation':'Digital Marketing','CGPA':9.1},
           {'Name':'Dhananjay','Specialisation':'Data Science','CGPA':8.5},
           {'Name':'Akshay','Specialisation':'Film','CGPA':7.3}]

#Now we will pass this list of dictionary into the dataframe

df = pd.DataFrame(students,index=['school1','school2','school3'])
df

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school2,Dhananjay,Data Science,8.5
school3,Akshay,Film,7.3


In [3]:
#Similar to series, we can extract the data from a dataframe using loc and iloc 

#If we want to get the data from school2 we will just pass school2 in loc attribiute

#df.iloc[1]
#or

df.loc['school2']
#This will return a series

Name                 Dhananjay
Specialisation    Data Science
CGPA                       8.5
Name: school2, dtype: object

In [5]:
#We will create the data frame again but this time we will take two index values similar
df=pd.DataFrame(students, index=['school1','school1','school2'])
df

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school1,Dhananjay,Data Science,8.5
school2,Akshay,Film,7.3


In [6]:
#Now if we use .loc attribute and pass school1 then we will get a dataframe instead of series
df.loc['school1']

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school1,Dhananjay,Data Science,8.5


In [11]:
#Now if we want to get the names of the students from the school1 than we can pass two values inside .loc 
#one being the row and other for the column

#There are two ways of doing this one will provide us output in form of dataframe and other will provide in the form of series

#df.loc['school1','Name'] This will return series

df.loc[['school1'],['Name']] #This will return the output as dataframe

Unnamed: 0,Name
school1,Arpita
school1,Dhananjay


In [14]:
#So what would we do if we want it to select a single column row? Well, there's a few mechanisms. 
#First, we could transpose the matrix. This pivots all of the rows into columns and all of the columns into rows,
# and it's done with the T attribute.

df.T
 

Unnamed: 0,school1,school1.1,school2
Name,Arpita,Dhananjay,Akshay
Specialisation,Digital Marketing,Data Science,Film
CGPA,9.1,8.5,7.3


In [15]:
#Now if we want to access the names of the students 

df.T.loc['Name']

school1       Arpita
school1    Dhananjay
school2       Akshay
Name: Name, dtype: object

In [16]:
#However, since iloc and loc are used for row selection, Panda reserves the indexing operator directly on 
#the DataFrame for column selection. In a Panda's DataFrame, columns always have a name. So this selection 
#is always label based, and it's not as confusing as it was when using the square bracket operator on the series objects.

df['Name']

school1       Arpita
school1    Dhananjay
school2       Akshay
Name: Name, dtype: object

In [17]:
df

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school1,Dhananjay,Data Science,8.5
school2,Akshay,Film,7.3


In [18]:
# As we saw.loc does row selection, and it can take two parameters, the row index, and the list of column names. 
#The.loc attribute also supports slicing. If we wanted to select all rows, we can use a colon to indicate a full 
#slice from beginning to end. This is just like slicing characters in a list in Python. Then we can add the column
#name as the second parameter as a string. If we wanted to include multiple columns, we could do so in a list, and 
#pandas will bring back only the columns that we've asked for.

df.loc[:,['Name','CGPA']]

Unnamed: 0,Name,CGPA
school1,Arpita,9.1
school1,Dhananjay,8.5
school2,Akshay,7.3


In [19]:
#Let's talk about droping the data. It is easy to delete data in series and DataFrame and we can use drop function to do so
#The function by default takes the a single arguement that is row label or index. The drop function does not change the 
#Dataframe by default but it just sends a copy of the dataframe with the removed rows

df.drop('school1')

Unnamed: 0,Name,Specialisation,CGPA
school2,Akshay,Film,7.3


In [20]:
#Drop has just returned the copy of new dataframe it has not chaned the original dataframe
df

Unnamed: 0,Name,Specialisation,CGPA
school1,Arpita,Digital Marketing,9.1
school1,Dhananjay,Data Science,8.5
school2,Akshay,Film,7.3


In [21]:
#THere are two features of drop first is inplace which is if set to TRUE will change the dataframe permanently, instead 
#of creating a copy, Second option is axis- by defualt its value is 0 which means rows but we can change it to columns (1)

#we will make a copy of the dataframe
df_copy = df.copy()

#Now we will remove Name Column
df_copy.drop('Name',axis=1,inplace=True)
df_copy

Unnamed: 0,Specialisation,CGPA
school1,Digital Marketing,9.1
school1,Data Science,8.5
school2,Film,7.3


In [23]:
#There is another way to drop any row or column, that is del function. Using del function is easy, it will make the changes
#Permanently in the dataframe

del df_copy['CGPA']

In [24]:
df_copy

Unnamed: 0,Specialisation
school1,Digital Marketing
school1,Data Science
school2,Film


In [25]:
#Finally, adding a new column to the DataFrame is as easy as signing it to some value using the indexing operator. 
#For instance, if we wanted to add a class ranking column with default value of None, we could do so by using the 
#assignment operator after the square brackets. This broadcasts the default value to the new column immediately. 

df['Class_Ranking'] = None
df

Unnamed: 0,Name,Specialisation,CGPA,Class_Ranking
school1,Arpita,Digital Marketing,9.1,
school1,Dhananjay,Data Science,8.5,
school2,Akshay,Film,7.3,


## DataFrame Indexing and Loading

In [2]:
#We will look at the csv file now
!cat datasets/Admission_Predict.csv

Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR ,CGPA,Research,Chance of Admit 
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4,4.5,8.87,1,0.76
3,316,104,3,3,3.5,8,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2,3,8.21,0,0.65
6,330,115,5,4.5,3,9.34,1,0.9
7,321,109,3,3,4,8.2,1,0.75
8,308,101,2,3,4,7.9,0,0.68
9,302,102,1,2,1.5,8,0,0.5
10,323,108,3,3.5,3,8.6,0,0.45
11,325,106,3,3.5,4,8.4,1,0.52
12,327,111,4,4,4.5,9,1,0.84
13,328,112,4,4,4.5,9.1,1,0.78
14,307,109,3,4,3,8,1,0.62
15,311,104,3,3.5,2,8.2,1,0.61
16,314,105,3,3.5,2.5,8.3,0,0.54
17,317,107,3,4,3,8.7,0,0.66
18,319,106,3,4,3,8,1,0.65
19,318,110,3,4,3,8.8,0,0.63
20,303,102,3,3.5,3,8.5,0,0.62
21,312,107,3,3,2,7.9,1,0.64
22,325,114,4,3,2,8.4,0,0.7
23,328,116,5,5,5,9.5,1,0.94
24,334,119,5,5,4.5,9.7,1,0.95
25,336,119,5,4,3.5,9.8,1,0.97
26,340,120,5,4.5,4.5,9.6,1,0.94
27,322,109,5,4.5,3.5,8.8,0,0.76
28,298,98,2,1.5,2.5,7.5,1,0.44
29,295,93,1,2,2,7.2,0,0.46
30,310,99

In [27]:
#Now we will read the csv file and convet in into datafram

df = pd.read_csv('datasets/Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [29]:
#The serial number of the students start with 1 and index starts with 0, but we can set the Serial number as index

df= pd.read_csv('datasets/Admission_Predict.csv',index_col='Serial No.')

#or

#df= pd.read_csv('datasets/Admission_Predict.csv',index_col=0)

df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [31]:
#We will check the columns names first
df.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'CGPA',
       'Research', 'Chance of Admit '],
      dtype='object')

In [32]:
#If we look carefully we can see there is a space in LOR. So if we want to rename this column, we have two options

In [35]:
#Now we will rename the names of two columns "LOR" and "SOP". For this we will use rename function of pandas
#It takes columns as a arguments and inside this arguements we will pass a dictionary with key as old column names and values
#as the new names

df_copy = df.copy()

#The first way is to manually write that name and run this
df_copy.rename(columns={'SOP':'Statement of Purpose','LOR ':'Letter of Recommendation'}, inplace=True)
df_copy

#But this way is messy if we have two or three spaces or if is there in more than one columns, instead of this
#we will pass a function and clean this mess. We will use .strip() function which clears all the white spaces

df_copy2 = df.copy()
df_copy2.rename(mapper = str.strip,axis='columns',inplace=True)

df_copy2.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research', 'Chance of Admit'],
      dtype='object')

In [36]:
#Now we will try to change all the column names into lower letter
cols = list(df.columns)

#We will use a list comprehension now to convert all the values into the lower case
cols = [x.lower().strip() for x in cols]

df.columns = cols
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [37]:
df.columns

Index(['gre score', 'toefl score', 'university rating', 'sop', 'lor', 'cgpa',
       'research', 'chance of admit'],
      dtype='object')

## Querying DataFrame

Boolean mask is an array which can be thought of as a one dimension like a series, or two-dimensions like a DataFrame, where each of the values of the array are either true or false. This array is essentially overlaid on top of the other data structure that we're querying, and any cell aligned with the true value will be admitted into our final result, and any cell aligned with the false value will not.

In [3]:
df = pd.read_csv('datasets/Admission_Predict.csv',index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
#First we will fix the names of the columns 
df.columns = [x.lower().strip() for x in df.columns]
df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [6]:
#Let us say we are interested in seeing only those serial number who are having greater than 0.7 chance of admit

admit_mask = df['chance of admit']>0.7

#Now we will see those serial number
admit_mask

Serial No.
1       True
2       True
3       True
4       True
5      False
       ...  
396     True
397     True
398     True
399    False
400     True
Name: chance of admit, Length: 400, dtype: bool

In [7]:
#So now what can we do with the boolean mask once we have created it? 
#We can lay it off on the top of the data that we do not want, which is represented by all of the False values. We do this
#by using the .where() function on the original dataframe

df.where(admit_mask)

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.00,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
5,,,,,,,,
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.90
7,321.0,109.0,3.0,3.0,4.0,8.20,1.0,0.75
8,,,,,,,,
9,,,,,,,,
10,,,,,,,,


In [8]:
#The records where the condition did not met are represented as NaN but they have not dropped those records. 
#If we want we can drop those records as well using .dropna()

df.where(admit_mask).dropna()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.00,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.90
7,321.0,109.0,3.0,3.0,4.0,8.20,1.0,0.75
12,327.0,111.0,4.0,4.0,4.5,9.00,1.0,0.84
13,328.0,112.0,4.0,4.0,4.5,9.10,1.0,0.78
23,328.0,116.0,5.0,5.0,5.0,9.50,1.0,0.94
24,334.0,119.0,5.0,5.0,4.5,9.70,1.0,0.95


In [9]:
#This is a great functionality but where and dropna are not used together much instead of this 
#we can use
df[df['chance of admit']>0.7]

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
6,330,115,5,4.5,3.0,9.34,1,0.90
7,321,109,3,3.0,4.0,8.20,1,0.75
12,327,111,4,4.0,4.5,9.00,1,0.84
13,328,112,4,4.0,4.5,9.10,1,0.78
23,328,116,5,5.0,5.0,9.50,1,0.94
24,334,119,5,5.0,4.5,9.70,1,0.95


In [10]:
#let's talk about combining multiple Boolean masks, such as multiple criteria for including. 
#In bit masking and other places in computer science, this is done with "and". If both masks must be True, 
#for a True value, and to be in the final mask, or "or" if only one needs to be True.

#If we want to compare multiple conditions now then we can use & or | operator

#Let we want to see all those reocords where they have chance of admit between 0.7 to 0.9

df[(df['chance of admit']>0.7)&(df['chance of admit']<0.9)]

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
7,321,109,3,3.0,4.0,8.20,1,0.75
12,327,111,4,4.0,4.5,9.00,1,0.84
13,328,112,4,4.0,4.5,9.10,1,0.78
27,322,109,5,4.5,3.5,8.80,0,0.76
32,327,103,3,4.0,4.0,8.30,1,0.74
36,320,110,5,5.0,5.0,9.20,1,0.88
44,332,117,4,4.5,4.0,9.10,0,0.87


In [15]:
#Instead of using these symbols we can use inbuilt functions of pandas also

df[(df['chance of admit'].gt(0.7)) & (df['chance of admit'].lt(0.9))]

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
7,321,109,3,3.0,4.0,8.20,1,0.75
12,327,111,4,4.0,4.5,9.00,1,0.84
13,328,112,4,4.0,4.5,9.10,1,0.78
27,322,109,5,4.5,3.5,8.80,0,0.76
32,327,103,3,4.0,4.0,8.30,1,0.74
36,320,110,5,5.0,5.0,9.20,1,0.88
44,332,117,4,4.5,4.0,9.10,0,0.87


In [16]:
#There is one more way to look into this
df[df['chance of admit'].gt(0.7).lt(0.9)]

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,314,103,2,2.0,3.0,8.21,0,0.65
8,308,101,2,3.0,4.0,7.90,0,0.68
9,302,102,1,2.0,1.5,8.00,0,0.50
10,323,108,3,3.5,3.0,8.60,0,0.45
11,325,106,3,3.5,4.0,8.40,1,0.52
14,307,109,3,4.0,3.0,8.00,1,0.62
15,311,104,3,3.5,2.0,8.20,1,0.61
16,314,105,3,3.5,2.5,8.30,0,0.54
17,317,107,3,4.0,3.0,8.70,0,0.66
18,319,106,3,4.0,3.0,8.00,1,0.65


## Missing Value

For instance, if you're running a survey and a respondent didn't answer a question, the missing value is actually an omission. This kind of missing data is called "missing at random" if there are other variables that might be used to predict the variable which is missing.

If there's no relationship to other variables, then we call this data "missing completely at random". 

There is one more type of missing value, that is "Not missing at random" in this type there will not be a specific reason for missing value. E.g. if students skipped a question in a questionnaire where they were asked to tell whether or not they used drugs because they feared that they would be expelled from school.

In [2]:
#The Pandas read csv function has a parameter called na values that allows us to specify the format of missing values. 
#It allows scalar string lists or dictionaries to be used.


df = pd.read_csv('datasets/class_grades.csv')
df.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [3]:
#We can use a function .isnull() to create a boolean mask of the whole dataframe.
mask = df.isnull()
mask.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False


In [4]:
#Another useful operations is to be able to drop all of those rows which can be done with dropna() function.
df.dropna().head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0


In [5]:
#Another way to deal with missing data is to fill the missing value, pandas come with a very handly function is. fillna()
#It takes a number of parameters, we can pass a single number to fill all the missing values.

#Let's say in this dataframe we want to replace all the null values with 0.

df.fillna(0,inplace=True)
df.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [7]:
#We will use another data set

df = pd.read_csv('datasets/log.csv')
df

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [8]:
#In this data the first column is a timestamp in the Unix epoch format. The next column is the username followed by a 
#webpage they're visiting and the video that they're playing. Each row of the DataFrame has a playback position. 
#We can see that as the playback position increases by one, the timestamp increases by about 30 seconds, except for user ball.

#next up is the method parameter(). The two common fill values are ffill and bfill. Ffill is for forward filling and 
#it updates an na value for a particular cell with the value from the previous row. bfills for backward filling which 
#is the opposite of that fill. It fills the missing values with the next valid value. It's important to note that your 
#data needs to be sorted in order for this to have the effect you might want.

#In this dataset we wil  sort out our data wrt to timestamp
df = df.set_index('time')
df = df.sort_index()
df.head()

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,


In [10]:
#So if we have a look closely to our data there are different users which share same index, so we need to reset index
#and use some multi-level indexing or time and users together
df = df.reset_index()
df = df.set_index(['time','user'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [11]:
#Now we have our index and it is sorted we will fill the null values by method ffill

df = df.fillna(method='ffill')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974454,sue,advanced.html,24,False,10.0
1469974484,cheryl,intro.html,7,False,10.0


In [13]:
#We can also do customized fill-in to replace the values with the replace() function. It allows repalcement from several
#approaches: value-to-value, list, dictionary, rergex. 

df = pd.DataFrame({'A':[1,2,3,4,5],
                  'B':[5,6,2,1,5],
                  'C':['a','b','c','d','e']})
df

Unnamed: 0,A,B,C
0,1,5,a
1,2,6,b
2,3,2,c
3,4,1,d
4,5,5,e


In [14]:
#We will repalce 1's with 100 by using value-to-value approach
df.replace(1,100)

Unnamed: 0,A,B,C
0,100,5,a
1,2,6,b
2,3,2,c
3,4,100,d
4,5,5,e


In [15]:
#Now let us replace more than one value. For this we will use list approach, we will repalce 1 by 100 and 5 by 500

df.replace([1,5],[100,500])

Unnamed: 0,A,B,C
0,100,500,a
1,2,6,b
2,3,2,c
3,4,100,d
4,500,500,e


In [16]:
#Now we will see regex
df = pd.read_csv('datasets/log.csv')
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [None]:
#Let we want to replace all the values in the video column with the webpage
df.replace(to_replace='.*')