 # Series Data structures

In [1]:
import pandas as pd

## Series from list

In [2]:
students = ['Alice','Jack','Molly']
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
marks = [234,556,613]
pd.Series(marks)

0    234
1    556
2    613
dtype: int64

In [4]:
# dtype is int64

In [5]:
students = ['Alice','Jack',None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [6]:
marks = [234,556,None]
pd.Series(marks)

# Dtype to floating point number
# NaN -> Floating point number
# so, it went ahead and converted the ints to floats

0    234.0
1    556.0
2      NaN
dtype: float64

In [8]:
import numpy as np
np.nan == None

False

In [9]:
np.nan == np.nan

False

In [10]:
np.isnan(np.nan)

True

## Series from dictionary

In [284]:
student_scores = {'Alice':'Physics',
                 'Jack':'Chemistry',
                  'Molly':'English'}
s = pd.Series(student_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [285]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [286]:
# When creating a series using a dictionry and you provide wrong index, it will put NaN in the value

s = pd.Series(student_scores, index=['Alice','Jack','Sam'])
s

Alice      Physics
Jack     Chemistry
Sam            NaN
dtype: object

In [287]:
pd.isnull(s)

Alice    False
Jack     False
Sam       True
dtype: bool

In [288]:
import math
math.isnan(s['Sam'])

True

In [289]:
s['Sam'] == None

False

## Series from Tuple

In [15]:
students = [("Alice","Brown"),("Jack","white"),("Molly","blue")]

In [16]:
pd.Series(students)

0    (Alice, Brown)
1     (Jack, white)
2     (Molly, blue)
dtype: object

## Explicit index for a list

In [17]:
s = pd.Series(['Physics','Chemistry','English'],index=['Alice','Jack','Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

# Querying a series

In [None]:
# index posiiton or index label

# numeric - iloc
# label - loc



In [22]:
student_classes = {'Alice':'Physics',
                 'Jack':'Chemistry',
                  'Molly':'English',
                  'Sam':'Computer'}
s = pd.Series(student_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam       Computer
dtype: object

In [23]:
s.iloc[3]

'Computer'

In [24]:
s.loc['Molly']

'English'

In [25]:
# indexing operator works on context
# if you pass numeric, it works as iloc
s[3]

'Computer'

In [26]:
# if you pass object "Molly", it works as a loc
s['Molly']

'English'

In [29]:
# what if index is a list of integers -> Confuses Pandas
class_code = {99:'Physics',100:'Chemistry',101:'English',102:'Computer'}
s = pd.Series(class_code)
s

99       Physics
100    Chemistry
101      English
102     Computer
dtype: object

In [30]:
# s[0] results in error here
s.iloc[0] # but this works

'Physics'

# Manipulation

In [33]:
# average grade

grades = pd.Series([90,80,70,60])
total = 0
for grade in grades:
    total+=grade
print(total/len(grades))

75.0


In [None]:
# the above method is slow
# modern computers can do tasks simultaneously
# Pandas and numpy support a method of computation called vectorization

In [34]:
total = np.sum(grades)
print(total/len(grades))

75.0


In [36]:
# let's check if it's faster

numbers = pd.Series(np.random.randint(0,1000,10000))

In [37]:
numbers.head()

0    655
1    852
2    525
3     18
4    507
dtype: int64

In [42]:
%%timeit -n 100
total = 0
for number in numbers:
    total+=grade
total/len(numbers)

1.3 ms ± 96.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [43]:
%%timeit -n 100
total = 0
total = np.sum(numbers)
total/len(numbers)

73.7 µs ± 11.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## .loc to add new data

In [45]:
# Indices have mixed types
s = pd.Series([1,2,3])
s.loc['History'] = 102
s

0            1
1            2
2            3
History    102
dtype: int64

## Index not unique

In [50]:
student_classes = pd.Series({'Alice':'Physics',
                 'Jack':'Chemistry',
                  'Molly':'English',
                 'Sam':'Computer'})
student_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam       Computer
dtype: object

In [51]:
kelly_classes = pd.Series(['Philosophy','Arts','Math'],index = ['Kelly','Kelly','Kelly'])

In [52]:
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [53]:
all_students = student_classes.append(kelly_classes)
all_students

Alice       Physics
Jack      Chemistry
Molly       English
Sam        Computer
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [54]:
# append function doesn't actually change the underlying series but returns a new series 
# common pattern in pandas
student_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam       Computer
dtype: object

In [55]:
all_students.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

# DataFrame

In [56]:
record1 = pd.Series({'Name':'Alice', 'Class':'Physics', 'Score':85})
record2 = pd.Series({'Name':'Jack', 'Class':'Chemistru', 'Score':82})
record3 = pd.Series({'Name':'Helen', 'Class':'Biology', 'Score':90})

In [58]:
# Df is indexed
df = pd.DataFrame([record1,record2,record3],
                 index = ['school1','school2','school3'])
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistru,82
school3,Helen,Biology,90


In [60]:
students = [{'Name':'Alice', 'Class':'Physics', 'Score':85},
           {'Name':'Jack', 'Class':'Chemistru', 'Score':82},
           {'Name':'Helen', 'Class':'Biology', 'Score':90}]

In [61]:
df = pd.DataFrame(students,
                 index = ['school1','school2','school3'])
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistru,82
school3,Helen,Biology,90


## loc iloc attributes

In [65]:
# loc - if we pass single value, it returns the series if there's only one row to return

df.loc['school2']
# name of the series is returned as the index value

Name          Jack
Class    Chemistru
Score           82
Name: school2, dtype: object

In [66]:
type(df.loc['school2'])

pandas.core.series.Series

In [69]:
# returns second series in the df
df.iloc[1]

Name          Jack
Class    Chemistru
Score           82
Name: school2, dtype: object

In [72]:
# let's add one more series
newstudent = pd.DataFrame({'Name':'Alison', 'Class':'Computer', 'Score':99},index=['school1'])
newstudent

Unnamed: 0,Name,Class,Score
school1,Alison,Computer,99


In [73]:
dfnew = df.append(newstudent)
dfnew

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistru,82
school3,Helen,Biology,90
school1,Alison,Computer,99


In [76]:
dfnew.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Alison,Computer,99


In [78]:
type(dfnew.loc['school1'])
# when there are more than two rows for a query, it returns a dataframe

pandas.core.frame.DataFrame

### loc attribute allows to access data based on multiple axes

In [89]:
# Returns series
dfnew.loc['school1','Name']

school1     Alice
school1    Alison
Name: Name, dtype: object

In [90]:
# returns string
df.loc['school1','Name']

'Alice'

In [104]:
df.iloc[1]

Name          Jack
Class    Chemistru
Score           82
Name: school2, dtype: object

# Select single column?

In [95]:
# transpose
df.T

Unnamed: 0,school1,school2,school3
Name,Alice,Jack,Helen
Class,Physics,Chemistru,Biology
Score,85,82,90


In [96]:
df.T.loc['Name']

school1    Alice
school2     Jack
school3    Helen
Name: Name, dtype: object

In [99]:
# loc and iloc are for row use
# for dataframe, panda reserves the indexing operator for column selection

In [100]:
dfnew

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistru,82
school3,Helen,Biology,90
school1,Alison,Computer,99


In [103]:
df['Name']
# df.loc['Name'] will give error, since no row having label 'Name'

school1    Alice
school2     Jack
school3    Helen
Name: Name, dtype: object

In [108]:
dfnew.loc['school1',['Name','Class']]

Unnamed: 0,Name,Class
school1,Alice,Physics
school1,Alison,Computer


### if we want to select all rows, we can use colon : (just like string slicing)

In [109]:
df.loc[:,['Name']]

Unnamed: 0,Name
school1,Alice
school2,Jack
school3,Helen


## Drop

In [111]:
dfnew.drop('school1')

Unnamed: 0,Name,Class,Score
school2,Jack,Chemistru,82
school3,Helen,Biology,90


In [112]:
# didn't change the actual dfnew
dfnew

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistru,82
school3,Helen,Biology,90
school1,Alison,Computer,99


In [113]:
# two optional parameters -
# inplace - dataframe will be updated, instead of copy being returned
# Axis - which should be dropped, by default it's 0 -> row axes. If we want to drop column -> 1

In [119]:
dfx = dfnew.copy()

In [120]:
dfx.drop("Name", inplace=True,axis=1)
dfx

Unnamed: 0,Class,Score
school1,Physics,85
school2,Chemistru,82
school3,Biology,90
school1,Computer,99


In [121]:
del dfx['Class']
dfx

Unnamed: 0,Score
school1,85
school2,82
school3,90
school1,99


## add column

In [124]:
# use indexing 
dfnew['ClassRank'] = None
dfnew

Unnamed: 0,Name,Class,Score,ClassRank
school1,Alice,Physics,85,
school2,Jack,Chemistru,82,
school3,Helen,Biology,90,
school1,Alison,Computer,99,


# DF indexing and loading

In [129]:
# !cat Admission_Predict.csv
# Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR ,CGPA,Research,Chance of Admit 

In [130]:
df = pd.read_csv('Admission_Predict.csv')
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [133]:
# now it treats serial no from the dataset as the index
df = pd.read_csv('Admission_Predict.csv',index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [143]:
new_df = df.rename(columns={'GRE Score':'GRE','TOEFL Score':'TOEFL','University Rating':'Uni Rate',
                           'SOP':'SOP','LOR':'LOR','CGPA':'CGPA','Research':'Research','Chance of Admit ':'Admit Prediction'
                           })

In [144]:
new_df.head()

Unnamed: 0_level_0,GRE,TOEFL,Uni Rate,SOP,LOR,CGPA,Research,Admit Prediction
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [145]:
new_df.columns

Index(['GRE', 'TOEFL', 'Uni Rate', 'SOP', 'LOR ', 'CGPA', 'Research',
       'Admit Prediction'],
      dtype='object')

In [147]:
# we can also clean the columns by passing a function
new_df = new_df.rename(mapper=str.strip, axis='columns')
new_df.columns

# notice LOR

Index(['GRE', 'TOEFL', 'Uni Rate', 'SOP', 'LOR', 'CGPA', 'Research',
       'Admit Prediction'],
      dtype='object')

In [148]:
cols = list(df.columns)

In [149]:
cols = [x.lower().strip() for x in cols]
df.columns = cols

In [150]:
df.columns

Index(['gre score', 'toefl score', 'university rating', 'sop', 'lor', 'cgpa',
       'research', 'chance of admit'],
      dtype='object')

# Querying a df

## Boolean masking

In [161]:
df = pd.read_csv('Admission_Predict.csv',index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [162]:
cols = [x.lower().strip() for x in df.columns]
df.columns = cols

df.head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [163]:
admit_mask = df['chance of admit'] > 0.7
admit_mask.head()

Serial No.
1     True
2     True
3     True
4     True
5    False
Name: chance of admit, dtype: bool

### Now we can lay it on top of data, to hide unwanted data

In [164]:
df.where(admit_mask).head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
5,,,,,,,,


In [165]:
df.where(admit_mask).dropna().head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,316.0,104.0,3.0,3.0,3.5,8.0,1.0,0.72
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.8
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.9


In [166]:
df[df['chance of admit'] > 0.7].head()

Unnamed: 0_level_0,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
6,330,115,5,4.5,3.0,9.34,1,0.9


## Indexing

In [167]:
df['gre score'].head()

Serial No.
1    337
2    324
3    316
4    322
5    314
Name: gre score, dtype: int64

In [169]:
df[['gre score','lor']].head()

Unnamed: 0_level_0,gre score,lor
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,337,4.5
2,324,4.5
3,316,3.5
4,322,2.5
5,314,3.0


## Combining multiple bool masks

In [174]:
# Paranthesis are necessary
((df['gre score'] > 0.7) & (df['gre score'] < 0.9)).head()

Serial No.
1    False
2    False
3    False
4    False
5    False
Name: gre score, dtype: bool

In [177]:
# Another way
(df['gre score'].gt(0.7) & df['gre score'].lt(0.9)).head()

Serial No.
1    False
2    False
3    False
4    False
5    False
Name: gre score, dtype: bool

# Indexing

In [178]:
df = pd.read_csv('Admission_Predict.csv', index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [180]:
# let's set index to chance of admit instead of Serial no.
# But preserve the current index
df['Serial Number'] = df.index
df = df.set_index('Chance of Admit ')
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
Chance of Admit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.92,337,118,4,4.5,4.5,9.65,1,1
0.76,324,107,4,4.0,4.5,8.87,1,2
0.72,316,104,3,3.0,3.5,8.0,1,3
0.8,322,110,3,3.5,2.5,8.67,1,4
0.65,314,103,2,2.0,3.0,8.21,0,5


In [181]:
df = df.reset_index()
df.head()

Unnamed: 0,Chance of Admit,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Serial Number
0,0.92,337,118,4,4.5,4.5,9.65,1,1
1,0.76,324,107,4,4.0,4.5,8.87,1,2
2,0.72,316,104,3,3.0,3.5,8.0,1,3
3,0.8,322,110,3,3.5,2.5,8.67,1,4
4,0.65,314,103,2,2.0,3.0,8.21,0,5


## Multi-level indxing
- Composite key in relational db
- set index with a list of columns

In [183]:
df = pd.read_csv('census.csv')
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [184]:
# There are two summarized levels - summary data for the whole country & summary data for each state
df['SUMLEV'].unique()

array([40, 50])

In [186]:
# 40 and 50
# Let's exclude all of rows at state level and just keep county data
df = df[df['SUMLEV'] == 50]
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
5,50,3,6,1,9,Alabama,Blount County,57322,57322,57373,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [188]:
columns_to_keep = ['STNAME','CTYNAME','BIRTHS2010','BIRTHS2011','BIRTHS2012','BIRTHS2013','BIRTHS2014','BIRTHS2015',
                  'POPESTIMATE2012','POPESTIMATE2013','POPESTIMATE2014','POPESTIMATE2015',]
df = df[columns_to_keep]
df.head()

Unnamed: 0,STNAME,CTYNAME,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
1,Alabama,Autauga County,151,636,615,574,623,600,55175,55038,55290,55347
2,Alabama,Baldwin County,517,2187,2092,2160,2186,2240,190396,195126,199713,203709
3,Alabama,Barbour County,70,335,300,283,260,269,27159,26973,26815,26489
4,Alabama,Bibb County,44,266,245,259,247,253,22642,22512,22549,22583
5,Alabama,Blount County,183,744,710,646,618,603,57776,57734,57658,57673


In [189]:
# The data above has two indexes -> STNAME and CTYNAME

In [191]:
df = df.set_index(['STNAME','CTYNAME'])
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Alabama,Autauga County,151,636,615,574,623,600,55175,55038,55290,55347
Alabama,Baldwin County,517,2187,2092,2160,2186,2240,190396,195126,199713,203709
Alabama,Barbour County,70,335,300,283,260,269,27159,26973,26815,26489
Alabama,Bibb County,44,266,245,259,247,253,22642,22512,22549,22583
Alabama,Blount County,183,744,710,646,618,603,57776,57734,57658,57673


## Query this data

In [195]:
df.loc['Michigan','Washtenaw County']
# type(df.loc['Michigan','Washtenaw County'])

BIRTHS2010            977
BIRTHS2011           3826
BIRTHS2012           3780
BIRTHS2013           3662
BIRTHS2014           3683
BIRTHS2015           3709
POPESTIMATE2012    351213
POPESTIMATE2013    354289
POPESTIMATE2014    357029
POPESTIMATE2015    358880
Name: (Michigan, Washtenaw County), dtype: int64

In [196]:
# Comparison of two counties
df.loc[ [
            ('Michigan','Washtenaw County'),
            ('Michigan','Wayne County')
            ]
        ]

Unnamed: 0_level_0,Unnamed: 1_level_0,BIRTHS2010,BIRTHS2011,BIRTHS2012,BIRTHS2013,BIRTHS2014,BIRTHS2015,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Michigan,Washtenaw County,977,3826,3780,3662,3683,3709,351213,354289,357029,358880
Michigan,Wayne County,5918,23819,23270,23377,23607,23586,1792514,1775713,1766008,1759335


# Missing values

In [197]:
df = pd.read_csv('class_grades.csv')
df.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


## is null

In [198]:
mask = df.isnull()
mask.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False


## Drop na

In [199]:
df.dropna().head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0


## Fillna

In [202]:
# Allows you to change all missing values to one scalar value
df.fillna(0,inplace=True)
df.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,0.0,63.15,48.89
3,7,0.0,0.0,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


## na_filter

In [None]:
# option use to turn off whitespace filtering
# if whitespace is an actual value of interest (RARE)
# If data is without any NAs, passing na_filter = False can improve performance while reading a large file

In [203]:
df = pd.read_csv('log.csv')
df

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


## Method parameters - ffill and bfill

- ffill is for forward filling and it updates an NaN for a particular cell with a value from the previous row
- bfill is backward filling
- Data needs to be sorted for this to work

### Sort either by index or timestamp

In [204]:
df = df.set_index('time')
df = df.sort_index()
df

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,
1469974514,cheryl,intro.html,8,,
1469974524,sue,advanced.html,25,,
1469974544,cheryl,intro.html,9,,
1469974554,sue,advanced.html,26,,
1469974574,cheryl,intro.html,10,,


In [205]:
# as we can see timestamp can be same when two users are using the system
# Let's use multi-level indexing


In [336]:
# df = df.reset_index()
# df = df.set_index(['time','user'])
# df.head()

In [207]:
df = df.fillna(method = 'ffill')
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974454,sue,advanced.html,24,False,10.0
1469974484,cheryl,intro.html,7,False,10.0


## customized fill-in using replace()

In [208]:
df = pd.DataFrame({'A':[1,1,2,3,4],
                   'B':[3,6,3,8,9],
                   'C':['a','b','c','d','e']})
df

Unnamed: 0,A,B,C
0,1,3,a
1,1,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [209]:
# we can replace 1 with 100
df.replace(1,100)

Unnamed: 0,A,B,C
0,100,3,a
1,100,6,b
2,2,3,c
3,3,8,d
4,4,9,e


In [210]:
# Changing 1 to 100, 3 to 300
df.replace([1,3],[100,300])

Unnamed: 0,A,B,C
0,100,300,a
1,100,6,b
2,2,300,c
3,300,8,d
4,4,9,e


In [211]:
df = pd.read_csv('log.csv')
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [212]:
# replace using regex
# first parameter to replace the pattern we want to match
# second parameter is the value we want to emit upon match
# third parameter = regex = True

In [214]:
df.replace("[s]*.html"," webpage",regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro webpage,5,False,10.0
1,1469974454,cheryl,intro webpage,6,,
2,1469974544,cheryl,intro webpage,9,,
3,1469974574,cheryl,intro webpage,10,,
4,1469977514,bob,intro webpage,1,,
5,1469977544,bob,intro webpage,1,,
6,1469977574,bob,intro webpage,1,,
7,1469977604,bob,intro webpage,1,,
8,1469974604,cheryl,intro webpage,11,,
9,1469974694,cheryl,intro webpage,14,,


In [215]:
df.replace(".*.html$"," webpage",regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


# Manipulating DataFrames - Data cleaning process

In [254]:
df = pd.read_csv('presidents.csv')
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days"
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days"
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days"
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days"
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days"


In [238]:

# Let's separate out firstname and lastname
df['First'] = df['President']

# take anything after space, replace it with nothing
df['First'] = df['First'].replace('[ ].*','',regex=True)
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James


In [239]:
# Alternative -> Apply function

del(df['First'])
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days"
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days"
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days"
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days"
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days"


In [246]:
# Apply function takes some arbitrary function and apply to Series or DataFrame across all rows or columns
# Let's write a function to split the string into two
def splitname(row):
    # Single row series 
    row['First'] = row['President'].split(" ")[0]
    row['Last'] = row['President'].split(" ")[-1]
    return row

In [247]:
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days"
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days"
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days"
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days"
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days"


In [249]:
# Apply this
df = df.apply(splitname, axis='columns')
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
2,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
3,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
4,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington


# It didn't work above, Why?

## Extract function
- Takes a regular expression as input and requires you to set capture groups that correspond to the output columns you are interested in

In [268]:
# My take 
# pattern = "(?P<First>[\w]*)(?:.* )(?P<Last>[\w]*)"

# Answer
pattern = "(^[\w]*)(?:.* )([\w]*$)"
df['President'].str.extract(pattern).head()

Unnamed: 0,0,1
0,George,Washington
1,John,Adams
2,Thomas,Jefferson
3,James,Madison
4,James,Monroe


In [263]:
pattern = "(?P<First>^[\w]*)(?:.* )(?P<Last>[\w]*$)"
names = df['President'].str.extract(pattern).head()

In [264]:
df['First']= names['First']
df['Last']= names['Last']

In [265]:
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732[a]","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735[a]","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743[a]","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751[a]","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [269]:
# Let's clean the born column
# Let's remove footnotes in the born column

In [277]:
df['Born']=df['Born'].str.extract("(?P<born>[\w]{3} [\w]{1,2}, [\d]{4})").head()

In [278]:
df.head()

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,"Feb 22, 1732","57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,"Oct 30, 1735","61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,"Apr 13, 1743","57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,"Mar 16, 1751","57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,"Apr 28, 1758","58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe


In [282]:
# If we check the type of born column, it's object or str
type(df['Born'][0])

str

In [283]:
# But pandas has datetime feature
df["Born"] = pd.to_datetime(df["Born"])
df["Born"].head()

0   1732-02-22
1   1735-10-30
2   1743-04-13
3   1751-03-16
4   1758-04-28
Name: Born, dtype: datetime64[ns]

# Quiz 2

In [290]:
d = {'1':'a','2':'b','3':'c','4':'d','5':'e'}
s = pd.Series(d)
s

1    a
2    b
3    c
4    d
5    e
dtype: object

In [291]:
s.iloc[0:3]

1    a
2    b
3    c
dtype: object

In [294]:
s.iloc[0:2]

1    a
2    b
dtype: object

Unnamed: 0,#,President,Born,Age atstart of presidency,Age atend of presidency,Post-presidencytimespan,Died,Age,First,Last
0,1,George Washington,1732-02-22,"57 years, 67 daysApr 30, 1789","65 years, 10 daysMar 4, 1797","2 years, 285 days","Dec 14, 1799","67 years, 295 days",George,Washington
1,2,John Adams,1735-10-30,"61 years, 125 daysMar 4, 1797","65 years, 125 daysMar 4, 1801","25 years, 122 days","Jul 4, 1826","90 years, 247 days",John,Adams
2,3,Thomas Jefferson,1743-04-13,"57 years, 325 daysMar 4, 1801","65 years, 325 daysMar 4, 1809","17 years, 122 days","Jul 4, 1826","83 years, 82 days",Thomas,Jefferson
3,4,James Madison,1751-03-16,"57 years, 353 daysMar 4, 1809","65 years, 353 daysMar 4, 1817","19 years, 116 days","Jun 28, 1836","85 years, 104 days",James,Madison
4,5,James Monroe,1758-04-28,"58 years, 310 daysMar 4, 1817","66 years, 310 daysMar 4, 1825","6 years, 122 days","Jul 4, 1831","73 years, 67 days",James,Monroe
5,6,John Quincy Adams,NaT,"57 years, 236 daysMar 4, 1825","61 years, 236 daysMar 4, 1829","18 years, 356 days","Feb 23, 1848","80 years, 227 days",,
6,7,Andrew Jackson,NaT,"61 years, 354 daysMar 4, 1829","69 years, 354 daysMar 4, 1837","8 years, 96 days","Jun 8, 1845","78 years, 85 days",,
7,8,Martin Van Buren,NaT,"54 years, 89 daysMar 4, 1837","58 years, 89 daysMar 4, 1841","21 years, 142 days","Jul 24, 1862","79 years, 231 days",,
8,9,William H. Harrison,NaT,"68 years, 23 daysMar 4, 1841","68 years, 54 days Apr 4, 1841[b]",,"Apr 4, 1841","68 years, 54 days",,
9,10,John Tyler,NaT,"51 years, 6 daysApr 4, 1841","54 years, 340 daysMar 4, 1845","16 years, 320 days","Jan 18, 1862","71 years, 295 days",,


In [297]:
df = pd.read_csv('Admission_Predict.csv',index_col=0)
df.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


In [302]:
df.where(df['TOEFL Score']>105)

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
3,,,,,,,,
4,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
5,,,,,,,,
...,...,...,...,...,...,...,...,...
396,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
397,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
398,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91
399,,,,,,,,


In [305]:
df.drop('TOEFL Score',axis=1)

Unnamed: 0_level_0,GRE Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,337,4,4.5,4.5,9.65,1,0.92
2,324,4,4.0,4.5,8.87,1,0.76
3,316,3,3.0,3.5,8.00,1,0.72
4,322,3,3.5,2.5,8.67,1,0.80
5,314,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...
396,324,3,3.5,3.5,9.04,1,0.82
397,325,3,3.0,3.5,9.11,1,0.84
398,330,4,5.0,4.5,9.45,1,0.91
399,312,3,3.5,4.0,8.78,0,0.67


In [309]:
s

1    a
2    b
3    c
4    d
5    e
dtype: object

In [317]:
df[(df['GRE Score'] > 300) & (df['GRE Score'] < 330)]

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.00,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.80
5,314,103,2,2.0,3.0,8.21,0,0.65
7,321,109,3,3.0,4.0,8.20,1,0.75
...,...,...,...,...,...,...,...,...
394,317,104,2,3.0,3.0,8.76,0,0.77
395,329,111,4,4.5,4.0,9.23,1,0.89
396,324,110,3,3.5,3.5,9.04,1,0.82
397,325,107,3,3.0,3.5,9.11,1,0.84


In [327]:
di = [{'Major':'Maths','Name':'Alice','Age':20,'Gender':'F'},
      {'Major':'Social','Name':'Jack','Age':25,'Gender':'M'}]
xdf = pd.DataFrame(di)
xdf

Unnamed: 0,Major,Name,Age,Gender
0,Maths,Alice,20,F
1,Social,Jack,25,M


In [329]:
xdf = xdf.set_index('Major')