# Indexing dataframes

In [362]:
import pandas as pd

In [363]:
# Create a dataframe from a list of dictionaries
students = [{"Name": "John", "Last Name": "Qusak", "Class": "Physics", "Grade":4.5},
            {"Name": "Jack", "Last Name": "Black", "Class": "Chemistry", "Grade":5.5},
            {"Name": "Jill", "Last Name": "Sanders", "Class": "Maths", "Grade":8.5}
           ]

# If index is not set then by default is set to an integer value
df = pd.DataFrame(students)
df

Unnamed: 0,Class,Grade,Last Name,Name
0,Physics,4.5,Qusak,John
1,Chemistry,5.5,Black,Jack
2,Maths,8.5,Sanders,Jill


In [364]:
# Indexing using loc
# Remember loc in Series was used to find Series items by index
# In dataframe loc is used pretty much the same way, in order to find items in the two dimensional structure
# loc searches by row, and each row is identified by index, so passing the value of the index
# in loc attribute retrieves the entire row as Series object
print(df.loc[1])
print(type(df.loc[1]))

Class        Chemistry
Grade              5.5
Last Name        Black
Name              Jack
Name: 1, dtype: object
<class 'pandas.core.series.Series'>


In [365]:
# for selecting specific columns of one row the second parameter of the loc attribute
# may accept a list of the projected columns indentified by column name, i.e label
df.loc[2, ["Last Name", "Class", "Grade"]]

Last Name    Sanders
Class          Maths
Grade            8.5
Name: 2, dtype: object

In [366]:
# We can slice a dataframe by using the loc attribute and the colon (i.e. ':')
df.loc[0:1, ["Last Name", "Class", "Grade"]]

Unnamed: 0,Last Name,Class,Grade
0,Qusak,Physics,4.5
1,Black,Chemistry,5.5


In [367]:
# dropping a columns using the drop attribute
# Points to remember:
# a. drop creates a copy of the original dataframe 
# b. dropping by default occurs on the x axis (or is row based). If we want to drop a column use the axis parameter
# c. the inplace parameter is used to replace the original dataframe
copy_df = df.drop("Name", axis=1)
print(copy_df)
print(df)

       Class  Grade Last Name
0    Physics    4.5     Qusak
1  Chemistry    5.5     Black
2      Maths    8.5   Sanders
       Class  Grade Last Name  Name
0    Physics    4.5     Qusak  John
1  Chemistry    5.5     Black  Jack
2      Maths    8.5   Sanders  Jill


In [368]:
# Or put it nicely, if we do the following drop NOTHING occurs to the original dataframe
df.drop("Name", axis=1)
df

Unnamed: 0,Class,Grade,Last Name,Name
0,Physics,4.5,Qusak,John
1,Chemistry,5.5,Black,Jack
2,Maths,8.5,Sanders,Jill


In [369]:
# BUT this one does
df.drop("Name", inplace=True, axis=1)
df

Unnamed: 0,Class,Grade,Last Name
0,Physics,4.5,Qusak
1,Chemistry,5.5,Black
2,Maths,8.5,Sanders


# Pandas and CSV files (1)

In [370]:
# create dataframe from csv file
df = pd.read_csv("../datasets/Admission_Predict.csv")
df.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [371]:
# iterate all column names and change their value to lowe case
cols = list(df.columns)
# cols
cols = [x.lower().strip() for x in cols]
# cols
# and apply that to the dataframe
# another way to rename columns is using the rename function, which takes a dictionary as attribute
df.columns = cols
df.head()

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [372]:
# let's change the index of the dataframe to a specific column, e.g. "serial no."
df_new_index = pd.read_csv("../datasets/Admission_Predict.csv", index_col=0)
df_new_index.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337,118,4,4.5,4.5,9.65,1,0.92
2,324,107,4,4.0,4.5,8.87,1,0.76
3,316,104,3,3.0,3.5,8.0,1,0.72
4,322,110,3,3.5,2.5,8.67,1,0.8
5,314,103,2,2.0,3.0,8.21,0,0.65


# Using Boolean Masks

In [373]:
# Let's create a condition on a column of the dataframe that evaluates to true / false 
# and assign it to a Series object
# Careful on the changed dataframe its not df BUT df_new_index
# See that the index '2' to the row with "Serial No." eaqual to '2'
df_new_index.loc[2]
# additionally see that there is no index zero
# df_new_index.loc[0] # this result to keyError

GRE Score            324.00
TOEFL Score          107.00
University Rating      4.00
SOP                    4.00
LOR                    4.50
CGPA                   8.87
Research               1.00
Chance of Admit        0.76
Name: 2, dtype: float64

In [374]:
# Create a boolean condition on the column e.g. CGPA and assign it to a Series object
# Note: No need to explicitly state that s is a Series becuase one column of a dataframe is a Series
s = df_new_index['CGPA'] > 9.23
s

Serial No.
1       True
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23      True
24      True
25      True
26      True
27     False
28     False
29     False
30     False
       ...  
371    False
372    False
373     True
374    False
375    False
376    False
377    False
378    False
379    False
380    False
381    False
382    False
383    False
384    False
385     True
386     True
387    False
388    False
389    False
390    False
391    False
392    False
393    False
394    False
395    False
396    False
397    False
398     True
399    False
400     True
Name: CGPA, Length: 400, dtype: bool

In [375]:
# and now apply the boolean condition to the dataframw
# the where function returns NaN for the rows that are False - i.e. do not fullfil the condition
# so we need to remove those rows to get the final result
# and make the changes permanent with inplace = True
df_new_index.where(s, inplace=True)
df_new_index.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
2,,,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,,,


In [376]:
# and not drop the NaN
# Note: If we add the dropna() in the previous cell it returns wtih a NoneObject error. Have no idea why...
# Found it! the head() function cannot go together with dropna() and inplace parameter. Still no idea why... 
df_new_index.dropna(inplace=True)
df_new_index.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
6,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.9
23,328.0,116.0,5.0,5.0,5.0,9.5,1.0,0.94
24,334.0,119.0,5.0,5.0,4.5,9.7,1.0,0.95
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97


In [377]:
# now let's do filtering based on multiple conditions and using a shortcut notation
# which though is not so clear as where.
df_copy = df_new_index

# find the ones with chance of admission greater than 0.94 AND TOEFL Score greater than 115.0
# Careful on the Parenthesis:
# The problem is that Python is trying to bitwise and a 0.7 and a pandas dataframe, when you really want
# to bitwise and the broadcasted dataframes together
df_copy = df_copy[(df_copy['TOEFL Score'] > 115.0) & (df_copy['Chance of Admit '] > 0.94)]
df_copy.head()

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
24,334.0,119.0,5.0,5.0,4.5,9.7,1.0,0.95
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97
82,340.0,120.0,4.0,5.0,5.0,9.5,1.0,0.96
144,340.0,120.0,4.0,4.5,4.0,9.92,1.0,0.97
149,339.0,116.0,4.0,4.0,3.5,9.8,1.0,0.96


In [378]:
# Finally let's see the gt and lt operators which are shorthand for the previous notation

# Let's filter even more the df_copy to the ones with GRE Score between 334.0 and 338.0
df_copy = df_copy[df_copy['GRE Score'].gt(334.0) & df_copy['GRE Score'].lt(340.0)]
df_copy

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97
149,339.0,116.0,4.0,4.0,3.5,9.8,1.0,0.96
213,338.0,120.0,4.0,5.0,5.0,9.66,1.0,0.95
373,336.0,119.0,4.0,4.5,4.0,9.62,1.0,0.95
386,335.0,117.0,5.0,5.0,5.0,9.82,1.0,0.96


In [379]:
# another example using attribute between
df_copy = df_copy[df_copy['GRE Score'].between(335.0,337.0)]
df_copy

Unnamed: 0_level_0,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
Serial No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
25,336.0,119.0,5.0,4.0,3.5,9.8,1.0,0.97
373,336.0,119.0,4.0,4.5,4.0,9.62,1.0,0.95
386,335.0,117.0,5.0,5.0,5.0,9.82,1.0,0.96


# Multi - Index dataframes

In [380]:
# create dataframe from csv file
df = pd.read_csv("../datasets/census.csv")
df.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
0,40,3,6,1,0,Alabama,Alabama,4779736,4780127,4785161,...,0.002295,-0.193196,0.381066,0.582002,-0.467369,1.030015,0.826644,1.383282,1.724718,0.712594
1,50,3,6,1,1,Alabama,Autauga County,54571,54571,54660,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183193,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
3,50,3,6,1,5,Alabama,Barbour County,27457,27457,27341,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
4,50,3,6,1,7,Alabama,Bibb County,22915,22919,22861,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861


In [381]:
# Create a multi level index (or an index with two columns) based on STNAME and CTYNAME

# create a list of the columns that will be used as index
indx = ['STNAME','CTYNAME']

# Let's have something clear: The following function, which sets the index in the dataframe
# df does NOT modify the original dataframe; returns only a view. Bare this in mind ALWAYS.
df = df.set_index(indx)

# keep specific rows - if we do not do that we get warning 
# 'PerformanceWarning: indexing past lexsort depth may impact performance.'
df=df[df['SUMLEV'] == 50]
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Autauga County,50,3,6,1,1,54571,54571,54660,55253,55175,...,7.242091,-2.915927,-3.012349,2.265971,-2.530799,7.606016,-2.626146,-2.722002,2.59227,-2.187333
Alabama,Baldwin County,50,3,6,1,3,182265,182265,183193,186659,190396,...,14.83296,17.647293,21.845705,19.243287,17.197872,15.844176,18.559627,22.727626,20.317142,18.293499
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
Alabama,Bibb County,50,3,6,1,7,22915,22919,22861,22733,22642,...,-5.527043,-5.068871,-6.201001,-0.177537,0.177258,-5.088389,-4.363636,-5.403729,0.754533,1.107861
Alabama,Blount County,50,3,6,1,9,57322,57322,57373,57711,57776,...,1.807375,-1.177622,-1.748766,-2.062535,-1.36997,1.859511,-0.84858,-1.402476,-1.577232,-0.884411


In [382]:
# and now search for the state of California 
# Careful again df.loc['California'] will result in error since there is no index called 
# California in the original df
# the label [California] is not in the [index]
# df.loc['Alabama']

# We need to search this in the view / modified dataframe
# The following returns all counties of state California
# df.loc['California']

# and if we want one country only
df.loc['California','San Diego County']

SUMLEV                   5.000000e+01
REGION                   4.000000e+00
DIVISION                 9.000000e+00
STATE                    6.000000e+00
COUNTY                   7.300000e+01
CENSUS2010POP            3.095313e+06
ESTIMATESBASE2010        3.095308e+06
POPESTIMATE2010          3.104386e+06
POPESTIMATE2011          3.142256e+06
POPESTIMATE2012          3.184358e+06
POPESTIMATE2013          3.223645e+06
POPESTIMATE2014          3.265700e+06
POPESTIMATE2015          3.299521e+06
NPOPCHG_2010             9.078000e+03
NPOPCHG_2011             3.787000e+04
NPOPCHG_2012             4.210200e+04
NPOPCHG_2013             3.928700e+04
NPOPCHG_2014             4.205500e+04
NPOPCHG_2015             3.382100e+04
BIRTHS2010               1.071900e+04
BIRTHS2011               4.469400e+04
BIRTHS2012               4.384100e+04
BIRTHS2013               4.374900e+04
BIRTHS2014               4.418400e+04
BIRTHS2015               4.456100e+04
DEATHS2010               4.677000e+03
DEATHS2011  

In [401]:
# And not compare side by side (i.e. row to row) the census data for two counties
# using a list of tuples within the loc attribute. Each tuple corresponds to the 
# multi - index.
df.loc[
    # and now the list of tuples
    [('Alabama', 'Barbour County'), ('California', 'San Diego County')]
]

Unnamed: 0_level_0,Unnamed: 1_level_0,SUMLEV,REGION,DIVISION,STATE,COUNTY,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,...,RDOMESTICMIG2011,RDOMESTICMIG2012,RDOMESTICMIG2013,RDOMESTICMIG2014,RDOMESTICMIG2015,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015
STNAME,CTYNAME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Alabama,Barbour County,50,3,6,1,5,27457,27457,27341,27226,27159,...,-4.728132,-2.50069,-7.056824,-3.904217,-10.543299,-4.874741,-2.758113,-7.167664,-3.978583,-10.543299
California,San Diego County,50,4,9,6,73,3095313,3095308,3104386,3142256,3184358,...,0.354751,0.235513,-0.246567,0.657077,-2.855045,4.382515,5.773072,4.74594,5.921091,2.770965


# Missing Values

In [404]:
df = pd.read_csv('../datasets/class_grades.csv')
df.head(10)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89
5,7,95.0,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.0
7,7,72.85,86.85,60.0,,56.11
8,8,84.26,93.1,47.5,18.52,50.83
9,7,90.1,97.55,51.25,88.89,63.61


In [407]:
# A nice thing is that we can generate a mask dataframe using broadcasting. 
# A mask is an object (e.g. Series or dataframe, whose items are booleans - true / false)
mask = df.isnull()
mask.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,True,False,False
3,False,True,True,False,False,False
4,False,False,False,False,False,False


In [409]:
# let's upload a new dataset to see ordering of indices and regexes
df = pd.read_csv('../datasets/log.csv')
df.head(10)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [413]:
# let's assume that we want to order it by time

# first let's use 'time' as index
# remember that the set_index returns a copy of the original dataframe (i.e. is a view)
df_copy = df.set_index('time')
# and now order it
df_copy.sort_index()
df_copy.head()

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974454,cheryl,intro.html,6,,
1469974544,cheryl,intro.html,9,,
1469974574,cheryl,intro.html,10,,
1469977514,bob,intro.html,1,,


In [417]:
# and now lets fill the missing (i.e. NaN) values with ffill => reads the last valid value and
# use it to populate the next missing values.
# In this example we expect the values 'False' and 10.0 to be propagated
df_copy = df_copy.fillna(method='ffill')
df_copy.head()

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974454,cheryl,intro.html,6,False,10.0
1469974544,cheryl,intro.html,9,False,10.0
1469974574,cheryl,intro.html,10,False,10.0
1469977514,bob,intro.html,1,False,10.0


In [422]:
# use of regex to replace values
df_with_regex = df
df_with_regex.head()

# replace html with the name 'web'
pattern = '\w*\.html$'
df_with_regex['video'] = df_with_regex['video'].replace(pattern, 'web', regex=True)
df_with_regex.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,web,5,False,10.0
1,1469974454,cheryl,web,6,,
2,1469974544,cheryl,web,9,,
3,1469974574,cheryl,web,10,,
4,1469977514,bob,web,1,,


In [423]:
# let's manipulate time now

# the column time is in epoch, let's transform it to a timestamp yyyy-mm-dd hh:mm:ss using datetime library
import datetime as dt

In [429]:
# unit denotes whether the epoch is counted in seconds, mseconds, etc.
df_with_regex['time'] = pd.to_datetime(df_with_regex['time'], unit='s')
df_with_regex.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,2016-07-31 14:13:44,cheryl,web,5,False,10.0
1,2016-07-31 14:14:14,cheryl,web,6,,
2,2016-07-31 14:15:44,cheryl,web,9,,
3,2016-07-31 14:16:14,cheryl,web,10,,
4,2016-07-31 15:05:14,bob,web,1,,


In [434]:
# and now let's use apply function to change the value of the user to e.g. Camelcase

# define a function
# the row parameter is a Series object representing a row of the dataframe
def user_to_camelcase(row):
    row['user'] = row['user'].title()
    return row

# Careful: the apply method is used on a dataframe not on a Series object. 
# The following is wrong
# df_with_regex['user'] = df_with_regex['user'].apply(user_to_camelcase, axis='columns')
df_with_regex = df_with_regex.apply(user_to_camelcase, axis='columns')
df_with_regex.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,2016-07-31 14:13:44,Cheryl,web,5,False,10.0
1,2016-07-31 14:14:14,Cheryl,web,6,,
2,2016-07-31 14:15:44,Cheryl,web,9,,
3,2016-07-31 14:16:14,Cheryl,web,10,,
4,2016-07-31 15:05:14,Bob,web,1,,


In [436]:
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj1 = pd.Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj2 = pd.Series(sdata, index=states)
obj3 = pd.isnull(obj2)

In [438]:
import math
math.isnan(obj2['California'])

True

In [440]:
x = obj2['California']
obj2['California'] != x

True

In [441]:
obj3['California']

True

In [1]:
obj2['California'] == None

NameError: name 'obj2' is not defined

In [39]:
import pandas as pd
    
df = pd.read_csv('../../../assignments/assignment2/assets/NISPUF17.csv')
# df['EDUC1'].head()
# df['EDUC1'].unique
s = df['EDUC1'].value_counts() / df['EDUC1'].value_counts().sum()
labels = {
    1:'less than high school',
    2:'high school',
    3:'more than high school but not college',
    4:'college'
}
# s2 = s.sort_index()
# s2
s2 = s.rename(labels)
s2 = s2.sort_values()
s2
# df['EDUC1'].sum()
# s1.to_dict()
# s2.to_dict()

less than high school                    0.102020
high school                              0.172352
more than high school but not college    0.245881
college                                  0.479747
Name: EDUC1, dtype: float64