# Pandas dataframes
the heart of pandas, dataframes can be conceptualized as a table with columns and rows, or more generally as a 2-axes labelled array
 

In [24]:
import numpy as np
import pandas as pd

# one way a dataframe can be constructed is from a list of series
# one way a series can be constructed from a dictionary
record1 = pd.Series({'mrn': '1001', 'metric': 'pain', 'score': 10})
record2 = pd.Series({'mrn': '1002', 'metric': 'hoos', 'score': 85.7})
record3 = pd.Series({'mrn': '1003', 'metric': 'pain', 'score': 5})

print (record1)

# the dataframe index can be passed in using another list as a parameter to the DataFrame function
df = pd.DataFrame ([record1, record2, record3],
                    index=['Dr_A', 'Dr_X', 'Dr_A'])
# df = pd.DataFrame ([record1, record2, record3])

print (df.head())
# without the print command, the dataframe renders with some styling
df.head()


mrn       1001
metric    pain
score       10
dtype: object
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_X  1002   hoos   85.7
Dr_A  1003   pain    5.0


Unnamed: 0,mrn,metric,score
Dr_A,1001,pain,10.0
Dr_X,1002,hoos,85.7
Dr_A,1003,pain,5.0


## Accessing data in the dataframe by loc and iloc

In [25]:
# data in the dataframe can be accessed using the .loc and .iloc parameters (iloc for integer indexes)
# if there is only one record selected, the result will be returned as a series
print(df.loc['Dr_X'])
print(type(df.loc['Dr_X']))
print('----')
# if more than one row is selected, the result will be returned as a dataframe
print(df.loc['Dr_A'])
print(type(df.loc['Dr_A']))


mrn       1002
metric    hoos
score     85.7
Name: Dr_X, dtype: object
<class 'pandas.core.series.Series'>
----
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
<class 'pandas.core.frame.DataFrame'>


In [28]:
# you can select data based on multi axes
# if two parameters are provided to .loc, the first will be the row selection and the second will be the column
df.loc['Dr_A', 'score']

Dr_A    10.0
Dr_A     5.0
Name: score, dtype: float64

In [34]:
# Selecting a single colunn of data can be achieved in a couple of ways
# to use .loc, you can first transpose the dataframe using the T function then select a single row using .loc
print (df.T.loc['metric'])
print ('----')
# if you use the loc attribute for this without transposing, you will get an ugly index error

# However, the same effect can be achieved by passing in the column name to the dataframe directly
print(df['metric'])

#the result of this is a series object
print(type(df['metric']))

Dr_A    pain
Dr_X    hoos
Dr_A    pain
Name: metric, dtype: object
----
Dr_A    pain
Dr_X    hoos
Dr_A    pain
Name: metric, dtype: object
<class 'pandas.core.series.Series'>


In [47]:
# you can chain operations
# the loc produces a dataframe selects all the rows that match the condition
print(df.loc['Dr_A'])
print(type(df.loc['Dr_A']))
print('----')

#chained with the column name it produces a series of the chosen column
print(df.loc['Dr_A']['score'])
print(type(df.loc['Dr_A']['score']))

# it is better to avoid chaining if you can


       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
<class 'pandas.core.frame.DataFrame'>
----
Dr_A    10.0
Dr_A     5.0
Name: score, dtype: float64
<class 'pandas.core.series.Series'>


### beyond chaining

In [62]:
# an alternative to chaining
# the .loc function selects rows and has two parameters
# the row index and the list of column names
# the loc attribute also supports slicing
# here's how to select all of the mrn's and scores from all of the doctors
df.loc[:,['mrn', 'score']]
# the first parameter is a slice of all the rows [:]
df.loc[:,['mrn', 'metric', 'score']]
#to be determined...can you use this first parameter to limit the rows selected
df.loc[:,['mrn', 'metric', 'score']]


Unnamed: 0,mrn,metric,score
Dr_X,1002,hoos,85.7
Dr_A,1003,pain,5.0


## Dropping Data

In [72]:
# the drop function takes a simngle parameter, the row label (index) to drop
# it doesn't actually change the original data frame, it returns a copy of the data frame without the dropped data
print (df)
print ('----')

print(df.drop('Dr_X'))
print ('----')

# drop has two optional parameters
#    inplace (True/False)
#    axis(0=row, 1=column)
# inplace affects the data in the data frame rather than making a copy
# axis specifies row or column

df2 = df.copy()
df2.drop('mrn', inplace=True, axis=1)
print(df2)



       mrn metric  score
Dr_A  1001   pain   10.0
Dr_X  1002   hoos   85.7
Dr_A  1003   pain    5.0
----
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
----
     metric  score
Dr_A   pain   10.0
Dr_X   hoos   85.7
Dr_A   pain    5.0


# Querying Dataframes
## boolean masking
Conceptually, a boolean dataframe is overlaid on another dataframe and any cells with a true result are included in the resulting dataframe

In [6]:
# following the example in pandas_load notebook, open a csv dataset and clean up the column names using a list comprehension
import pandas as pd 
df = pd.read_csv('../resources/week-2/datasets/Admission_Predict.csv')
df.columns = [x.lower().strip() for x in df.columns]
df.head()


Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [9]:
# creating the boolean mask 
# project the chance of admit as values > .7 and broadcast this to the chance of admit column 
admit_mask = df['chance of admit'] > 0.7
admit_mask.head()

0     True
1     True
2     True
3     True
4    False
Name: chance of admit, dtype: bool

In [10]:
# to apply this mask to the original dataframe use the .where function
df.where(admit_mask)

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
1,2.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
2,3.0,316.0,104.0,3.0,3.0,3.5,8.00,1.0,0.72
3,4.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
4,,,,,,,,,
...,...,...,...,...,...,...,...,...,...
395,396.0,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
396,397.0,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
397,398.0,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91
398,,,,,,,,,


In [13]:
# all the data that meets the condition are returned. NaN is returned for data that do not meet the condition
# NaN data can be dropped using the dropna function
df.where(admit_mask).dropna()

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1.0,337.0,118.0,4.0,4.5,4.5,9.65,1.0,0.92
1,2.0,324.0,107.0,4.0,4.0,4.5,8.87,1.0,0.76
2,3.0,316.0,104.0,3.0,3.0,3.5,8.00,1.0,0.72
3,4.0,322.0,110.0,3.0,3.5,2.5,8.67,1.0,0.80
5,6.0,330.0,115.0,5.0,4.5,3.0,9.34,1.0,0.90
...,...,...,...,...,...,...,...,...,...
394,395.0,329.0,111.0,4.0,4.5,4.0,9.23,1.0,0.89
395,396.0,324.0,110.0,3.0,3.5,3.5,9.04,1.0,0.82
396,397.0,325.0,107.0,3.0,3.0,3.5,9.11,1.0,0.84
397,398.0,330.0,116.0,4.0,5.0,4.5,9.45,1.0,0.91


In [15]:
# often times .where().dropna() is not used in favor of a shorthand notation 
# this is probably more difficult to read, but you will encounter this in other's code
df[df['chance of admit'] > 0.7].head()

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
5,6,330,115,5,4.5,3.0,9.34,1,0.9


## multiple boolean masking
'and' and 'or' are not understood by pandas series, so you need to use the '&' and '|' operators to accomplish this 

In [19]:
df[(df['chance of admit'] > 0.7) & (df['chance of admit'] < 0.9)]
# important to note that parentheses around the individual terms you are interested in are required in this formulation
# this is a quirk of python which does not natively know how to do bitwise comparison on a pandas dataframe

# pandas also has some functions that can be used as an alternative to this
# gt() is 'greater than' and lt() is 'less than'
# these functions can be chained, so the statement above can be rewritten as
df[df['chance of admit'].gt(0.7).lt(0.9)]

Unnamed: 0,serial no.,gre score,toefl score,university rating,sop,lor,cgpa,research,chance of admit
4,5,314,103,2,2.0,3.0,8.21,0,0.65
7,8,308,101,2,3.0,4.0,7.90,0,0.68
8,9,302,102,1,2.0,1.5,8.00,0,0.50
9,10,323,108,3,3.5,3.0,8.60,0,0.45
10,11,325,106,3,3.5,4.0,8.40,1,0.52
...,...,...,...,...,...,...,...,...,...
386,387,302,101,2,2.5,3.5,7.96,0,0.46
387,388,307,105,2,2.0,3.5,8.10,0,0.53
388,389,296,97,2,1.5,2.0,7.80,0,0.49
390,391,314,102,2,2.0,2.5,8.24,0,0.64
