# Pandas dataframes
The heart of pandas, a DataFrame can be conceptualized as a table with columns and rows or, more generally, as a labelled array with two axes.
 

In [24]:
import numpy as np
import pandas as pd

# A dataframe can be assembled from a list of Series objects,
# and each Series can itself be created from a plain dictionary.
record1 = pd.Series(data={'mrn': '1001', 'metric': 'pain', 'score': 10})
record2 = pd.Series(data={'mrn': '1002', 'metric': 'hoos', 'score': 85.7})
record3 = pd.Series(data={'mrn': '1003', 'metric': 'pain', 'score': 5})

print(record1)

# Row labels are supplied through the `index` argument of the DataFrame
# constructor, one label per record; labels are allowed to repeat.
# Leaving `index` out would fall back to the default integer labels.
df = pd.DataFrame(
    data=[record1, record2, record3],
    index=['Dr_A', 'Dr_X', 'Dr_A'],
)

print(df.head())
# A bare expression on the last line of a cell (no print) renders with notebook styling
df.head()


mrn       1001
metric    pain
score       10
dtype: object
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_X  1002   hoos   85.7
Dr_A  1003   pain    5.0


Unnamed: 0,mrn,metric,score
Dr_A,1001,pain,10.0
Dr_X,1002,hoos,85.7
Dr_A,1003,pain,5.0


In [25]:
# Rows are looked up with the .loc (label-based) and .iloc (integer-position) indexers.
# When the label matches exactly one record, the result comes back as a Series.
single_match = df.loc['Dr_X']
print(single_match)
print(type(single_match))
print('----')
# When the label matches more than one row, the result comes back as a dataframe.
repeated_match = df.loc['Dr_A']
print(repeated_match)
print(type(repeated_match))


mrn       1002
metric    hoos
score     85.7
Name: Dr_X, dtype: object
<class 'pandas.core.series.Series'>
----
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
<class 'pandas.core.frame.DataFrame'>


In [28]:
# .loc can select along both axes in one call:
# the first key picks the row label(s), the second picks the column
row_label, column_label = 'Dr_A', 'score'
df.loc[row_label, column_label]

Dr_A    10.0
Dr_A     5.0
Name: score, dtype: float64

In [34]:
# A single column of data can be selected in a couple of ways.
# To use .loc with one key, first transpose the dataframe with the .T attribute
# (columns become rows), then select the row by its label.
transposed = df.T
print(transposed.loc['metric'])
print('----')
# Using df.loc['metric'] without transposing raises a KeyError,
# because 'metric' is a column name rather than a row label.

# The same effect is achieved by indexing the dataframe with the column name directly
metric_column = df['metric']
print(metric_column)

# the result of this is a Series object
print(type(metric_column))

Dr_A    pain
Dr_X    hoos
Dr_A    pain
Name: metric, dtype: object
----
Dr_A    pain
Dr_X    hoos
Dr_A    pain
Name: metric, dtype: object
<class 'pandas.core.series.Series'>


In [47]:
# Indexing operations can be chained one after another.
# On its own, .loc returns a dataframe holding every row that matches the label
dr_a_rows = df.loc['Dr_A']
print(dr_a_rows)
print(type(dr_a_rows))
print('----')

# Chaining a column name onto that result yields a Series of the chosen column
dr_a_scores = df.loc['Dr_A']['score']
print(dr_a_scores)
print(type(dr_a_scores))

# Chaining is best avoided when a single indexing call can do the job


       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
<class 'pandas.core.frame.DataFrame'>
----
Dr_A    10.0
Dr_A     5.0
Name: score, dtype: float64
<class 'pandas.core.series.Series'>


### Beyond chaining

In [62]:
# An alternative to chaining: .loc accepts two keys in a single call,
#   1. the row selector
#   2. the column selector (a single name or a list of column names)
# Both positions also support slicing.
#
# Only the last expression of a cell is rendered automatically, so the
# intermediate selections below are printed explicitly; otherwise their
# results would be computed and silently discarded.

# A bare slice [:] as the row selector keeps every row.
# Here: the mrn and score columns for all of the doctors.
print(df.loc[:, ['mrn', 'score']])
print('----')

# Same row slice, this time with all three columns listed.
print(df.loc[:, ['mrn', 'metric', 'score']])
print('----')

# The row selector can also be used to limit the rows that are returned,
# for example with a boolean mask that is True only for the rows to keep.
df.loc[df['mrn'].isin(['1002', '1003']), ['mrn', 'metric', 'score']]


Unnamed: 0,mrn,metric,score
Dr_X,1002,hoos,85.7
Dr_A,1003,pain,5.0


## Dropping Data

In [72]:
# drop() needs only a single argument: the row label (index value) to remove.
# It does not change the original dataframe; it returns a copy without the dropped data.
print(df)
print('----')

without_dr_x = df.drop('Dr_X')
print(without_dr_x)
print('----')

# drop() also has two optional keyword arguments:
#    inplace (True/False) - modify the dataframe itself rather than returning a copy
#    axis (0=row, 1=column) - whether the label refers to a row or to a column

# operate on a copy so that the mrn column is removed from df2 only, not from df
df2 = df.copy()
df2.drop('mrn', axis=1, inplace=True)
print(df2)



       mrn metric  score
Dr_A  1001   pain   10.0
Dr_X  1002   hoos   85.7
Dr_A  1003   pain    5.0
----
       mrn metric  score
Dr_A  1001   pain   10.0
Dr_A  1003   pain    5.0
----
     metric  score
Dr_A   pain   10.0
Dr_X   hoos   85.7
Dr_A   pain    5.0
