# Exercise 1 in MSDS - Using pandas and a Bit of numPy
### Cory Nichols
##### notebooks allow us to interact with web browser and write Python
##### can call with ipython notebook (deprecated) or jupyter notebook
##### julia python and r = JuPyteR

In [2]:
# Jupyter stands for: Julia, Python and R - Julia not ready for use in academic setting as of 04/28/16
# some pure python to load and loop through first five records and header
# Path casing matches the other loaders in this notebook ('data/...') so the
# cell also works on case-sensitive file systems.
with open('data/heart_disease.csv') as fid:
    for idx, row in enumerate(fid):  # number each line, starting at 0 (the header)
        # row keeps its trailing newline; strip it because print adds its own
        print('%d %s' % (idx, row.rstrip('\n')))
        if idx >= 5:  # header + first five records have been shown
            break

0 site,age,is_male,chest_pain,rest_blood_press,cholesterol,high_blood_sugar,rest_ecg,max_heart_rate,exer_angina,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
1 cleve,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
2 cleve,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
3 cleve,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
4 cleve,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
5 cleve,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [4]:
# now with pandas
# pandas dataframes and matrices will be used, matrices are structured representations of data
# represent table of data using a matrix as well
# pandas uses NumPy under the hood
import pandas as pd # this imports the pandas library

df = pd.read_csv('data/heart_disease.csv') # read in the csv file into df, right to left

In [5]:
# lets look at the data, all entries of patients that may or may not have heart disease
# has heart disease, varying levels of heart disease, this is the classifier
# 0-4, 4 being most severe level of heart disease
df.head(10)

Unnamed: 0,site,age,is_male,chest_pain,rest_blood_press,cholesterol,high_blood_sugar,rest_ecg,max_heart_rate,exer_angina,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
0,cleve,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,cleve,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,cleve,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,cleve,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,cleve,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0
5,cleve,56,1,2,120,236,0,0,178,0,0.8,1,0,3,0
6,cleve,62,0,4,140,268,0,2,160,0,3.6,3,2,3,3
7,cleve,57,0,4,120,354,0,0,163,1,0.6,1,0,3,0
8,cleve,63,1,4,130,254,0,2,147,0,1.4,2,1,7,2
9,cleve,53,1,4,140,203,1,2,155,1,3.1,3,0,7,1


In [4]:
# let's get a summary of the variables: df.info() reports, per column, the non-null count and dtype
# columns pandas could not parse as numbers are stored with the generic `object` dtype;
# here only a few columns come back as int64 and the rest come back as object
# many of those object columns really hold numeric measurements and should become
# floats / ints - that conversion is done further down in the notebook
# df.info() prints its report itself and returns None, so it is called directly:
# wrapping it in print only appends a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 15 columns):
site                 920 non-null object
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null object
cholesterol          920 non-null object
high_blood_sugar     920 non-null object
rest_ecg             920 non-null object
max_heart_rate       920 non-null object
exer_angina          920 non-null object
ST_depression        920 non-null object
Peak_ST_seg          920 non-null object
major_vessels        920 non-null object
thal                 920 non-null object
has_heart_disease    920 non-null int64
dtypes: int64(4), object(11)
memory usage: 115.0+ KB
None


In [7]:
# read in data from a sql database
# pd.read_sql loads the result of a SQL query into a DataFrame;
# DataFrame.to_sql goes the other way and writes a DataFrame to a database table
import sqlite3
# open the SQLite database file data/heart_disease_sql
con = sqlite3.connect('data/heart_disease_sql')
try:
    # this overwrites the df loaded from the csv; the open connection is passed in
    df = pd.read_sql('SELECT * FROM heart_disease', con)
finally:
    # the query result is fully loaded into df, so the connection can be released
    con.close()
# df.info() prints the summary itself and returns None - wrapping it in print
# would only add a stray "None" line
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 16 columns):
index                920 non-null int64
site                 920 non-null object
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null object
cholesterol          920 non-null object
high_blood_sugar     920 non-null object
rest_ecg             920 non-null object
max_heart_rate       920 non-null object
exer_angina          920 non-null object
ST_depression        920 non-null object
Peak_ST_seg          920 non-null object
major_vessels        920 non-null object
thal                 920 non-null object
has_heart_disease    920 non-null int64
dtypes: int64(5), object(11)
memory usage: 122.2+ KB
None


Unnamed: 0,index,site,age,is_male,chest_pain,rest_blood_press,cholesterol,high_blood_sugar,rest_ecg,max_heart_rate,exer_angina,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
0,0,cleve,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,1,cleve,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,2,cleve,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,3,cleve,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,4,cleve,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [8]:
# one column: bracket lookup by name and attribute (dot) access select the same column
print(df['age'])
print(df.age)
# several columns: index with a list of names to get a narrower DataFrame
our_vars = ['age', 'is_male', 'chest_pain']
df[our_vars]

0      63
1      67
2      67
3      37
4      41
5      56
6      62
7      57
8      63
9      53
10     57
11     56
12     56
13     44
14     52
15     57
16     48
17     54
18     48
19     49
20     64
21     58
22     58
23     58
24     60
25     50
26     58
27     66
28     43
29     40
       ..
890    52
891    53
892    53
893    54
894    55
895    55
896    55
897    56
898    56
899    56
900    58
901    59
902    59
903    65
904    66
905    41
906    43
907    44
908    47
909    47
910    49
911    49
912    50
913    50
914    52
915    52
916    54
917    56
918    58
919    65
Name: age, dtype: int64
0      63
1      67
2      67
3      37
4      41
5      56
6      62
7      57
8      63
9      53
10     57
11     56
12     56
13     44
14     52
15     57
16     48
17     54
18     48
19     49
20     64
21     58
22     58
23     58
24     60
25     50
26     58
27     66
28     43
29     40
       ..
890    52
891    53
892    53
893    54
894    55
895   

Unnamed: 0,age,is_male,chest_pain
0,63,1,1
1,67,1,4
2,67,1,4
3,37,1,3
4,41,0,2
5,56,1,2
6,62,0,4
7,57,0,4
8,63,1,4
9,53,1,4


In [12]:
print(df.age)  # dot notation
print(df['age'])  # index notation
# label and value joined by a single space, as a comma-separated print would do
print('mean age is: %s' % df.age.mean())

0      63
1      67
2      67
3      37
4      41
5      56
6      62
7      57
8      63
9      53
10     57
11     56
12     56
13     44
14     52
15     57
16     48
17     54
18     48
19     49
20     64
21     58
22     58
23     58
24     60
25     50
26     58
27     66
28     43
29     40
       ..
890    52
891    53
892    53
893    54
894    55
895    55
896    55
897    56
898    56
899    56
900    58
901    59
902    59
903    65
904    66
905    41
906    43
907    44
908    47
909    47
910    49
911    49
912    50
913    50
914    52
915    52
916    54
917    56
918    58
919    65
Name: age, dtype: int64
0      63
1      67
2      67
3      37
4      41
5      56
6      62
7      57
8      63
9      53
10     57
11     56
12     56
13     44
14     52
15     57
16     48
17     54
18     48
19     49
20     64
21     58
22     58
23     58
24     60
25     50
26     58
27     66
28     43
29     40
       ..
890    52
891    53
892    53
893    54
894    55
895   

In [10]:
# print summary statistics for the chest_pain variable in the dataframe:
# min, max, median, mean and standard deviation, each with its label
# (the min value used to be printed without a label)
# these need a numeric column - if the variable is object, the median won't calculate
print('min: %s max: %s median: %s mean: %s std: %s' % (
    df.chest_pain.min(),
    df.chest_pain.max(),
    df.chest_pain.median(),
    df.chest_pain.mean(),
    df.chest_pain.std(),
))
df.info()

1 max: 4 median: 4.0 mean: 3.25 std: 0.930968816866
<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 16 columns):
index                920 non-null int64
site                 920 non-null object
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null object
cholesterol          920 non-null object
high_blood_sugar     920 non-null object
rest_ecg             920 non-null object
max_heart_rate       920 non-null object
exer_angina          920 non-null object
ST_depression        920 non-null object
Peak_ST_seg          920 non-null object
major_vessels        920 non-null object
thal                 920 non-null object
has_heart_disease    920 non-null int64
dtypes: int64(5), object(11)
memory usage: 122.2+ KB


In [11]:
# drop the site variable and the index column from the dataframe
# a membership test (`name in df`) tells us whether a column is still present,
# so only columns that exist are deleted and the cell can be re-run safely
dels = ['index', 'site']
for column in dels:
    if column not in df:
        continue  # already gone, nothing to delete
    del df[column]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null object
cholesterol          920 non-null object
high_blood_sugar     920 non-null object
rest_ecg             920 non-null object
max_heart_rate       920 non-null object
exer_angina          920 non-null object
ST_depression        920 non-null object
Peak_ST_seg          920 non-null object
major_vessels        920 non-null object
thal                 920 non-null object
has_heart_disease    920 non-null int64
dtypes: int64(4), object(10)
memory usage: 107.8+ KB


### let's use df.replace and some numpy to transform the data types of variables in our dataframe directly

In [12]:
# encode the variables properly
import numpy as np

# replace ? with -1; replace returns a new dataframe with every '?' swapped for -1
df = df.replace(to_replace='?', value=-1)

# now let's use NumPy dtypes to change the types of the columns
# continuous features: measurements that will be stored as floats
continuous_features = ['rest_blood_press', 'cholesterol', 'max_heart_rate', 'ST_depression']

# ordinal features: values with an ordering that can only take on so many values
# (not infinite), stored as integers
ordinal_features = ['age', 'chest_pain', 'thal', 'rest_ecg', 'Peak_ST_seg', 'major_vessels', 'has_heart_disease']

# categorical features: no ordering, represented as categories / objects
categ_features = ['is_male', 'high_blood_sugar', 'exer_angina']

# astype converts the selected columns to the given dtype; selecting with the lists
# above and assigning back to the same selection saves the converted columns into themselves
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)
# the builtin `object` is what the deprecated alias np.object pointed to;
# the alias was removed in NumPy 1.24, so the builtin is used directly
df[categ_features] = df[categ_features].astype(object)

# let's check to see if our transformations worked:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null int64
is_male              920 non-null object
chest_pain           920 non-null int64
rest_blood_press     920 non-null float64
cholesterol          920 non-null float64
high_blood_sugar     920 non-null object
rest_ecg             920 non-null int64
max_heart_rate       920 non-null float64
exer_angina          920 non-null object
ST_depression        920 non-null float64
Peak_ST_seg          920 non-null int64
major_vessels        920 non-null int64
thal                 920 non-null int64
has_heart_disease    920 non-null int64
dtypes: float64(4), int64(7), object(3)
memory usage: 107.8+ KB


In [27]:
df.head()

Unnamed: 0,age,is_male,chest_pain,rest_blood_press,cholesterol,high_blood_sugar,rest_ecg,max_heart_rate,exer_angina,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,2
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [28]:
# get summary stats for the dataframe by variable:
df.describe()

Unnamed: 0,age,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
count,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0,920.0
mean,53.51087,3.25,123.594565,192.604348,0.601087,129.263043,0.752174,0.840217,-0.436957,1.871739,0.995652
std,9.424685,0.930969,37.484705,114.615011,0.808415,41.376773,1.154353,1.403211,0.959656,3.313649,1.142693
min,28.0,1.0,-1.0,-1.0,-1.0,-1.0,-2.6,-1.0,-1.0,-1.0,0.0
25%,47.0,3.0,120.0,164.0,0.0,115.0,0.0,-1.0,-1.0,-1.0,0.0
50%,54.0,4.0,130.0,221.0,0.0,138.0,0.2,1.0,-1.0,-1.0,1.0
75%,60.0,4.0,140.0,267.0,1.0,156.0,1.5,2.0,0.0,6.0,2.0
max,77.0,4.0,200.0,603.0,2.0,202.0,6.2,3.0,3.0,7.0,4.0


In [14]:
# replace the -1 placeholder with NaN (not a number) so missing values are ignored
# NaN values are not used in statistics calculations, however, they will show up
# as missing (lower non-null counts) in info
df = df.replace(to_replace=-1, value=np.nan)
# builtin `object` replaces the deprecated np.object alias (removed in NumPy 1.24)
df.is_male = df.is_male.astype(object)  # using dot notation

# df.info() prints the summary itself and returns None, so print is not needed
# (it would only add a stray "None" line)
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null int64
is_male              920 non-null object
chest_pain           920 non-null int64
rest_blood_press     861 non-null float64
cholesterol          890 non-null float64
high_blood_sugar     830 non-null object
rest_ecg             918 non-null float64
max_heart_rate       865 non-null float64
exer_angina          865 non-null object
ST_depression        856 non-null float64
Peak_ST_seg          611 non-null float64
major_vessels        309 non-null float64
thal                 434 non-null float64
has_heart_disease    920 non-null int64
dtypes: float64(8), int64(3), object(3)
memory usage: 107.8+ KB
None


Unnamed: 0,age,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
count,920.0,920.0,861.0,890.0,918.0,865.0,856.0,611.0,309.0,434.0,920.0
mean,53.51087,3.25,132.132404,199.130337,0.604575,137.545665,0.883178,1.770867,0.676375,5.087558,0.995652
std,9.424685,0.930969,19.06607,110.78081,0.805827,25.926276,1.088707,0.619256,0.935653,1.919075,1.142693
min,28.0,1.0,0.0,0.0,0.0,60.0,-2.6,1.0,0.0,3.0,0.0
25%,47.0,3.0,120.0,175.0,0.0,120.0,0.0,1.0,0.0,3.0,0.0
50%,54.0,4.0,130.0,223.0,0.0,140.0,0.5,2.0,0.0,6.0,1.0
75%,60.0,4.0,140.0,268.0,1.0,157.0,1.5,2.0,1.0,7.0,2.0
max,77.0,4.0,200.0,603.0,2.0,202.0,6.2,3.0,3.0,7.0,4.0


In [15]:
# for all numeric variables only, calculates median and returns a series or array
# returns a pandas series object, can store it in a variable or pass it in functions
df.median()
# can use this to replace missing values

age                   54.0
is_male                1.0
chest_pain             4.0
rest_blood_press     130.0
cholesterol          223.0
high_blood_sugar       0.0
rest_ecg               0.0
max_heart_rate       140.0
exer_angina            0.0
ST_depression          0.5
Peak_ST_seg            2.0
major_vessels          0.0
thal                   6.0
has_heart_disease      1.0
dtype: float64

In [34]:
# imputation, fill in missing values
# fillna will take series and fill missing values for column related in the series
# in this case, the df.median() series will align directly to the values in the data frame
# and fill nans with the median of the respective attribute value, pretty cool!
# fill nans with median values
# if i have missing values inside of columns, replace nulls with median for that attribute
df_imputed = df.fillna(df.median()) # all values must be numeric, fill all nas for that variable with the median
# for that variable
df_imputed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null float64
cholesterol          920 non-null float64
high_blood_sugar     920 non-null object
rest_ecg             920 non-null float64
max_heart_rate       920 non-null float64
exer_angina          920 non-null object
ST_depression        920 non-null float64
Peak_ST_seg          920 non-null float64
major_vessels        920 non-null float64
thal                 920 non-null float64
has_heart_disease    920 non-null int64
dtypes: float64(8), int64(4), object(2)
memory usage: 107.8+ KB


In [17]:
# let's impute a little better..... use a mix of feature statistics
# continuous features identified above are summarised by their mean
series_mean = df[continuous_features].mean()
# ordinal and categorical features (the two lists concatenated) are summarised by their median;
# for the integer columns this is the real median, and for a two-valued 0/1 column the
# median is the more frequent of the two values
series_median = df[ordinal_features + categ_features].median()
# stack both series into a single one, medians first and means after them
concat_series = pd.concat([series_median, series_mean])
print(concat_series)
# this gives one aggregate value per variable; we will subsequently pass it to
# df.fillna so that nans are filled with a more appropriate value per column

age                   54.000000
chest_pain             4.000000
thal                   6.000000
rest_ecg               0.000000
Peak_ST_seg            2.000000
major_vessels          0.000000
has_heart_disease      1.000000
is_male                1.000000
high_blood_sugar       0.000000
exer_angina            0.000000
rest_blood_press     132.132404
cholesterol          199.130337
max_heart_rate       137.545665
ST_depression          0.883178
dtype: float64


In [18]:
# here, we create a new data frame, df_imputed, whereby we replace the nan values with the concat_series value
# for each respective variable: nans in an ordinal/categorical column such as thal are replaced with that
# column's median, nans in a continuous column such as cholesterol are replaced with that column's mean
df_imputed = df.fillna(value = concat_series)
df_imputed.info()
# summarise the categorical columns of df (the frame before imputation)
df[categ_features].describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null int64
is_male              920 non-null int64
chest_pain           920 non-null int64
rest_blood_press     920 non-null float64
cholesterol          920 non-null float64
high_blood_sugar     920 non-null object
rest_ecg             920 non-null float64
max_heart_rate       920 non-null float64
exer_angina          920 non-null object
ST_depression        920 non-null float64
Peak_ST_seg          920 non-null float64
major_vessels        920 non-null float64
thal                 920 non-null float64
has_heart_disease    920 non-null int64
dtypes: float64(8), int64(4), object(2)
memory usage: 107.8+ KB


Unnamed: 0,is_male,high_blood_sugar,exer_angina
count,920,830,865
unique,2,2,2
top,1,0,0
freq,726,692,528


In [None]:
# however, our categorical, is_male, shows up as an int in df_imputed, let's change it back
# convert df_imputed's own column (not df's) so the fix does not depend on the other frame;
# builtin `object` replaces the deprecated np.object alias (removed in NumPy 1.24)
df_imputed.is_male = df_imputed.is_male.astype(object)
# describe() on the categorical columns shows the count, how many unique values there are,
# the most frequent value (top) and how often it occurs (freq), e.g. 726/920 are male
# print it explicitly - a notebook only displays the value of a cell's last expression
print(df_imputed[categ_features].describe())
df_imputed.head()

In [37]:
# heart disease descriptive data in one line of code, for folks who do NOT have heart disease
# nest the conditional inside of the data frame df[df.attribute conditional].method (or no method to set up new
# filtered df)
# 411 / 920 45% have no heart disease
df_imputed[df_imputed.has_heart_disease == 0].describe()
# now we can see summary statistics for everyone who does not have heart disease

Unnamed: 0,age,is_male,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
count,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411.0,411
mean,50.547445,0.649635,2.761557,130.021042,226.575368,0.547445,148.25283,0.441963,1.729927,0.111922,5.085158,0
std,9.4337,0.477666,0.903425,16.460208,74.301504,0.805204,23.152969,0.704565,0.515662,0.427276,1.510951,0
min,28.0,0.0,1.0,80.0,0.0,0.0,69.0,-1.1,1.0,0.0,3.0,0
25%,43.0,0.0,2.0,120.0,199.130337,0.0,135.5,0.0,1.0,0.0,3.0,0
50%,51.0,1.0,3.0,130.0,225.0,0.0,150.0,0.0,2.0,0.0,6.0,0
75%,57.0,1.0,4.0,140.0,266.0,1.0,165.0,0.883178,2.0,0.0,6.0,0
max,76.0,1.0,4.0,190.0,564.0,2.0,202.0,4.2,3.0,3.0,7.0,0


In [19]:
# group by is like pivoting in microsoft excel
# group rows where has heart disease have similar values (0, 1, 2, 3, 4), get the median for each attribute
# this will set up a dataframe grouped by each level of has_heart_disease with all other attributes' median
# median for every single grouping of the row
df_imputed.groupby(by= 'has_heart_disease').median()
# df_imputed.groupby(by= 'has_heart_disease').mean()
# df_imputed.groupby(by= 'has_heart_disease').std()
# etc....

Unnamed: 0_level_0,age,is_male,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal
has_heart_disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,9.4337,0.477666,0.903425,16.460208,74.301504,0.805204,23.152969,0.704565,0.515662,0.427276,1.510951
1,8.740371,0.317446,0.813023,19.354224,122.515853,0.769041,22.859633,0.998422,0.443071,0.507604,1.118456
2,7.786852,0.289996,0.678664,16.646713,130.589492,0.78769,22.168355,1.1496,0.550847,0.794883,1.227655
3,7.990594,0.264252,0.664621,23.060681,126.262844,0.816209,24.642309,1.247074,0.530668,0.90464,1.191725
4,8.283661,0.262265,0.645497,21.026685,114.455299,0.862965,23.180536,1.309892,0.590937,1.19744,1.020297


In [21]:
# group by has_heart_disease > 0 (in this case all patients who have heart disease will be True, all else False)
# and get the mean across all of the variables
# this will give a boolean grouping back, True or False
df_imputed.groupby(by = df_imputed.has_heart_disease > 0).mean()
# look at only chest pain
#df_imputed.groupby(by=df_imputed.has_heart_disease >0)['chest_pain'].mean()


Unnamed: 0_level_0,age,is_male,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
has_heart_disease,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,50.547445,0.649635,2.761557,130.021042,226.575368,0.547445,148.25283,0.441963,1.729927,0.111922,5.085158,0.0
True,55.903733,0.901768,3.644401,133.837257,176.969418,0.64833,128.899997,1.239443,1.943026,0.320236,5.960707,1.799607


In [23]:
df_imputed.groupby( by = df_imputed.major_vessels>2).mean()
# results in a boolean categorization based on if major_vessels > 2

Unnamed: 0_level_0,age,is_male,chest_pain,rest_blood_press,cholesterol,rest_ecg,max_heart_rate,ST_depression,Peak_ST_seg,major_vessels,thal,has_heart_disease
major_vessels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
False,53.51087,0.78913,3.25,132.132404,199.130337,0.603261,137.545665,0.883178,1.847826,0.227174,5.569565,0.995652


## Calling R from iPython and Using R / Dataframes
### Utilizing magics in iPython
#### call magics directly with %, must be installed in PythonPath

In [24]:
# calling R from python, these are called magics and are loaded with %load_ext in ipython
# these are add ons to iPython
# really only supported well for mac and linux, be careful once you load!
%load_ext rmagic
%load_ext rpy2.ipython 

df_colnames = df_imputed.columns



In [25]:
# python list of all the column names inside of the original data frame
df_colnames

Index([u'age', u'is_male', u'chest_pain', u'rest_blood_press', u'cholesterol',
       u'high_blood_sugar', u'rest_ecg', u'max_heart_rate', u'exer_angina',
       u'ST_depression', u'Peak_ST_seg', u'major_vessels', u'thal',
       u'has_heart_disease'],
      dtype='object')

In [None]:
#### by calling %%R, everything in that block of code will be R code
#### by calling %R, only a single line will be R code
#### -i means input, watch spacing, no spaces allowed after listing first input
#### here we will create our data frame and test to see if it's valid

In [26]:
%%R -i df_imputed,df_colnames 

colnames(df_imputed) <- unlist(df_colnames);
print(is.data.frame(df_imputed))

[1] TRUE


In [None]:
#### the dataframe in pandas is recognized in R, summary in R will work on it natively

In [28]:
%%R -i df_imputed
print(summary(df_imputed))

      age           is_male         chest_pain   rest_blood_press
 Min.   :28.00   Min.   :0.0000   Min.   :1.00   Min.   :  0.0   
 1st Qu.:47.00   1st Qu.:1.0000   1st Qu.:3.00   1st Qu.:120.0   
 Median :54.00   Median :1.0000   Median :4.00   Median :130.0   
 Mean   :53.51   Mean   :0.7891   Mean   :3.25   Mean   :132.1   
 3rd Qu.:60.00   3rd Qu.:1.0000   3rd Qu.:4.00   3rd Qu.:140.0   
 Max.   :77.00   Max.   :1.0000   Max.   :4.00   Max.   :200.0   
  cholesterol    high_blood_sugar    rest_ecg      max_heart_rate  exer_angina
 Min.   :  0.0   0  :692          Min.   :0.0000   Min.   : 60.0   0  :528    
 1st Qu.:177.8   0.0: 90          1st Qu.:0.0000   1st Qu.:120.0   0.0: 55    
 Median :221.0   1  :138          Median :0.0000   Median :138.0   1  :337    
 Mean   :199.1                    Mean   :0.6033   Mean   :137.5              
 3rd Qu.:267.0                    3rd Qu.:1.0000   3rd Qu.:156.0              
 Max.   :603.0                    Max.   :2.0000   Max.   :202.0

In [None]:
#### %R = one line of code written in R

In [29]:
print 'original:', df_imputed.age.head()

# one line of r is %R, multiple is %%R
# set age to age * 2 in the R data frame, hopefully we can pass it back
# However, we cannot: -i only copies df_imputed into the R workspace, so the change
# made in R does not reach the Python object
%R -i df_imputed df_imputed$age <- df_imputed$age*2

# to get the result back, the data must be sent back out into Python with -o (output)
# -i = input, -o = output
%R -i df_imputed -o df_imputed df_imputed$age <- df_imputed$age*2
# -o copies the named R variable back into Python once the line has run, so the Python
# df_imputed now holds the values computed in R (a copy in each direction, not shared memory)
# now back in python
print 'after manipulation on R:', df_imputed.age.head()

original: 0    63
1    67
2    67
3    37
4    41
Name: age, dtype: int64
after manipulation on R: 0    126
1    134
2    134
3     74
4     82
Name: age, dtype: float64


In [30]:
# here is a basic example, where we input df_imputed and create a NEW dataframe out into Python
# that is equal to df_imputed
%R -i df_imputed -o df_from_R df_from_R <- df_imputed

df_from_R.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 920 entries, 0 to 919
Data columns (total 14 columns):
age                  920 non-null float64
is_male              920 non-null int32
chest_pain           920 non-null int32
rest_blood_press     920 non-null float64
cholesterol          920 non-null float64
high_blood_sugar     920 non-null int32
rest_ecg             920 non-null float64
max_heart_rate       920 non-null float64
exer_angina          920 non-null int32
ST_depression        920 non-null float64
Peak_ST_seg          920 non-null float64
major_vessels        920 non-null float64
thal                 920 non-null float64
has_heart_disease    920 non-null int32
dtypes: float64(9), int32(5)
memory usage: 89.8 KB


# THE END, GOOD JOB