# Appendix: How to build a predictive model when two separate datasets (train & test) are given to you in a data analytics competition

Data analytics competitions often provide not a single dataset but two separate datasets (train.csv, test.csv). This Ipython notebook demonstrates how to process two separate datasets (train.csv and test.csv) for model building, validation and evaluation. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#import decisiontreeclassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#import logisticregression classifier
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
#import knn classifier
from sklearn.neighbors import KNeighborsClassifier

#for validating your classification model
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

# feature selection
from sklearn.feature_selection import RFE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Loading data

In [2]:
df=pd.read_csv("data/heartattack_train.csv")
print df.head()

   Age  Marital_Status  Gender  Weight_Category  Cholesterol  \
0   60               2       0                1          150   
1   69               2       1                1          170   
2   52               1       0                0          174   
3   66               2       1                1          169   
4   70               3       0                1          237   

   Stress_Management  Trait_Anxiety 2nd_Heart_Attack  
0                  1             50              Yes  
1                  0             60              Yes  
2                  1             35               No  
3                  0             60              Yes  
4                  0             65              Yes  


In [3]:
df.info()
#no missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 8 columns):
Age                  138 non-null int64
Marital_Status       138 non-null int64
Gender               138 non-null int64
Weight_Category      138 non-null int64
Cholesterol          138 non-null int64
Stress_Management    138 non-null int64
Trait_Anxiety        138 non-null int64
2nd_Heart_Attack     138 non-null object
dtypes: int64(7), object(1)
memory usage: 8.7+ KB


# Data wrangling & ETL: Data cleaningg & transformation

In [4]:
#mappling or replacing
df = df.replace({'2nd_Heart_Attack': 'No'}, {'2nd_Heart_Attack': '0'})
df = df.replace({'2nd_Heart_Attack': 'Yes'}, {'2nd_Heart_Attack': '1'})

In [5]:
df.info()
#2nd heart attack is still treated as object or string

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 8 columns):
Age                  138 non-null int64
Marital_Status       138 non-null int64
Gender               138 non-null int64
Weight_Category      138 non-null int64
Cholesterol          138 non-null int64
Stress_Management    138 non-null int64
Trait_Anxiety        138 non-null int64
2nd_Heart_Attack     138 non-null object
dtypes: int64(7), object(1)
memory usage: 8.7+ KB


In [6]:
#or you can do this to convert object to number
df['2nd_Heart_Attack'] = df['2nd_Heart_Attack'].astype(int)
print df.dtypes

Age                  int64
Marital_Status       int64
Gender               int64
Weight_Category      int64
Cholesterol          int64
Stress_Management    int64
Trait_Anxiety        int64
2nd_Heart_Attack     int32
dtype: object


In [7]:
df = df.convert_objects(convert_numeric=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 8 columns):
Age                  138 non-null int64
Marital_Status       138 non-null int64
Gender               138 non-null int64
Weight_Category      138 non-null int64
Cholesterol          138 non-null int64
Stress_Management    138 non-null int64
Trait_Anxiety        138 non-null int64
2nd_Heart_Attack     138 non-null int32
dtypes: int32(1), int64(7)
memory usage: 8.2 KB


  if __name__ == '__main__':


# Model Building, Validation, Evaluation

In [8]:
# declare X variables and y variable
y = df['2nd_Heart_Attack']
X = df.drop(['2nd_Heart_Attack'], axis=1)

## Decision tree with Split Validation

In [9]:
# evaluate the model by splitting into train (70%) and test sets (30%)
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [10]:
#Model evaluation
# http://scikit-learn.org/stable/modules/model_evaluation.html
print metrics.accuracy_score(y_test, dt.predict(X_test))
print "--------------------------------------------------------"
print metrics.confusion_matrix(y_test, dt.predict(X_test)) 
print "--------------------------------------------------------"
print metrics.classification_report(y_test, dt.predict(X_test))
print "--------------------------------------------------------"
print metrics.roc_auc_score(y_test, dt.predict(X_test))

# y-test is the acual y value in the testing dataset
# dt.predict(X_test) is the y value generated by your model
# If they are same, we can say your model is accurate.

0.928571428571
--------------------------------------------------------
[[18  1]
 [ 2 21]]
--------------------------------------------------------
             precision    recall  f1-score   support

          0       0.90      0.95      0.92        19
          1       0.95      0.91      0.93        23

avg / total       0.93      0.93      0.93        42

--------------------------------------------------------
0.930205949657


> Above, we asked Sklearn to split the original dataset into train dataset (70%) and test dataset (30%).

> What if you are provided with two separate datasets (train, test). Consider the example below.

## When two separate datasets (train, test or validation) are provided

In [11]:
train = pd.read_csv("data/heartattack_train_dataset.csv")
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96 entries, 0 to 95
Data columns (total 8 columns):
Age                  96 non-null int64
Marital_Status       96 non-null int64
Gender               96 non-null int64
Weight_Category      96 non-null int64
Cholesterol          96 non-null int64
Stress_Management    96 non-null int64
Trait_Anxiety        96 non-null int64
2nd_Heart_Attack     96 non-null int64
dtypes: int64(8)
memory usage: 6.1 KB


In [12]:
test = pd.read_csv("data/heartattack_test_dataset.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 8 columns):
Age                  42 non-null int64
Marital_Status       42 non-null int64
Gender               42 non-null int64
Weight_Category      42 non-null int64
Cholesterol          42 non-null int64
Stress_Management    42 non-null int64
Trait_Anxiety        42 non-null int64
2nd_Heart_Attack     42 non-null int64
dtypes: int64(8)
memory usage: 2.7 KB


In [13]:
y_train = train['2nd_Heart_Attack']
y_test = test['2nd_Heart_Attack']
X_train = train.drop(['2nd_Heart_Attack'], axis=1)
X_test = test.drop(['2nd_Heart_Attack'], axis=1)

In [14]:
# Build a decision model with max_depth 3 and min_samples_leaf 5
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
dt2 = DecisionTreeClassifier()
dt2.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [15]:
# Find out the performance of this model & interpret the results
print metrics.accuracy_score(y_test, dt2.predict(X_test))
print "--------------------------------------------------------"
print metrics.confusion_matrix(y_test, dt2.predict(X_test)) 
print "--------------------------------------------------------"
print metrics.classification_report(y_test, dt2.predict(X_test))
print "--------------------------------------------------------"
print metrics.roc_auc_score(y_test, dt2.predict(X_test))

0.928571428571
--------------------------------------------------------
[[18  1]
 [ 2 21]]
--------------------------------------------------------
             precision    recall  f1-score   support

          0       0.90      0.95      0.92        19
          1       0.95      0.91      0.93        23

avg / total       0.93      0.93      0.93        42

--------------------------------------------------------
0.930205949657


# Model Deployement: Make Predictions on the new dataset (scoring dataset)

In [16]:
# scoring dataset
score=pd.read_csv("data/heartattack_scoring.csv")
score.head(2)
#no Y value in this dataset ... 
#we are trying to predict whether the people in this scoring dataset are likely to have 2nd heart attack or not.

Unnamed: 0,Age,Marital_Status,Gender,Weight_Category,Cholesterol,Stress_Management,Trait_Anxiety
0,61,0,1,1,139,1,50
1,55,2,1,2,163,0,40


In [17]:
score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 7 columns):
Age                  690 non-null int64
Marital_Status       690 non-null int64
Gender               690 non-null int64
Weight_Category      690 non-null int64
Cholesterol          690 non-null int64
Stress_Management    690 non-null int64
Trait_Anxiety        690 non-null int64
dtypes: int64(7)
memory usage: 37.8 KB


In [18]:
predictedY = dt2.predict(score)
print predictedY

[0 1 0 1 1 0 0 0 1 0 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 1 1 1 0 1 0 0 0 0 1 1
 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 0 1 0 0 0 1 1
 0 1 0 0 0 0 1 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 0 0
 1 0 1 0 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 0 0 0
 1 0 0 1 1 1 0 1 1 0 1 1 1 0 0 1 0 0 1 1 0 1 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1
 0 1 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 1 1 1 0 1
 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 1 0
 0 0 1 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1
 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1 1 0 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0
 1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 1 1 0 0 0 0
 1 1 0 0 0 1 1 1 0 1 0 1 1 1 0 0 0 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0
 1 1 1 0 1 1 0 1 0 1 1 0 

In [19]:
#combine the predicted Y value with the scoring dataset
predictedY = pd.DataFrame(predictedY, columns=['predicted Y'])
predictedY.head()

Unnamed: 0,predicted Y
0,0
1,1
2,0
3,1
4,1


In [20]:
#finally ...
data1 = score.join(predictedY) 
data1.head()

Unnamed: 0,Age,Marital_Status,Gender,Weight_Category,Cholesterol,Stress_Management,Trait_Anxiety,predicted Y
0,61,0,1,1,139,1,50,0
1,55,2,1,2,163,0,40,1
2,53,1,1,1,172,0,55,0
3,58,1,1,2,206,0,70,1
4,62,2,1,1,148,1,50,1


2nd person (a 55 years old male) is likely to have 2nd heart attack

In [21]:
data1.to_csv("data/output_risky_patients2.csv")