In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics

In [2]:
# Load the training passengers from train.csv (path is relative to the
# notebook's working directory); later cells clean and model this frame.
titanic=pd.read_csv("train.csv")

In [3]:
# Replace missing ages with the median of the observed ages.
median_age = titanic["Age"].median()
titanic["Age"] = titanic["Age"].fillna(median_age)

In [4]:
# Encode the "male" label as the integer 0 (the "female" label is handled in the next cell).
is_male = titanic["Sex"] == "male"
titanic.loc[is_male, "Sex"] = 0

In [5]:
# Finish the Sex encoding: "female" becomes the integer 1.
is_female = titanic["Sex"] == "female"
titanic.loc[is_female, "Sex"] = 1

# Missing embarkation ports are filled with "S", then every port letter is
# swapped for an integer code (S -> 0, C -> 1, Q -> 2), in that order.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port, "Embarked"] = code

In [6]:
def classification_model(model, data, predictors, outcome):
    """Fit ``model`` and report its training and 5-fold cross-validated accuracy.

    Parameters
    ----------
    model : estimator object
        Must provide ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame
        Frame holding the predictor columns, the outcome column and a
        ``PassengerId`` column (used for the dummy submission file).
    predictors : list of str
        Column names used as features.
    outcome : str
        Name of the target column.

    Side effects
    ------------
    * Prints the training accuracy and the mean cross-validation score.
    * Leaves ``model`` fitted on every row of ``data`` so the caller can keep
      using it (e.g. ``model.coef_`` or ``model.predict``).
    * Writes the training-set predictions to ``kaggle_dummy.csv``.

    Returns
    -------
    None
    """
    features = data[predictors]
    target = data[outcome]

    # Fit on all rows and score on the very same rows: this is the (optimistic)
    # training accuracy, not an estimate of generalisation.
    model.fit(features, target)
    predictions = model.predict(features)
    accuracy = metrics.accuracy_score(target, predictions)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    # 5-fold cross-validation. KFold here is the legacy
    # sklearn.cross_validation class: it takes the number of samples and is
    # iterated directly to obtain (train, test) positional index arrays.
    kf = KFold(data.shape[0], n_folds=5)
    scores = []
    for train, test in kf:
        # Refit on the training fold only, then score on the held-out fold.
        model.fit(features.iloc[train, :], target.iloc[train])
        scores.append(model.score(features.iloc[test, :], target.iloc[test]))

    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(scores)))

    # The loop left the model fitted on the last training fold; refit on all
    # rows so the estimator can be used outside this function.
    model.fit(features, target)

    # Dummy submission built from the training-set predictions. The ids come
    # from ``data`` (not a notebook global) so they always line up with the
    # rows that were actually predicted.
    submission_dummy = pd.DataFrame({
        "PassengerId": data["PassengerId"],
        "Survived": predictions
    })
    submission_dummy.to_csv("kaggle_dummy.csv", index=False)

In [7]:
# Baseline: logistic regression with passenger age as the only feature.
predictor_var = ['Age']
outcome_var = 'Survived'
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 61.616%
Cross-Validation Score : 61.610%


In [8]:
# Logistic regression with the encoded Sex column as the only feature.
predictor_var = ['Sex']
outcome_var = 'Survived'
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 78.676%
Cross-Validation Score : 78.672%


In [9]:
# Logistic regression on three features; the fitted model is inspected in the next cell.
predictor_var = ['Sex',  'Fare', 'Age']
outcome_var = 'Survived'
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 78.339%
Cross-Validation Score : 78.224%


In [10]:
# Show the coefficients of the most recently fitted model (when the cells run
# in order, that is the logistic regression on Sex, Fare and Age).
model.coef_

array([[ 2.31530086,  0.01156839, -0.01149689]])

In [11]:
# Decision tree with default hyper-parameters.
# random_state is pinned so that re-running the cell reproduces the same tree,
# scores and feature importances instead of drifting between runs.
model = DecisionTreeClassifier(random_state=0)
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp']
classification_model(model, titanic, predictor_var, outcome_var)
model.feature_importances_

Accuracy : 97.868%
Cross-Validation Score : 78.458%


array([ 0.31194178,  0.29254832,  0.24140268,  0.10638014,  0.04772708])

In [12]:
# Decision tree with depth and leaf-size limits.
# random_state is pinned so that re-running the cell reproduces the same tree,
# scores and feature importances instead of drifting between runs.
model = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=0,
)
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp']
classification_model(model, titanic, predictor_var, outcome_var)
model.feature_importances_

Accuracy : 85.859%
Cross-Validation Score : 82.828%


array([ 0.53309575,  0.1369727 ,  0.0948489 ,  0.18100364,  0.05407901])

In [36]:
# Random forest on three features. The forest is built from random resamples
# of the data, so random_state is pinned to make the reported scores repeatable.
model = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=20,
    min_samples_leaf=6,
    random_state=0,
)
predictor_var = ['Sex', 'Fare', 'Pclass']
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 84.175%
Cross-Validation Score : 79.688%


In [43]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees on three features; random_state is pinned so the
# fitted model and the reported scores are repeatable across runs.
model = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=6,
    min_samples_leaf=4,
    random_state=0,
)
predictor_var = ['Sex', 'Fare', 'Pclass']
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 89.001%
Cross-Validation Score : 80.698%


In [37]:
# Load the Kaggle test passengers and clean them the same way as the training frame.
titanic_test = pd.read_csv("test.csv")

# Missing numeric values are imputed with medians taken from the TRAINING
# frame, so both sets are transformed with the same statistics.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic["Fare"].median())

# Same integer encoding as the training frame: male -> 0, female -> 1.
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

# Missing ports become "S", then S -> 0, C -> 1, Q -> 2.
titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

IOError: File test.csv does not exist

In [None]:
# Decision-tree submission: evaluate on the training frame, then predict the
# Kaggle test frame and write the result to disk.
# outcome_var is assigned BEFORE the call that consumes it, so the cell does
# not depend on a value left behind by an earlier cell.
outcome_var = 'Survived'
predictor_var = ['Sex', 'Age', 'Fare']
# random_state pins the tree so the submission file is reproducible.
model = DecisionTreeClassifier(random_state=0)
classification_model(model, titanic, predictor_var, outcome_var)

# classification_model leaves `model` fitted on every training row, so it can
# be applied to the test frame directly.
predictions = model.predict(titanic_test[predictor_var])
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
# NOTE: the random-forest submission cell writes to this same file name.
submission.to_csv("kaggle_final_titanic.csv", index=False)



In [None]:
# Random-forest submission: evaluate on the training frame, then predict the
# Kaggle test frame and write the result to disk.
# outcome_var is assigned BEFORE the call that consumes it, so the cell does
# not depend on a value left behind by an earlier cell.
outcome_var = 'Survived'
predictor_var = ['Sex', 'Fare', 'Pclass', 'Age']
# random_state pins the forest's random resampling so the submission file is reproducible.
model = RandomForestClassifier(n_estimators=100, random_state=0)
classification_model(model, titanic, predictor_var, outcome_var)

# classification_model leaves `model` fitted on every training row, so it can
# be applied to the test frame directly.
predictions = model.predict(titanic_test[predictor_var])
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
# NOTE: the decision-tree submission cell writes to this same file name.
submission.to_csv("kaggle_final_titanic.csv", index=False)

In [None]:
try:
    from StringIO import StringIO   # Python 2
except ImportError:
    from io import StringIO         # Python 3
# The notebook's first cell binds `plt` to the top-level matplotlib package,
# which has no imshow/imread; bind it to pyplot for this cell.
import matplotlib.pyplot as plt
import pydot
from sklearn import tree

# Refit the current model on the training frame.
clf = model.fit(titanic[predictor_var], titanic[outcome_var])
if not hasattr(clf, "tree_"):
    # export_graphviz draws a single decision tree. If `model` is an ensemble
    # (e.g. left over from a random-forest cell) it has no `tree_`, so fall
    # back to a plain decision tree trained on the same columns.
    clf = DecisionTreeClassifier(random_state=0).fit(
        titanic[predictor_var], titanic[outcome_var]
    )

# Export the tree as Graphviz dot text into an in-memory buffer.
out = StringIO()
tree.export_graphviz(clf, out_file=out)

# Parse the dot text once, render the first graph to a PNG and show it inline.
graph = pydot.graph_from_dot_data(out.getvalue())
graph[0].write_png("abc.png")
i = plt.imread("abc.png")
plt.imshow(i)
