In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold   #For K-fold cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
import pydot
from IPython.display import Image

In [2]:
# Load the training CSV into a DataFrame.
titanic = pd.read_csv("train.csv")
# Peek at the first five rows (not rendered: only the last expression of a cell is shown).
titanic.head(5)
# Cell result: the dtype of every column.
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [3]:
# Fill missing ages with the median age computed from this same frame.
titanic["Age"] = titanic["Age"].fillna(value=titanic["Age"].median())

In [4]:
# Recode the "male" label as 0 ("female" is recoded in the next cell).
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0

In [5]:
# Recode the "female" label as 1.
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1

# Missing embarkation ports are filled with "S" before the ports are recoded.
titanic["Embarked"] = titanic["Embarked"].fillna("S")

# Port label -> integer code, applied in the same order as before: S=0, C=1, Q=2.
for port, code in (("S", 0), ("C", 1), ("Q", 2)):
    titanic.loc[titanic["Embarked"] == port, "Embarked"] = code

In [6]:
def classification_model(model, data, predictors, outcome):
    """Fit ``model`` on ``data``, print its scores and write a dummy submission file.

    Steps performed:
      1. Fit on all rows and print the accuracy of predictions on those same rows.
      2. Print the tuple returned by ``metrics.precision_recall_fscore_support``
         for those predictions (binary averaging).
      3. Run a 5-fold cross-validation and print the mean of the fold scores.
      4. Refit on all rows so the caller can keep using ``model`` afterwards.
      5. Write the training-set predictions to ``kaggle_dummy.csv``.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.
    data : pandas.DataFrame holding the predictor columns, the outcome column
        and a ``PassengerId`` column (used for the dummy submission file).
    predictors : list of column names used as features.
    outcome : name of the target column.

    Returns
    -------
    None. ``model`` is left fitted on the full ``data``.
    """
    features = data[predictors]
    target = data[outcome]

    # Fit on every row and predict the same rows (training-set predictions).
    model.fit(features, target)
    predictions = model.predict(features)

    # Training accuracy; arguments are passed as (y_true, y_pred).
    # Single-argument print(...) produces the same text on Python 2 and Python 3.
    accuracy = metrics.accuracy_score(target, predictions)
    print("Accuracy : %s" % "{0:.3%}".format(accuracy))

    precision = metrics.precision_recall_fscore_support(
        target,
        predictions,
        average='binary',
        warn_for=('precision', 'recall', 'f-score'),
    )
    print(precision)

    # 5-fold cross-validation; each fold refits the model on its training rows.
    kf = KFold(data.shape[0], n_folds=5)
    scores = []
    for train, test in kf:
        model.fit(features.iloc[train, :], target.iloc[train])
        # model.score(...) is the value recorded for the held-out fold.
        scores.append(model.score(features.iloc[test, :], target.iloc[test]))

    print("Cross-Validation Score : %s" % "{0:.3%}".format(np.mean(scores)))

    # The last fold left the model trained on a subset; refit on everything
    # so it can be used outside the function.
    model.fit(features, target)

    # Dummy submission built from the training-set predictions. The ids come
    # from ``data`` itself so they always line up with ``predictions``.
    submission_dummy = pd.DataFrame({
        "PassengerId": data["PassengerId"],
        "Survived": predictions
    })
    submission_dummy.to_csv("kaggle_dummy.csv", index=False)

In [10]:
# Baseline: logistic regression with Age as the only predictor.
outcome_var = 'Survived'
predictor_var = ['Age']
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)
# Cell result: shape of the target column.
titanic[outcome_var].shape

Accuracy : 61.616%
(0.0, 0.0, 0.0, None)
Cross-Validation Score : 61.610%


(891,)

In [9]:
# Logistic regression on three predictors: Sex, Fare and Age.
outcome_var = 'Survived'
predictor_var = ['Sex', 'Fare', 'Age']
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 78.339%
(0.73065015479876161, 0.6900584795321637, 0.7097744360902255, None)
Cross-Validation Score : 78.224%


In [10]:
# Same model and predictors as the previous cell (Sex, Fare, Age), run again.
outcome_var = 'Survived'
predictor_var = ['Sex', 'Fare', 'Age']
model = LogisticRegression()
classification_model(model, titanic, predictor_var, outcome_var)

Accuracy : 78.339%
(0.73065015479876161, 0.6900584795321637, 0.7097744360902255, None)
Cross-Validation Score : 78.224%


In [11]:
# Show the fitted coefficients of the current `model` (the logistic regression
# from the previous cell). Presumably one value per entry of predictor_var, in
# the same order - verify against the scikit-learn docs.
model.coef_

array([[ 2.31530086,  0.01156839, -0.01149689]])

In [12]:
# Decision tree left at its default hyper-parameters.
# Compare the printed training accuracy with the cross-validation score
# to see how much the tree overfits.
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp']
model = DecisionTreeClassifier()
classification_model(model, titanic, predictor_var, outcome_var)
# Cell result: importance of each predictor in the fitted tree.
model.feature_importances_

Accuracy : 97.868%
(0.99088145896656532, 0.95321637426900585, 0.97168405365126675, None)
Cross-Validation Score : 78.346%


array([ 0.31194178,  0.28830589,  0.2375211 ,  0.11101817,  0.05121306])

In [13]:
# Tuned decision tree: score it, then render a second identical tree to a PNG.
import sklearn
import sklearn.tree

predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp', 'Embarked']
model = DecisionTreeClassifier(max_depth=6, min_samples_split=10, min_samples_leaf=4)
classification_model(model, titanic, predictor_var, outcome_var)

# Separate estimator with the same settings, fitted on all rows, used for the plot.
clf = DecisionTreeClassifier(max_depth=6, min_samples_split=10, min_samples_leaf=4)
clf.fit(titanic[predictor_var], titanic[outcome_var])
predictions = clf.predict(titanic[predictor_var])

# Export the tree as Graphviz DOT, then convert the DOT file to a PNG with pydot.
with open('tree_2.dot', 'w') as dotfile:
    sklearn.tree.export_graphviz(clf, out_file=dotfile, filled=True, rounded=True)
(graph,) = pydot.graph_from_dot_file('tree_2.dot')
graph.write_png('ans.png')

# Cell result: the rendered tree image.
Image(filename='ans.png')

Accuracy : 85.522%
(0.87632508833922262, 0.72514619883040932, 0.79359999999999997, None)
Cross-Validation Score : 83.166%


NameError: name 'data' is not defined

In [None]:
# Random forest on the six-predictor feature set.
# random_state fixes the forest's internal randomness so the printed scores
# are the same on every Restart & Run All.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=4,
    min_samples_leaf=3,
    random_state=42,
)
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp', 'Embarked']
classification_model(model, titanic, predictor_var, outcome_var)

In [None]:
# Gradient boosting on the five-predictor feature set.
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(
    n_estimators=25,
    min_samples_split=4,
    min_samples_leaf=2,
    max_depth=3,
)
# The list previously contained 'Fare' twice; the duplicate is replaced with
# 'Age' so it matches the five-predictor set used for the default decision tree.
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp']
classification_model(model, titanic, predictor_var, outcome_var)

In [None]:
# Prepare the test set with the same steps as the training set, train the
# tuned decision tree on the training data and write the final submission file.
titanic_test = pd.read_csv("test.csv")

# Impute missing values with statistics from the TRAINING data only, so the
# test set is transformed with values known at training time. Fare now follows
# the same rule as Age instead of using the test set's own median.
titanic_test["Age"] = titanic_test["Age"].fillna(titanic["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic["Fare"].median())

# Same label -> integer recoding as applied to the training frame.
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

titanic_test["Embarked"] = titanic_test["Embarked"].fillna("S")
titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

# Train and score on the training data; classification_model leaves `model`
# fitted on every training row.
model = DecisionTreeClassifier(max_depth=6, min_samples_split=10, min_samples_leaf=4)
predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp', 'Embarked']
outcome_var = 'Survived'
classification_model(model, titanic, predictor_var, outcome_var)

# Predict the test passengers and write the submission CSV.
predictions = model.predict(titanic_test[predictor_var])
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": predictions
})
submission.to_csv("kaggle_final_titanic.csv", index=False)

In [None]:
# Tuned decision tree again, followed by a table of predicted vs. actual labels.
import sklearn
import sklearn.tree

predictor_var = ['Sex', 'Fare', 'Age', 'Pclass', 'SibSp', 'Embarked']
model = DecisionTreeClassifier(max_depth=6, min_samples_split=10, min_samples_leaf=4)
classification_model(model, titanic, predictor_var, outcome_var)

# Separate estimator with the same settings, fitted and evaluated on all training rows.
clf = DecisionTreeClassifier(max_depth=6, min_samples_split=10, min_samples_leaf=4)
clf.fit(titanic[predictor_var], titanic[outcome_var])
predictions = clf.predict(titanic[predictor_var])

# One row per passenger: predicted label, true label and passenger name.
d = {
    'predicted': predictions,
    'actual': titanic['Survived'],
    'name': titanic['Name'],
}
df = pd.DataFrame(data=d)

In [None]:
# Print the name of every passenger whose predicted label differs from the
# actual one, in row order, one name per line.
misclassified_names = df.loc[df['actual'] != df['predicted'], 'name']
for passenger_name in misclassified_names:
    print(passenger_name)
        

In [19]:
# Re-check the column dtypes after preprocessing. In the recorded output, Sex
# and Embarked are still `object`: the earlier .loc assignments replaced the
# labels with integers but did not change the column dtype.
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object