Simple analysis of the Titanic data-set for the Kaggle competition: https://www.kaggle.com/c/titanic with 3 different algorithms:
- random forest 
- linear regression
- xgboost

**Best Score: 0.78469 ** 

In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.grid_search import GridSearchCV
import warnings
warnings.filterwarnings("ignore")


%matplotlib inline

In [None]:
ls

In [3]:
#reading files downloaded from https://www.kaggle.com/c/titanic/data
#desciption of the files is avaible at the before mentioned adress
test= pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

In [4]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [6]:
########################
# cleaning train dataset
########################


train["Age"] = train["Age"].fillna(train["Age"].median())
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1
train["Sex"]=train["Sex"].astype(float)

# Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna("S")

# Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2

train.Cabin=train.Cabin.fillna("0")
train.Cabin=train.Cabin.str[:1]

train.Cabin[train.Cabin=="A"]=1
train.Cabin[train.Cabin=="B"]=2
train.Cabin[train.Cabin=="C"]=3
train.Cabin[train.Cabin=="D"]=4
train.Cabin[train.Cabin=="E"]=5
train.Cabin[train.Cabin=="F"]=6
train.Cabin[train.Cabin=="G"]=7
train.Cabin[train.Cabin=="T"]=8
train.Cabin[train.Cabin=="0"]=0
train.Cabin=train.Cabin.astype(float)



train["family_size"] = train["SibSp"] + train["Parch"] + 1


In [7]:
###############################################################
#   cleaning test data and formatting strings to numbers     #
###############################################################

#print(test[test.Fare.isnull()==True].index[0])

test["Age"] = test["Age"].fillna(test["Age"].median())
test["Sex"][test["Sex"] == "male"] = 0
test["Sex"][test["Sex"] == "female"] = 1
test["Embarked"] = test["Embarked"].fillna("S")
test["Embarked"][test["Embarked"] == "S"] = 0
test["Embarked"][test["Embarked"] == "C"] = 1
test["Embarked"][test["Embarked"] == "Q"] = 2

test["Fare"][test[test.Fare.isnull()==True].index[0]] = test.Fare.median()

test.Cabin=test.Cabin.fillna("0")
test.Cabin=test.Cabin.str[:1]

test.Cabin[test.Cabin=="A"]=1
test.Cabin[test.Cabin=="B"]=2
test.Cabin[test.Cabin=="C"]=3
test.Cabin[test.Cabin=="D"]=4
test.Cabin[test.Cabin=="E"]=5
test.Cabin[test.Cabin=="F"]=6
test.Cabin[test.Cabin=="G"]=7
test.Cabin[test.Cabin=="T"]=8
test.Cabin[test.Cabin=="0"]=0
test.Cabin=test.Cabin.astype(float)

test["family_size"] = test["SibSp"] + test["Parch"] + 1

In [8]:
###################################################################
# Creating target and features numpy arrays: target, features_one #
###################################################################
list_columns= ["Pclass", "Sex", "Age", "family_size", "Cabin", "Fare", "SibSp", "Parch", "Embarked"]

target_dtc = train["Survived"].values
features_dtc = train[list_columns].values

test_features = test[list_columns].values
PassengerId =np.array(test["PassengerId"]).astype(int)

In [10]:
features_dtc

array([[3, 0.0, 22.0, ..., 1, 0, 0],
       [1, 1.0, 38.0, ..., 1, 0, 1],
       [3, 1.0, 26.0, ..., 0, 0, 0],
       ..., 
       [3, 1.0, 28.0, ..., 1, 2, 0],
       [1, 0.0, 26.0, ..., 0, 0, 1],
       [3, 0.0, 32.0, ..., 0, 0, 2]], dtype=object)

In [None]:
###############################################################
#               Random Forest Classifier                      #
###############################################################

decision_rf = RandomForestClassifier(n_estimators=100,
                                     min_samples_leaf=5,
                                     max_features='auto',
                                     oob_score=True,
                                     random_state=42,
                                     n_jobs=-1
                                     )


fitting_rf = decision_rf.fit(features_dtc, target_dtc)

# Look at the importance and score of the included features
feature_importances = pd.Series(decision_rf.feature_importances_, index=list_columns)
feature_importances.sort()
feature_importances.plot(kind="barh")
plt.show()

#print("RF feature importance")
#for index,column in enumerate(list_columns):
#    print(column + ": " + str(decision_rf.feature_importances_[index]))

In [None]:
#score of the training dataset
print("score of the Random Forest on the training dataset is " + str(decision_rf.score(features_dtc, target_dtc)))

prediction_rf = decision_rf.predict(test_features)
solution_rf = pd.DataFrame(prediction_rf, PassengerId, columns = ["Survived"])
solution_rf.to_csv("solution_rf.csv", index_label = ["PassengerId"])

This solution had a score on kaggle's titanic submission: **  0.78469 **

In [None]:
###############################################################
#               Logistic Regression                           #
###############################################################

result=[]
c_value =[0.01,0.1,1,10,1e2,1e3]
for value in c_value:
    decision_lr = LogisticRegression(C=value
                                     ,random_state=0
                                     ,solver='liblinear'
                                    )

    fitting_lr = decision_lr.fit(features_dtc, target_dtc)
    result.append(decision_lr.score(features_dtc, target_dtc))

results = pd.Series(result, index=c_value).plot(logx=True)
plt.show()


In [None]:
#score of the training dataset

print("score of the Logistic regression on the training dataset is " + str(decision_lr.score(features_dtc, target_dtc)))

prediction_rf = decision_lr.predict(test_features)
solution_rf = pd.DataFrame(prediction_rf, PassengerId, columns = ["Survived"])
solution_rf.to_csv("solution_logistic_regression.csv", index_label = ["PassengerId"])


This solution had a score on kaggle's titanic submission: **  0.75120 **


In [None]:
## XGBOOST ##
xgb_model = xgb.XGBClassifier()
                         
xgb_model.fit(features_dtc, target_dtc)

In [None]:
# Look at the importance and score of the included features
#feature_importances = pd.Series(xgb_model.feature_importances_, index=list_columns)
#feature_importances.sort()
#feature_importances.plot(kind="barh")
#plt.show()

In [None]:
print("score of the XGB model on the training dataset is " + str(xgb_model.score(features_dtc, target_dtc)))


prediction_xgb = xgb_model.predict(test_features)
solution_rf = pd.DataFrame(prediction_xgb, PassengerId, columns = ["Survived"])
solution_rf.to_csv("solution_xbg_model.csv", index_label = ["PassengerId"])


This solution had a score on kaggle's titanic submission: **  0.76077 **

In [None]:
## XGBOOST  OPTIMIZATION##


cv_params = {'max_depth': [1,3,5]
             ,'min_child_weight': [1,3,6]
             ,'n_estimators': [10, 100, 1000]
             ,'learning_rate' : [0.01, 0.05,0.1,0.5]
             }

ind_params ={'seed':42, 
             #'learning_rate': 0.05, 
             'subsample': 0.6, 
             'colsample_bytree': 0.6, 
             'objective': 'binary:logistic'
             #,'n_estimators' : 1000
             ,'max_depth': 5
             #,'min_child_weight': 3
            }   

optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1) 


In [None]:
optimized_GBM.fit(features_dtc, target_dtc)


In [None]:
print(optimized_GBM.best_score_)


In [None]:
print(optimized_GBM.best_params_)

In [None]:
## XGBOOST ##
xgb_model_opt = xgb.XGBClassifier(seed=42 
                                 ,learning_rate= 0.05 
                                 ,subsample= 0.8
                                 ,colsample_bytree= 0.8 
                                 ,objective = 'binary:logistic'
                                 ,n_estimators = 1000
                                 ,max_depth = 3
                                 ,min_child_weight= 2
                                 )
                         
xgb_model_opt.fit(features_dtc, target_dtc)

In [None]:
print("score of the optimized XGB model on the training dataset is " + str(xgb_model_opt.score(features_dtc, target_dtc)))


prediction_xgb = xgb_model_opt.predict(test_features)
solution_rf = pd.DataFrame(prediction_xgb, PassengerId, columns = ["Survived"])
solution_rf.to_csv("solution_xbg_opt_model.csv", index_label = ["PassengerId"])

This solution had a score on kaggle's titanic submission: ** 0.68421 **
