In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
train = pd.read_csv('train_clean.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,total_fam,travel_companion
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,With Family
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,With Family
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,With Family
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,


In [4]:
train['Sex'] = train['Sex'].astype('category')
train['Pclass'] = train['Pclass'].astype('category')
train['Embarked'] = train['Embarked'].astype('category')
train['travel_companion'] = train['travel_companion'].astype('category')

In [5]:
train = pd.get_dummies(train, drop_first=True, columns=['Sex','Pclass','Embarked'])

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,total_fam,travel_companion,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,1,With Family,1,0,1,0,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,1,With Family,0,0,0,0,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,0,,0,0,1,0,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,1,With Family,0,0,0,0,1
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,0,,1,0,1,0,1


In [7]:
train.drop(['PassengerId','Name','Ticket','Cabin','travel_companion','total_fam'], 1, inplace=True)

In [8]:
X = train.drop('Survived', 1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [9]:
c_space = np.linspace(0.0001, 20, 20)
param_grid1 = {'C': c_space} 

In [10]:
svc = SVC()

svc_rs = GridSearchCV(svc, param_grid1, cv=3,scoring='roc_auc')

svc_rs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': array([  1.00000e-04,   1.05273e+00,   2.10535e+00,   3.15798e+00,
         4.21061e+00,   5.26323e+00,   6.31586e+00,   7.36848e+00,
         8.42111e+00,   9.47374e+00,   1.05264e+01,   1.15790e+01,
         1.26316e+01,   1.36842e+01,   1.47369e+01,   1.57895e+01,
         1.68421e+01,   1.78947e+01,   1.89474e+01,   2.00000e+01])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=0)

In [11]:
pred2 = svc_rs.predict(X_test)
svc_rs.score(X_test, y_test)

0.78525909776957914

In [12]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.77      0.81      0.79       134
          1       0.68      0.63      0.65        89

avg / total       0.73      0.74      0.73       223



In [13]:
confusion_matrix(y_test, pred2)

array([[108,  26],
       [ 33,  56]], dtype=int64)

In [14]:
roc_auc_score(y_test, pred2)

0.71759181619989942

In [15]:
max_depth = np.linspace(3,15)
param_grid2 = {'max_depth': max_depth, 'min_samples_split':[5,10,15], 'min_samples_leaf':[5,10,15], 'class_weight':[{0:.35,1:.65},{0:.4,1:.6},{0:.37,1:.63}]} 

In [16]:
rf = RandomForestClassifier(n_estimators=1000)

rf_rs = RandomizedSearchCV(rf, param_grid2, n_iter=25, cv=3, scoring='roc_auc')

rf_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'max_depth': array([  3.     ,   3.2449 ,   3.4898 ,   3.73469,   3.97959,   4.22449,
         4.46939,   4.71429,   4.95918,   5.20408,   5.44898,   5.69388,
         5.93878,   6.18367,   6.42857,   6.67347,   6.91837,   7.16327,
         7.40816,   7.65306,   7.89796,   8.142...ples_leaf': [5, 10, 15], 'class_weight': [{0: 0.35, 1: 0.65}, {0: 0.4, 1: 0.6}, {0: 0.37, 1: 0.63}]},
          pre_dispatch='2*n_jobs', ra

In [17]:
pred2 = rf_rs.predict(X_test)
rf_rs.score(X_test, y_test)

0.88617306724802958

In [18]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.87      0.82      0.85       134
          1       0.75      0.82      0.78        89

avg / total       0.82      0.82      0.82       223



In [19]:
confusion_matrix(y_test, pred2)

array([[110,  24],
       [ 16,  73]], dtype=int64)

In [20]:
roc_auc_score(y_test, pred2)

0.82056012074459173

In [21]:
rf_rs.best_params_

{'class_weight': {0: 0.35, 1: 0.65},
 'max_depth': 10.346938775510203,
 'min_samples_leaf': 5,
 'min_samples_split': 10}

In [32]:
test = pd.read_csv('test_clean.csv')

In [33]:
ids = test['PassengerId']

In [34]:
test['Sex'] = test['Sex'].astype('category')
test['Pclass'] = test['Pclass'].astype('category')
test['Embarked'] = test['Embarked'].astype('category')
test['travel_companion'] = test['travel_companion'].astype('category')

In [35]:
test = pd.get_dummies(test, drop_first=True, columns=['Sex','Pclass','Embarked'])

In [36]:
test.drop(['PassengerId','Name','Ticket','Cabin','travel_companion','total_fam'], 1, inplace=True)

In [37]:
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,34.5,0,0,7.8292,1,0,1,1,0
1,47.0,1,0,7.0,0,0,1,0,1
2,62.0,0,0,9.6875,1,1,0,1,0
3,27.0,0,0,8.6625,1,0,1,0,1
4,22.0,1,1,12.2875,0,0,1,0,1


In [38]:
pred3 = rf_rs.predict(test)

In [39]:
prob_dict = {'PassengerId':ids, 'Survived':pred3}
submission = pd.DataFrame(prob_dict)

In [40]:
submission.to_csv('basic_rf_submission4.csv', index=False)

In [41]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
