In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
train = pd.read_csv('train_clean.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,total_fam,prefix
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,Mr.
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,Mrs.
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,Miss.
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,Mrs.
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,Mr.


In [4]:
train['Sex'] = train['Sex'].astype('category')
train['Pclass'] = train['Pclass'].astype('category')
train['Embarked'] = train['Embarked'].astype('category')
train['prefix'] = train['prefix'].astype('category')

In [5]:
train = pd.get_dummies(train, drop_first=True, columns=['Sex','Pclass','Embarked','prefix'])

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,total_fam,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S,prefix_Master.,prefix_Miss.,prefix_Mr.,prefix_Mrs.,prefix_Other
0,1,0,"Braund, Mr. Owen Harris",22.0,1,0,A/5 21171,7.25,,2,1,0,1,0,1,0,0,1,0,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,0,PC 17599,71.2833,C85,2,0,0,0,0,0,0,0,0,1,0
2,3,1,"Heikkinen, Miss. Laina",26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,1,0,1,0,1,0,0,0
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,0,113803,53.1,C123,2,0,0,0,0,1,0,0,0,1,0
4,5,0,"Allen, Mr. William Henry",35.0,0,0,373450,8.05,,1,1,0,1,0,1,0,0,1,0,0


In [7]:
train.drop(['PassengerId','Name','Ticket','Cabin','SibSp','Parch'], 1, inplace=True)

In [8]:
X = train.drop('Survived', 1)
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
c_space = np.linspace(0.0001, 10, 50)
param_grid1 = {'C': c_space, 'class_weight':[{0:.35,1:.65},{0:.4,1:.6},{0:.37,1:.63}]} 

In [11]:
svc = SVC(kernel='poly', degree=2)

svc_rs = RandomizedSearchCV(svc, param_grid1, cv=3, scoring='roc_auc', n_iter=12)

svc_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=2, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=12, n_jobs=1,
          param_distributions={'C': array([  1.00000e-04,   2.04180e-01,   4.08259e-01,   6.12339e-01,
         8.16418e-01,   1.02050e+00,   1.22458e+00,   1.42866e+00,
         1.63274e+00,   1.83682e+00,   2.04090e+00,   2.24498e+00,
         2.44906e+00,   2.65313e+00,   2.85721e+00,   3.06129e+00,
      ...92e+00,   1.00000e+01]), 'class_weight': [{0: 0.35, 1: 0.65}, {0: 0.4, 1: 0.6}, {0: 0.37, 1: 0.63}]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=0)

In [12]:
pred2 = svc_rs.predict(X_test)
svc_rs.score(X_test, y_test)

0.85392535392535396

In [13]:
svc_rs.score(X_train, y_train)

0.86109990587602536

In [14]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.84      0.83      0.83       105
          1       0.76      0.77      0.77        74

avg / total       0.80      0.80      0.80       179



In [15]:
confusion_matrix(y_test, pred2)

array([[87, 18],
       [17, 57]], dtype=int64)

In [16]:
roc_auc_score(y_test, pred2)

0.79942084942084934

In [17]:
test = pd.read_csv('test_clean.csv')

In [18]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,total_fam,prefix
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,Mr.
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2,Mrs.
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,Mr.
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,Mr.
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3,Mrs.


In [19]:
ids = test['PassengerId']

In [20]:
test['Sex'] = test['Sex'].astype('category')
test['Pclass'] = test['Pclass'].astype('category')
test['Embarked'] = test['Embarked'].astype('category')
test['prefix'] = test['prefix'].astype('category')

In [21]:
test = pd.get_dummies(test, drop_first=True, columns=['Sex','Pclass','Embarked','prefix'])

In [22]:
test.drop(['PassengerId','Name','Ticket','Cabin','SibSp','Parch'], 1, inplace=True)

In [23]:
pred = svc_rs.predict(test)

In [24]:
prob_dict = {'PassengerId':ids, 'Survived':pred}
submission = pd.DataFrame(prob_dict)

In [25]:
submission.to_csv('basic_svc_submission3.csv', index=False)

In [26]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [27]:
max_depth = np.linspace(3,10)
param_grid2 = {'max_depth': max_depth, 'min_samples_split':[2,3,4], 'min_samples_leaf':[3,4,5], 'class_weight':[{0:.35,1:.65},{0:.4,1:.6},{0:.37,1:.63}]} 

In [28]:
rf = RandomForestClassifier(n_estimators=1000)

rf_rs = RandomizedSearchCV(rf, param_grid2, n_iter=25, cv=3, scoring='roc_auc')

rf_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'max_depth': array([  3.     ,   3.14286,   3.28571,   3.42857,   3.57143,   3.71429,
         3.85714,   4.     ,   4.14286,   4.28571,   4.42857,   4.57143,
         4.71429,   4.85714,   5.     ,   5.14286,   5.28571,   5.42857,
         5.57143,   5.71429,   5.85714,   6.   ...amples_leaf': [3, 4, 5], 'class_weight': [{0: 0.35, 1: 0.65}, {0: 0.4, 1: 0.6}, {0: 0.37, 1: 0.63}]},
          pre_dispatch='2*n_jobs', ra

In [29]:
pred2 = rf_rs.predict(X_test)
rf_rs.score(X_test, y_test)

0.9033462033462033

In [30]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.84      0.88      0.86       105
          1       0.81      0.77      0.79        74

avg / total       0.83      0.83      0.83       179



In [31]:
confusion_matrix(y_test, pred2)

array([[92, 13],
       [17, 57]], dtype=int64)

In [32]:
roc_auc_score(y_test, pred2)

0.82323037323037318

In [33]:
pred4 = rf_rs.predict(test)

In [34]:
prob_dict = {'PassengerId':ids, 'Survived':pred4}
submission = pd.DataFrame(prob_dict)

In [35]:
submission.to_csv('rf_submission3.csv', index=False)

In [36]:
rf_rs.best_estimator_.feature_importances_

array([ 0.1071366 ,  0.16605222,  0.08173436,  0.18479554,  0.02126331,
        0.07558111,  0.00721212,  0.02099803,  0.01326601,  0.0506751 ,
        0.20065312,  0.06597998,  0.00465249])

In [37]:
test.columns

Index(['Age', 'Fare', 'total_fam', 'Sex_male', 'Pclass_2', 'Pclass_3',
       'Embarked_Q', 'Embarked_S', 'prefix_Master.', 'prefix_Miss.',
       'prefix_Mr.', 'prefix_Mrs.', 'prefix_Other'],
      dtype='object')

In [62]:
max_depth = range(3,11)
learning_rate = np.linspace(0.0001, 10, 100)
param_grid3 = {'max_depth': max_depth, 
               'learning_rate':learning_rate, 
               'max_features':['auto','sqrt','log2'],
              'min_samples_split':[2,3,4,5],
              'min_samples_leaf':[1,2,3,4],
              } 

In [63]:
gbc = GradientBoostingClassifier(n_estimators=1000)

gbc_rs = RandomizedSearchCV(gbc, param_grid3, n_iter=25, cv=3, scoring='roc_auc')

gbc_rs.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=1000,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=25, n_jobs=1,
          param_distributions={'max_depth': range(3, 11), 'learning_rate': array([  1.00000e-04,   1.01109e-01, ...,   9.89899e+00,   1.00000e+01]), 'max_features': ['auto', 'sqrt', 'log2'], 'min_samples_split': [2, 3, 4, 5], 'min_samples_leaf': [1, 2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=0)

In [64]:
pred3 = gbc_rs.predict(X_test)
gbc_rs.score(X_test, y_test)

0.89073359073359071

In [68]:
gbc_rs.score(X_train, y_train)

0.99855452467392758

In [65]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.84      0.88      0.86       105
          1       0.81      0.77      0.79        74

avg / total       0.83      0.83      0.83       179



In [66]:
confusion_matrix(y_test, pred2)

array([[92, 13],
       [17, 57]], dtype=int64)

In [67]:
roc_auc_score(y_test, pred2)

0.82323037323037318

In [73]:
pred5 = gbc_rs.predict(test)

In [74]:
prob_dict = {'PassengerId':ids, 'Survived':pred5}
submission = pd.DataFrame(prob_dict)

In [75]:
submission.to_csv('gbc_submission2.csv', index=False)

In [71]:
gbc_rs.best_estimator_.feature_importances_

array([ 0.37924425,  0.45230318,  0.04145502,  0.01945284,  0.01624895,
        0.02293295,  0.00716258,  0.02049109,  0.00186181,  0.00825941,
        0.02283034,  0.00645659,  0.001301  ])

In [72]:
test.columns

Index(['Age', 'Fare', 'total_fam', 'Sex_male', 'Pclass_2', 'Pclass_3',
       'Embarked_Q', 'Embarked_S', 'prefix_Master.', 'prefix_Miss.',
       'prefix_Mr.', 'prefix_Mrs.', 'prefix_Other'],
      dtype='object')

In [76]:
param_grid = {"base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "base_estimator__max_features": ['auto','sqrt','log2'],
              'base_estimator__min_samples_split':[2,3,4,5],
              'base_estimator__min_samples_leaf':[1,2,3,4],
              'base_estimator__class_weight':[{0:.35,1:.65},{0:.4,1:.6},{0:.37,1:.63}]
             }

dtc = DecisionTreeClassifier()

abc = AdaBoostClassifier(base_estimator = dtc, n_estimators=200)
abc_gs = GridSearchCV(abc, param_grid=param_grid, cv=3, scoring = 'roc_auc')
abc_gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=None),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'base_estimator__criterion': ['gini', 'entropy'], 'base_estimator__splitter': ['best', 'random'], 'base_estimator__max_features': ['auto', 'sqrt', 'log2'], 'base_estimator__min_samples_split': [2, 3, 4, 5], 'base_estimator__min_samples_leaf': [1, 2, 3, 4], 'base_estimator__class_weight': [{0: 0.35, 1: 0.65}, {0: 0.4, 1: 0.6}, {0: 0.37, 1: 0.63}]},
       pre_dispatch='2*n_jobs', refit=True, ret

In [77]:
pred6 = abc_gs.predict(X_test)
abc_gs.score(X_test, y_test)

0.89292149292149292

In [78]:
print(classification_report(y_test, pred2))

             precision    recall  f1-score   support

          0       0.84      0.88      0.86       105
          1       0.81      0.77      0.79        74

avg / total       0.83      0.83      0.83       179



In [79]:
confusion_matrix(y_test, pred2)

array([[92, 13],
       [17, 57]], dtype=int64)

In [80]:
roc_auc_score(y_test, pred2)

0.82323037323037318

In [81]:
abc_gs.best_estimator_.feature_importances_

  return self.tree_.compute_feature_importances()


array([             nan,              nan,   1.02548717e-01,
         7.75570837e+08,              nan,  -5.42899587e+08,
         1.55114167e+08,  -2.01648418e+09,  -1.48520830e-02,
         2.44955767e-01,  -8.79135387e-03,  -1.24091334e+09,
         9.94130747e-02])

In [82]:
test.columns

Index(['Age', 'Fare', 'total_fam', 'Sex_male', 'Pclass_2', 'Pclass_3',
       'Embarked_Q', 'Embarked_S', 'prefix_Master.', 'prefix_Miss.',
       'prefix_Mr.', 'prefix_Mrs.', 'prefix_Other'],
      dtype='object')

In [83]:
pred7 = abc_gs.predict(test)

In [84]:
prob_dict = {'PassengerId':ids, 'Survived':pred7}
submission = pd.DataFrame(prob_dict)

In [85]:
submission.to_csv('abc_submission1.csv', index=False)