In [2]:
import pandas as pd

In [3]:
data = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_ids = test["PassengerId"]

In [4]:
def clean(data):
    data = data.drop(["Ticket", "Cabin","Name","PassengerId"], axis=1)

    cols = ["SibSp","Parch","Fare","Age"]
    for col in cols:
        data[col].fillna(data[col].median(), inplace=True)

    data.Embarked.fillna("U",inplace=True)
    return data

data = clean(data)
test = clean(test)

In [5]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

cols = ["Sex", "Embarked"]

for col in cols:
    data[col] = le.fit_transform(data[col])
    test[col] = le.transform(test[col])
    print(le.classes_)

data.head()

['female' 'male']
['C' 'Q' 'S' 'U']


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [7]:
from random import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

y = data["Survived"]

X = data.drop("Survived", axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train, y_train)

In [9]:
preds = clf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
accuracy_score(y_test, preds)

0.8100558659217877

In [13]:
confusion_matrix(y_test, preds)

array([[90, 15],
       [19, 55]])

In [14]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [16]:
clf.predict_proba(X_test)[0:10]

array([[0.88720817, 0.11279183],
       [0.77782765, 0.22217235],
       [0.86481186, 0.13518814],
       [0.11394579, 0.88605421],
       [0.25654824, 0.74345176],
       [0.06891557, 0.93108443],
       [0.3293608 , 0.6706392 ],
       [0.9086633 , 0.0913367 ],
       [0.25422371, 0.74577629],
       [0.07895431, 0.92104569]])

In [27]:
submission_preds = clf.predict(test)

In [29]:
df = pd.DataFrame({"PassengerId": test_ids.values,
"Survived": submission_preds,
})

In [30]:
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [31]:
df.to_csv("submission.csv", index=False)
df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


## XGBoost

In [18]:
!pip install xgboost
from xgboost import XGBClassifier



In [22]:
from sklearn.model_selection import GridSearchCV, cross_val_score

In [19]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [20]:
preds_xgb = xgb_model.predict(X_test)
accuracy_score(y_test, preds_xgb)

0.7932960893854749

In [21]:
xgb_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_samples_split": [2,5,10]}

In [23]:
xgb = XGBClassifier()

xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 10, n_jobs = -1, verbose = 2)

In [24]:
xgb_cv_model.fit(X_train, y_train)

Fitting 10 folds for each of 576 candidates, totalling 5760 fits
Parameters: { "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an

In [25]:
xgb_cv_model.best_params_

{'learning_rate': 0.02,
 'max_depth': 6,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 0.6}

In [26]:
xgb = XGBClassifier(learning_rate=0.02,
                    max_depth=6,
                    min_samples_split=2,
                    n_estimators=100,
                    subsample=0.6)

In [27]:
xgb_tuned =  xgb.fit(X_train, y_train)

Parameters: { "min_samples_split" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [28]:
preds2 = xgb_tuned.predict(X_test)
accuracy_score(y_test, preds2)

0.8044692737430168