In [113]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [114]:
import os
TITANIC_PATH = os.path.join("datasets", "titanic")
def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

Wczytywanie danych

In [115]:
train_data = load_titanic_data("train.csv")
test_data = load_titanic_data("test.csv")

Sprawdzanie otrzymanych danych

In [116]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [117]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [118]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [119]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [120]:
train_data['Survived'].value_counts()/(len(train_data))

0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [121]:
X_train = train_data.drop(['Survived'], axis = 1)
y_train = train_data['Survived'].values

In [122]:
print(train_data.groupby(['Pclass', 'Sex']).median()['Age'])

Pclass  Sex   
1       female    35.0
        male      40.0
2       female    28.0
        male      30.0
3       female    21.5
        male      25.0
Name: Age, dtype: float64


In [123]:
X_train['Age'] = train_data.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x : x.fillna(x.mean()))
test_data['Age'] = test_data.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x : x.fillna(x.mean()))

In [124]:
train_data['Ticket'].value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

In [125]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [126]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


num_pipeline = Pipeline([
        ("select_numeric", DataFrameSelector(["Age","SibSp","Parch","Fare"])),
        ("imputer", SimpleImputer(strategy="median")),
    ])

In [127]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [128]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
        ("select_cat", DataFrameSelector(["Pclass","Sex","Embarked","Ticket"])),
        ("imputer", MostFrequentImputer()),
        ("cat_encoder", OneHotEncoder(sparse=False, handle_unknown = 'ignore')),
    ])

In [129]:
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

In [130]:
preprocess_pipeline.fit_transform(X_train)

array([[22.  ,  1.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [38.  ,  1.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [26.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [21.75,  1.  ,  2.  , ...,  0.  ,  0.  ,  0.  ],
       [26.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [32.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ]])

In [131]:
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits = 10, random_state= 42, shuffle= True)

Testowanie modeli

In [132]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [133]:
pipe_log = Pipeline([
  ('preprocessing', preprocess_pipeline), 
  ('scaler', StandardScaler()),
  ('classifier', LogisticRegression(C=1, solver='newton-cg', max_iter = 5000))
])


param_grid_log = {
  'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
  'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100]
}

log_clf = GridSearchCV(pipe_log, param_grid_log, cv=kfold)

log_clf.fit(X_train, y_train)
log_clf.best_params_
#{'classifier__C': 1, 'classifier__solver': 'sag'}

{'classifier__C': 0.1, 'classifier__solver': 'newton-cg'}

In [134]:
pred = log_clf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('predictions/pred_log_clf.csv', index = False)

In [135]:
pipe_lin = Pipeline([
  ('preprocessing', preprocess_pipeline), 
  ('scaler', StandardScaler()),
  ('classifier', SVC(kernel = 'linear'))
])

param_grid_lin = {
  'classifier__C': np.logspace(-8, 2, 11),
  'classifier__gamma': np.logspace(-25, -15, 11)
}

lin_clf = GridSearchCV(pipe_lin, param_grid_lin, cv=kfold, verbose= 2)

lin_clf.fit(X_train, y_train)
lin_clf.best_params_
#{'classifier__C': 0.1, 'classifier__gamma': 1e-25}

Fitting 10 folds for each of 121 candidates, totalling 1210 fits
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-25; total time=   0.0s
[CV] END .......classifier__C=1e-08, classifier__gamma=1e-24; total time=   0.0s
[CV] END .......classifier__C=1e-08, classif

{'classifier__C': 0.01, 'classifier__gamma': 1e-25}

In [136]:
pred = lin_clf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('predictions/pred_lin_clf.csv', index = False)

In [137]:
pipe_rbf = Pipeline([
  ('preprocessing', preprocess_pipeline), 
  ('scaler', StandardScaler()),
  ('classifier', SVC(kernel = 'rbf')),
])

param_grid_rbf = {
  'classifier__C': np.logspace(-5, 5, 11),
  'classifier__gamma': np.logspace(-5, 5, 11),
}

rbf_clf = GridSearchCV(pipe_rbf, param_grid_rbf, cv=kfold, verbose= 2)

rbf_clf.fit(X_train, y_train)
rbf_clf.best_params_
#{'classifier__C': 100.0, 'classifier__gamma': 0.0001}

Fitting 10 folds for each of 121 candidates, totalling 1210 fits
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END .......classifier__C=1e-05, classifier__gamma=1e-05; total time=   0.0s
[CV] END ......classifier__C=1e-05, classifier__gamma=0.0001; total time=   0.0s
[CV] END ......classifier__C=1e-05, classifi

{'classifier__C': 10000.0, 'classifier__gamma': 1e-05}

In [138]:
pred = rbf_clf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('predictions/pred_rbf_clf.csv', index = False)

In [139]:
pipe_poly = Pipeline([
  ('preprocessing', preprocess_pipeline), 
  ('scaler', StandardScaler()),
  ('classifier', SVC(kernel = 'poly')),
])

param_grid_poly = {
  'classifier__gamma': [0.001, 0.01, 0.1, 1],
  'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100,1000, 10000],
  'classifier__degree': [1,2,3,4],
  'classifier__coef0': [0, 1]
}

poly_clf = GridSearchCV(pipe_poly, param_grid_poly, cv=kfold, n_jobs= 4, verbose= 2)

poly_clf.fit(X_train, y_train)
poly_clf.best_params_
#{'classifier__C': 100,
# 'classifier__coef0': 1,
# 'classifier__degree': 2,
# 'classifier__gamma': 0.01}

Fitting 10 folds for each of 256 candidates, totalling 2560 fits


KeyboardInterrupt: 

In [None]:
pred = poly_clf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('predictions/pred_poly_clf.csv', index = False)

In [140]:
pipe_forest = Pipeline([
  ('preprocessing', preprocess_pipeline), 
  ('scaler', StandardScaler()),
  ('classifier', RandomForestClassifier()),
])

param_grid_forest = {
  'classifier__n_estimators': [100, 200, 300, 400], 
  'classifier__max_depth': [4, 8, 12, 16, 20],
  'classifier__max_features': [2, 4, 6, 8, 10]
}

rf_clf = GridSearchCV(pipe_forest, param_grid_forest, cv=kfold, verbose= 2)

rf_clf.fit(X_train, y_train)
rf_clf.best_params_

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__max_features=2, classifier__n_estimators=100; total time=   0.0s
[CV] END classifier__max_depth=4, classifier__m

{'classifier__max_depth': 20,
 'classifier__max_features': 10,
 'classifier__n_estimators': 200}

In [141]:
pred = rf_clf.predict(test_data)
passenger_id = test_data['PassengerId'].values
res = pd.DataFrame({'PassengerId':passenger_id,'Survived':pred})
res.to_csv('predictions/pred_rf_clf.csv', index = False)