In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
import os
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

from scipy.stats.distributions import uniform, randint

from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier


In [11]:
# Read the training and test CSV files from the working directory into
# two DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Print both frames, one after the other, as a quick look at what was loaded.
print(train_data, test_data, sep="\n")

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

In [12]:
# --- Categorical branch -----------------------------------------------------
# Missing entries are replaced by the literal string 'missing' and the result
# is one-hot encoded; categories not seen during fit are ignored at transform
# time (handle_unknown='ignore').
categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline(steps=[
    ('imputer_cat', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# --- Numeric branch ---------------------------------------------------------
# Missing entries are replaced by the column median, then standardized.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer_num', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Route each column group through its branch; every column that is not listed
# in either group is discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('categoricals', categorical_transformer, categorical_features),
        ('numericals', numeric_transformer, numeric_features),
    ],
    remainder='drop',
)

# Cross-validation splitters, both with a fixed seed of 42:
#   rskf - 5 stratified folds repeated 2 times
#   skf  - 5 shuffled stratified folds
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Feature frame (everything except the target column) and the target itself.
X = train_data.drop(columns='Survived')
y = train_data['Survived']


In [13]:
# # rfc
# clf = RandomForestClassifier(
#     bootstrap=False,
#     verbose=0,
#     n_jobs=-1)

# param_distributions = {
#     'clf__n_estimators': randint(10, 400),
#     'clf__max_depth': np.linspace(1, 32, 32, endpoint=True),
#     'clf__min_samples_split': np.linspace(0.1, 1.0, 10, endpoint=True),
#     'clf__min_samples_leaf': np.linspace(0.1, 0.5, 5, endpoint=True)
# }

# param_grid = {
#     'clf__n_estimators': [100, 200],
#     'clf__max_depth': [16],
#     'clf__min_samples_split': [0.5],
#     'clf__min_samples_leaf': [0.3]
# }


# pipeline = Pipeline(
#     [
#         ('preprocessing', preprocessor),
#         ('clf', clf)
#     ]
# )

# cv = RandomizedSearchCV(
#     estimator=pipeline,
#     param_distributions=param_distributions,
#     n_iter=30,
#     scoring = ['f1', 'accuracy'],
#     n_jobs=-1,
#     refit = 'f1',
#     cv=rskf,
#     return_train_score=True
#     )

# # cv = GridSearchCV(
# #     estimator=pipeline,
# #     param_grid=param_grid,
# #     scoring = ['f1', 'accuracy'],
# #     n_jobs=-1,
# #     refit = 'f1',
# #     cv=rskf,
# #     return_train_score=True
# #     )


In [14]:
# Gradient-boosted tree classifier with library-default settings; the
# hyperparameters below are overridden per candidate by the search.
clf = XGBClassifier()

# Sampling distributions for the randomized search. Keys carry the 'clf__'
# prefix so they are routed to the 'clf' step of the pipeline.
# scipy's uniform(loc, scale) samples from [loc, loc + scale];
# randint(low, high) samples integers from [low, high).
param_distributions = dict(
    clf__max_depth=randint(3, 11),
    clf__learning_rate=uniform(0.0001, 0.1),
    clf__n_estimators=randint(50, 1000),
    clf__gamma=uniform(0, 2),
    clf__colsample_bytree=uniform(0.5, 0.4),
    clf__subsample=uniform(0.3, 0.6),
    clf__min_child_weight=randint(1, 4),
)

# Preprocessing followed by the classifier, tuned together as one estimator.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('clf', clf),
])

# Tip: print(pipeline.get_params().keys()) lists every tunable parameter name.

# Randomized search: 30 sampled candidates, evaluated with the repeated
# stratified splitter `rskf`, scored on both F1 and accuracy. The final
# refit (and therefore best_score_ / predict) is driven by F1.
cv = RandomizedSearchCV(
    pipeline,
    param_distributions,
    n_iter=30,
    scoring=['f1', 'accuracy'],
    refit='f1',
    cv=rskf,
    return_train_score=True,
    n_jobs=-1,
)

In [15]:
# param_distributions = {
#     'clf__base_estimator__max_depth': randint(3, 11),
#     'clf__base_estimator__learning_rate': uniform(0.0001, 0.1),
#     'clf__base_estimator__n_estimators': randint(50, 1000),
#     'clf__base_estimator__gamma': uniform(0,2),
#     'clf__base_estimator__colsample_bytree': uniform(0.5, 0.4),
#     'clf__base_estimator__subsample': uniform(0.3, 0.6),
#     'clf__base_estimator__min_child_weight': randint(1, 4),
    
#     'clf__learning_rate': uniform(0.0001, 0.1),
#     'clf__n_estimators': randint(1, 100),
# }

# clf = AdaBoostClassifier(base_estimator=XGBClassifier())

# pipeline = Pipeline(
#     [
#         ('preprocessing', preprocessor),
#         ('clf', clf)
#     ]
# )

# print(pipeline.get_params().keys())

# cv = RandomizedSearchCV(
#     estimator=pipeline,
#     param_distributions=param_distributions,
#     n_iter=10,
#     scoring = ['f1', 'accuracy'],
#     n_jobs=-1,
#     refit = 'f1',
#     cv=rskf,
#     return_train_score=True
#     )

In [16]:
# cv.fit(X, y)

In [17]:
# print(f'Best F1-score: {cv.best_score_:.3f}\n')
# print(f'Best parameter set: {cv.best_params_}\n')
# print(f'Scores: {classification_report(y, cv.predict(X))}')
# print(cv.predict(test_data))

In [18]:
from datetime import datetime

# Repeat the randomized hyperparameter search several times and write a
# submission file for every run whose best cross-validated F1 score beats
# SCORE_THRESHOLD.
#
# Each run gets its own fixed seed (BASE_SEED + run index). Without a seed the
# search draws its candidates from the global NumPy random state, so the
# printed scores - and whether any submission is written at all - could not be
# reproduced after "Restart Kernel -> Run All". Using a different seed per run
# keeps the runs distinct from each other.
# NOTE(review): only the candidate sampling of the search is seeded here; any
# randomness inside the classifier itself is not controlled by this cell.
N_RUNS = 10
BASE_SEED = 42
SCORE_THRESHOLD = 0.775  # minimum best mean CV F1 required to write a submission

for i in range(N_RUNS):
    print(i)

    # set_params mutates `cv` in place; after the loop it keeps the last seed.
    cv.set_params(random_state=BASE_SEED + i)
    cv.fit(X, y)

    print(cv.best_score_)
    if cv.best_score_ > SCORE_THRESHOLD:
        # Time-of-day stamp (HHMMSS) used to tell the output files apart.
        current_time = datetime.now().strftime("%H%M%S")
        print("Current Time =", current_time)

        # Predict with the refitted best pipeline and store the result in the
        # two-column PassengerId / Survived layout.
        output = pd.DataFrame({
            "PassengerId": test_data["PassengerId"],
            "Survived": cv.predict(test_data)
        })
        output.to_csv("output_{}.csv".format(current_time), index=False)

0
0.7744152008542837
1
0.7707704925417295
2
0.7731946554122061
3
0.7698719108841717
4
0.7721321995383515
5
0.7738337875470487
6
0.7720447456538988
7
0.7707368246962576
8
0.7720837926567705
9
0.7742663632385727
