In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import time
import warnings
# NOTE(review): this silences every warning category for the whole notebook,
# so deprecation/convergence warnings raised later will not be shown.
warnings.filterwarnings('ignore')

# ML implementation using an AutoML-style model search
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the labelled training set and the unlabelled evaluation set from the
# notebook's working directory.
df = pd.read_csv('./train.csv')
df_evaluation = pd.read_csv('./test.csv')

In [4]:
# Column dtypes and non-null counts; Age, Cabin and Embarked contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
# Summary statistics for the numeric columns only.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Peek at the first five raw rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# PassengerId: passenger identifier - can be removed
# Survived: target variable
# Pclass: passenger class
# Name: passenger name - can be removed
# Sex: categorical variable
# Age: needs binning; contains some NaN values
# SibSp: number of siblings/spouses aboard
# Parch: number of parents/children aboard
# Ticket: ticket number - can be removed
# Fare: needs binning
# Cabin: seat/room number - too many NaN values
# Embarked: port where the passenger boarded the ship

In [8]:
# Number of passengers in each ticket class.
df['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [9]:
# Number of passengers per embarkation code (value_counts skips NaN by default).
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [10]:
# Cabin is high-cardinality (many distinct values, most seen once).
df['Cabin'].value_counts()

B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: Cabin, Length: 147, dtype: int64

In [11]:
# Distribution of the number of parents/children travelling with each passenger.
df['Parch'].value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [12]:
def _bin_equal_width(values, num_bins):
    """Cut a numeric Series into `num_bins` equal-width bins labelled 1..num_bins."""
    _, bin_edges = np.histogram(values, bins=num_bins)
    return pd.cut(x=values,
                  bins=bin_edges,
                  labels=list(range(1, num_bins + 1)),
                  include_lowest=True)


def preprocess_dataframe(df):
    """Return a cleaned, numerically-encoded copy of a Titanic passenger frame.

    Steps, in order:
      1. Fill missing 'Age'/'Fare' with the column mean, missing 'Cabin' with
         'N', and missing 'Embarked' with the most frequent value.
      2. Reduce 'Cabin' to its first character.
      3. Bin 'Age' and 'Fare' into 5 equal-width bins labelled 1..5.
      4. Replace 'Name' with the extracted title (Mr, Miss, Mrs, Master, Dr,
         otherwise 'Others').
      5. Label-encode 'Cabin', 'Sex', 'Embarked' and 'Name'.
      6. Add 'IsAlone' (1 when SibSp + Parch == 0) and drop 'Ticket'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the columns Age, Fare, Cabin, Embarked,
        Name, Sex, SibSp, Parch and Ticket. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.

    Notes
    -----
    Bin edges and label encodings are derived from the frame that is passed
    in. NOTE(review): when this is called separately on training and test
    data, the two results may therefore use different edges/codes.
    """
    # Work on a copy so the caller's frame (possibly displayed in an earlier
    # cell) is left untouched and the cell stays idempotent.
    df = df.copy()

    # fill na
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    df['Cabin'] = df['Cabin'].fillna('N').str[:1]
    # Without this, NaN reaches LabelEncoder mixed with strings, which either
    # fails or becomes a spurious extra class depending on the library version.
    embarked_mode = df['Embarked'].mode()
    if not embarked_mode.empty:
        df['Embarked'] = df['Embarked'].fillna(embarked_mode.iloc[0])

    # continuous variable - binning
    NUM_OF_BIN = 5
    df['Age'] = _bin_equal_width(df['Age'], NUM_OF_BIN)
    df['Fare'] = _bin_equal_width(df['Fare'], NUM_OF_BIN)

    # extract Mr, Mrs, Miss, Master, Dr from name (raw string: '\.' is a regex
    # escape, not a Python string escape)
    df['Name'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Name'] = np.where(df['Name'].isin(['Mr', 'Miss', 'Mrs', 'Master', 'Dr']), df['Name'], 'Others')

    # encoding
    for feature in ['Cabin', 'Sex', 'Embarked', 'Name']:
        df[feature] = LabelEncoder().fit_transform(df[feature])

    # A passenger is alone when travelling with no siblings/spouses and no
    # parents/children.
    df['IsAlone'] = ((df['SibSp'] + df['Parch'] + 1) == 1).astype(int)

    # drop unnecessary columns
    df = df.drop(['Ticket'], axis='columns')

    return df

In [13]:
from sklearn.model_selection import train_test_split
import random

def train_model(df_train):
    """Grid-search several classifiers (with optional scaling) on `df_train`.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Preprocessed training data. Must contain 'PassengerId' and 'Survived';
        every remaining column is used as a feature.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True`` it has been refit
        with the best configuration, so ``best_estimator_`` can be used for
        prediction.

    Side effects
    ------------
    Prints the best estimator, its parameters and its cross-validated accuracy.
    """
    train_x = df_train.drop(['PassengerId', 'Survived'], axis='columns')
    # Take the target from the argument (not from a notebook-global `df`) so
    # features and labels always come from the same frame, and pass it as a
    # 1-D Series rather than a single-column DataFrame.
    train_y = df_train['Survived']

    # Both steps are placeholders that the grid below fills in. The estimator
    # step keeps the name 'regressor' because the parameter keys depend on it,
    # even though every candidate is a classifier.
    pipe = Pipeline([('preprocessing', None), ('regressor', None)])
    pre_list = [StandardScaler(), MinMaxScaler(), None]
    hyperparam_grid = [
        # LogisticRegression
        {'regressor': [LogisticRegression()], 'preprocessing': pre_list,
         'regressor__C': [0.0001, 0.001, 0.01, 0.1, 1, 10]},
        # DecisionTree
        {'regressor': [DecisionTreeClassifier()], 'preprocessing': pre_list,
         'regressor__max_depth': [3, 5, 7, 11], 'regressor__min_samples_split': [2, 3, 5],
         'regressor__min_samples_leaf': [1, 5, 8]},
        # RandomForest
        {'regressor': [RandomForestClassifier()], 'preprocessing': pre_list,
         'regressor__max_depth': [5, 6, 7, 8, 9], 'regressor__min_samples_split': [3, 4, 5],
         'regressor__min_samples_leaf': [1, 2]},
        # Support Vector Classifier
        {'regressor': [SVC()], 'preprocessing': pre_list,
         'regressor__C': [0.1, 1, 3, 5, 10], 'regressor__kernel': ['poly', 'rbf', 'sigmoid'],
         'regressor__gamma': ['scale', 'auto']},
        # Gradient Boosting Classifier
        {'regressor': [GradientBoostingClassifier()], 'preprocessing': pre_list,
         'regressor__learning_rate': [0.001, 0.01, 0.1, 1, 3, 5],
         'regressor__n_estimators': [30, 50, 100, 200]},
        # Gaussian Naive Bayes
        {'regressor': [GaussianNB()], 'preprocessing': pre_list},
        # Regression estimators (Ridge, Lasso, SVR, MLPRegressor, XGBRegressor)
        # are intentionally not searched: the search is scored on accuracy.
    ]
    kfold = KFold(n_splits=7, shuffle=True, random_state=1)
    grid = GridSearchCV(pipe, hyperparam_grid, scoring='accuracy', refit=True, cv=kfold)
    grid.fit(train_x, train_y)
    print(grid.best_estimator_)
    print(grid.best_params_)
    print(grid.best_score_)

    return grid

In [14]:
def predict_result(df_test, grid):
    """Predict 'Survived' for every row of `df_test` with the best estimator.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Preprocessed evaluation data containing 'PassengerId' plus the feature
        columns the model was trained on.
    grid : object
        A fitted search object exposing ``best_estimator_`` with a
        ``predict`` method (as returned by ``train_model``).

    Returns
    -------
    pandas.DataFrame
        Two columns, 'PassengerId' and 'Survived', one row per row of
        `df_test`, keeping `df_test`'s index.
    """
    test_x = df_test.drop(['PassengerId'], axis='columns')
    result_list = grid.best_estimator_.predict(test_x)

    # Attach predictions by position. Joining a freshly built frame would
    # align on index labels and yield NaN/misaligned rows whenever `df_test`
    # does not carry the default 0..n-1 index.
    df_result = df_test[['PassengerId']].copy()
    df_result['Survived'] = result_list

    return df_result

In [15]:
# End-to-end run: load -> preprocess -> model search -> predict -> save,
# timed from the first read to the final write.
start_time = time.time()

# Start from the raw CSVs so this cell does not depend on frames that an
# earlier cell may already have transformed.
df = preprocess_dataframe(pd.read_csv('./train.csv'))
df_evaluation = preprocess_dataframe(pd.read_csv('./test.csv'))

# Search for the best model on the training frame, then score the evaluation frame.
grid = train_model(df)
df_result = predict_result(df_evaluation, grid)

# Write the predictions without the index column.
df_result.to_csv('./result.csv', index=None)

elapsed_sec = time.time() - start_time
print('job running time %f sec' % elapsed_sec)

Pipeline(steps=[('preprocessing', StandardScaler()), ('regressor', SVC(C=1))])
{'preprocessing': StandardScaler(), 'regressor': SVC(C=1), 'regressor__C': 1, 'regressor__gamma': 'scale', 'regressor__kernel': 'rbf'}
0.8317368532058493
job running time 121.591238 sec
