In [39]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Load data

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.info()
print(40*"-")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Passenger

In [5]:
labels = train.pop('Survived')

In [6]:
n_train = train.index

train_id = train.pop('PassengerId')
test_id = test.pop('PassengerId')

df = pd.concat([train, test])
df = df.reset_index(drop=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    1309 non-null   int64  
 1   Name      1309 non-null   object 
 2   Sex       1309 non-null   object 
 3   Age       1046 non-null   float64
 4   SibSp     1309 non-null   int64  
 5   Parch     1309 non-null   int64  
 6   Ticket    1309 non-null   object 
 7   Fare      1308 non-null   float64
 8   Cabin     295 non-null    object 
 9   Embarked  1307 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 102.4+ KB


### Clean data

In [8]:
pd.DataFrame({'values': df.isnull().sum(),
             'Percent': (df.isnull().sum() / len(df)) * 100}).sort_values(by='Percent', ascending=False)

Unnamed: 0,values,Percent
Cabin,1014,77.463713
Age,263,20.091673
Embarked,2,0.152788
Fare,1,0.076394
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0


In [9]:
df = df.drop('Cabin', axis=1)

In [10]:
df['title'] = df['Name'].map(lambda x: x.split(',')[1].split(' ')[1])

df['title'].value_counts()

Mr.          757
Miss.        260
Mrs.         197
Master.       61
Dr.            8
Rev.           8
Col.           4
Ms.            2
Major.         2
Mlle.          2
the            1
Lady.          1
Mme.           1
Don.           1
Sir.           1
Capt.          1
Jonkheer.      1
Dona.          1
Name: title, dtype: int64

In [11]:
titles = {'Mr.': 'Mr',
         'Miss.': 'Miss',
         'Mrs.': 'Mrs',
         'Master.': 'Master',
         'Rev.': 'Rev',
         'Dr.': 'Dr'}

df['title'] = df['title'].map(lambda x: titles.get(x, 'Other'))

In [12]:
age_median = df.groupby('title')['Age'].median().to_dict()

df['Age'] =  df['Age'].fillna(df['title'].map(age_median))

In [13]:
# Fill rest empty values
df['Fare'].fillna(df['Fare'].median(), inplace=True)
df['Embarked'].fillna('S', inplace=True)

In [14]:
pd.DataFrame({'values': df.isnull().sum(),
             'Percent': (df.isnull().sum() / len(df)) * 100}).sort_values(by='Percent', ascending=False)


Unnamed: 0,values,Percent
Pclass,0,0.0
Name,0,0.0
Sex,0,0.0
Age,0,0.0
SibSp,0,0.0
Parch,0,0.0
Ticket,0,0.0
Fare,0,0.0
Embarked,0,0.0
title,0,0.0


### Transform categorical to numerical

In [15]:
df_train = df.copy()

In [16]:
# Conver sex on numerical 
df_train['Sex'] = df_train['Sex'].map(lambda x: int(x == 'male'))

# One hot encoding for title and embarked

for feat in ['title', 'Embarked']: 
    dummies = pd.get_dummies(df_train[feat], prefix='{0}_'.format(feat), drop_first=True)
    df_train = pd.concat([df_train, dummies], axis=1)

In [17]:
# Select train and test part

train_data = df_train.iloc[n_train, :]
test_data = df_train.iloc[len(n_train):, :]

In [18]:
train_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,title,title__Master,title__Miss,title__Mr,title__Mrs,title__Other,title__Rev,Embarked__Q,Embarked__S
0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,S,Mr,0,0,1,0,0,0,0,1
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C,Mrs,0,0,0,1,0,0,0,0
2,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,S,Miss,0,1,0,0,0,0,0,1
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,S,Mrs,0,0,0,1,0,0,0,1
4,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,S,Mr,0,0,1,0,0,0,0,1


### Machine leaning

In [19]:
# Create list of models 

svc_clf = SVC(kernel='poly', degree=2, probability=True)
xgb_clf = XGBClassifier()
log_clf = LogisticRegression()
extr_clf = ExtraTreesClassifier()
knn_clf = KNeighborsClassifier()
lgbm_clf = LGBMClassifier()


models = {'svc': svc_clf,
         'xgb': xgb_clf,
         'log_reg': log_clf,
         'extra_tree': extr_clf,
         'knn': knn_clf,
         'lgmb': lgbm_clf}                                   

In [20]:
scalar = StandardScaler()

def train_models(df=train_data, models=models):
    """
    Get all numerical data from dataset and train list of models.
    Retrun dataframe with model's name and accuracy.
    """
    
    x = df[df.select_dtypes(include=np.number).columns].values
    x_scaled = scalar.fit_transform(x)

    accuracy_list = {}
    for model_name, model in models.items():
        score = cross_val_score(model, x_scaled, labels, cv=5, scoring='accuracy')
        accuracy_list[model_name] = np.mean(score)

    accuracy_list = pd.DataFrame(columns=['model', 'accuracy'], data=accuracy_list.items())
    
    return accuracy_list

In [21]:
models_accuracy = train_models()
models_accuracy

Unnamed: 0,model,accuracy
0,svc,0.828259
1,xgb,0.829421
2,log_reg,0.827148
3,extra_tree,0.801356
4,knn,0.811449
5,lgmb,0.830563


### Feature engineering

In [22]:
new_df = train_data.copy()

In [23]:
new_df['family_size'] = new_df['SibSp'] + new_df['Parch'] + 1

train_models(new_df)

Unnamed: 0,model,accuracy
0,svc,0.827136
1,xgb,0.83728
2,log_reg,0.827148
3,extra_tree,0.796868
4,knn,0.811456
5,lgmb,0.829427


In [24]:
new_df['is_single'] = new_df['family_size'].map(lambda x: int(x == 1))

train_models(new_df)

Unnamed: 0,model,accuracy
0,svc,0.827136
1,xgb,0.83728
2,log_reg,0.821543
3,extra_tree,0.789021
4,knn,0.808116
5,lgmb,0.829427


In [25]:
new_df['big_family'] = new_df['family_size'].map(lambda x: int(x > 5))

train_models(new_df)

Unnamed: 0,model,accuracy
0,svc,0.819271
1,xgb,0.83728
2,log_reg,0.819302
3,extra_tree,0.783397
4,knn,0.806993
5,lgmb,0.829427


## Hyperopt

In [26]:
X = train_data[train_data.select_dtypes(include=np.number).columns].values
X_scaled = scalar.fit_transform(X)

hyper_models = {}

In [27]:
# Select best parameters for svc_clf 

svc_params = {'C': np.arange(1, 15),
             'kernel': ('linear', 'poly', 'rbf', 'sigmoid')}

hyperopt_svc = RandomizedSearchCV(svc_clf, svc_params, cv=5, scoring='accuracy')
hyperopt_svc.fit(X_scaled, labels)

hyper_models['best_svc'] = hyperopt_svc.best_estimator_

In [28]:
# Select best parameters for xgb_clf 

xgb_params = {'n_estimators': np.arange(100, 1500, 10),
             'learning_rate': np.arange(0.01, 1, 0.01),
             'max_depth': np.arange(1, 20),
             'colsample_bytree': np.arange(0, 1, 0.1)}

hyperopt_xgb = RandomizedSearchCV(xgb_clf, xgb_params, cv=5, scoring='accuracy')
hyperopt_xgb.fit(X_scaled, labels)

hyper_models['best_xgb'] = hyperopt_xgb.best_estimator_

In [29]:
# Select best parameters for log_clf 

log_params = {'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
             'C': np.arange(1, 15)}

hyperopt_log = RandomizedSearchCV(log_clf, log_params, cv=5, scoring='accuracy')
hyperopt_log.fit(X_scaled, labels)

hyper_models['best_log'] = hyperopt_log.best_estimator_

In [30]:
# Select best parameters for extr_clf 

extr_params = {'n_estimators': np.arange(100, 1500, 10),
             'max_depth': np.arange(1, 20),
             'max_features': np.arange(1, 20),
             'min_samples_split': np.arange(1, 5)}

hyperopt_extr = RandomizedSearchCV(extr_clf, extr_params, cv=5, scoring='accuracy')
hyperopt_extr.fit(X_scaled, labels)

hyper_models['best_extr'] = hyperopt_extr.best_estimator_

In [31]:
# Select best parameters for knn_clf 

knn_params = {'n_neighbors': np.arange(2, 15),
             'weights': ('uniform', 'distance'),
             'leaf_size': np.arange(2, 50),
             'p': (1, 2)}

hyperopt_knn = RandomizedSearchCV(knn_clf, knn_params, cv=5, scoring='accuracy')
hyperopt_knn.fit(X_scaled, labels)

hyper_models['best_knn'] = hyperopt_knn.best_estimator_

In [32]:
# Select best parameters for lgbm_clf 

lgbm_params = {'n_estimators': np.arange(100, 1500, 10),
             'learning_rate': np.arange(0.01, 1, 0.01),
             'max_depth': np.arange(1, 20),
             'colsample_bytree': np.arange(0, 1, 0.1)}

hyperopt_lgbm = RandomizedSearchCV(lgbm_clf, lgbm_params, cv=5, scoring='accuracy')
hyperopt_lgbm.fit(X_scaled, labels)

hyper_models['best_lgbm'] = hyperopt_lgbm.best_estimator_

In [33]:
# Compare firsts models and models with new parameters

hyper_models_accuracy = train_models(models=hyper_models)
hyper_models_accuracy.rename(columns={'accuracy': 'hyper_accuracy'}, inplace=True)


pd.concat([models_accuracy, hyper_models_accuracy], axis=1)    

Unnamed: 0,model,accuracy,model.1,hyper_accuracy
0,svc,0.828259,best_svc,0.828291
1,xgb,0.829421,best_xgb,0.831674
2,log_reg,0.827148,best_log,0.827155
3,extra_tree,0.801356,best_extr,0.827148
4,knn,0.811449,best_knn,0.821537
5,lgmb,0.830563,best_lgbm,0.835051


In [34]:
# Select models with best accuracy

best_models = {'best_scv': svc_clf,
              'best_xgb': hyper_models['best_xgb'],
              'best_log': hyper_models['best_log'],
               #'best_extr': hyper_models['best_extr'],
              #'best_knn': hyper_models['best_knn'],
              'best_lgbm': hyper_models['best_lgbm']}

In [35]:
# Creating voting classifier with best models

voting_clf = VotingClassifier(estimators=[(model_name, model) for model_name, model in best_models.items()],
                                    voting='hard')
score = cross_val_score(voting_clf, X_scaled, labels, cv=5, scoring='accuracy')
print('Accuracy for voting classifier:', round(np.mean(score), 4))

Accuracy for voting classifier: 0.835


In [36]:
# Final model fitting
voting_clf.fit(X_scaled, labels)

VotingClassifier(estimators=[('best_scv',
                              SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=2,
                                  gamma='scale', kernel='poly', max_iter=-1,
                                  probability=True, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False)),
                             ('best_xgb',
                              XGBClassifier(base_score=0.5, booster='gbtree',
                                            colsample_bylevel=1,
                                            colsam...
                                             importance_type='split',
                                             learning_rate=0.77, max_depth=1,
                                             min_child_samples=20,
                                             min_child_weight=0.

### Test part

In [37]:
# Select numeric values
X_test = test_data[test_data.select_dtypes(include=np.number).columns].values

# Normalize X_test
X_test_scaled = scalar.fit_transform(X_test)

# Make prediction
y_pred = voting_clf.predict(X_test_scaled)

In [38]:
# Save to csv
predictions = pd.DataFrame({'PassengerId': test_id})
predictions['Survived'] = y_pred                        
predictions.to_csv('vot_submission.csv', index=False)