In [2]:
# Load data
import numpy as np
import os
import pandas as pd
np.random.seed(42)

def load_data(name):
    return pd.read_csv(name)

train = load_data("train.csv")
test = load_data("test.csv")

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Check if there is any nan values
pd.isnull(train).sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
# Check the types of each variable
train.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [6]:
# Repacing the nan values in Age with the mean
train["Age"].fillna(train["Age"].mean(), inplace=True)

In [7]:
# Replacing the nan values in the Cabin and embarked variable
import statistics
train["Cabin"].fillna("Unknown", inplace=True)
train["Embarked"].fillna(statistics.mode(train["Embarked"]), inplace=True)

In [8]:
# Check if there is any nan values
pd.isnull(train).sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [9]:
# Round up to nearest 10 000
import math
def roundup(x):
    return int(math.ceil(x / 10000.0))*10000

In [10]:
# Transforming Ticket variable
def transform_ticket(col):
    return_col = []
    for x in col:
        if " " in x:
            return_col.append(roundup(int(x.split(" ")[len(x.split(" "))-1])))
        elif x == "LINE":
            return_col.append(roundup(1))
        else:  
            return_col.append(roundup(int(x)))
    return return_col

transformed_ticket = transform_ticket(train["Ticket"])
train["Ticket"] = transformed_ticket    


In [11]:
# Transform Cabin variable
def transform_cabin(col):
    return_col = []
    for x in col:
        if x is not "Unknown":
            return_col.append(x[0])
        else:
            return_col.append(x)
    return return_col

transformed_cabin = transform_cabin(train["Cabin"])
train["Cabin"] = transformed_cabin

In [12]:
def transform_sex(x):
    ret = []
    for var in x:
        if var == 'female':
            ret.append(0)
        else:
            ret.append(1)
    return ret

In [13]:
train["Sex"] = transform_sex(train["Sex"])

In [14]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
cat_vars = train[["Cabin","Embarked"]]
cat_vars_hot = encoder.fit_transform(cat_vars)
encoder.categories_
# Replace the old values with the transformed variables
#train_t = train.join(cat_vars_hot.toarray())

[array(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T', 'Unknown'], dtype=object),
 array(['C', 'Q', 'S'], dtype=object)]

In [15]:
cols = ('Cab_A','Cab_B','Cab_C','Cab_D','Cab_E','Cab_F','Cab_G','Cab_T','Cab_U','Emb_C','Emb_Q','Emb_S')
temp_df = pd.DataFrame(cat_vars_hot.toarray(), columns=cols)
temp_df.head()

Unnamed: 0,Cab_A,Cab_B,Cab_C,Cab_D,Cab_E,Cab_F,Cab_G,Cab_T,Cab_U,Emb_C,Emb_Q,Emb_S
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [16]:
train = train.join(temp_df)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Cab_C,Cab_D,Cab_E,Cab_F,Cab_G,Cab_T,Cab_U,Emb_C,Emb_Q,Emb_S
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,30000,7.25,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,20000,71.2833,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,3110000,7.925,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,120000,53.1,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,380000,8.05,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [17]:
train = train.drop(['Cabin','Embarked',"Name"], axis=1)
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cab_A', 'Cab_B', 'Cab_C', 'Cab_D', 'Cab_E', 'Cab_F',
       'Cab_G', 'Cab_T', 'Cab_U', 'Emb_C', 'Emb_Q', 'Emb_S'],
      dtype='object')

In [18]:
# Normalizing the variables
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)
train_scaled = pd.DataFrame(train_scaled,columns=train.columns)
train_scaled.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cab_A,...,Cab_C,Cab_D,Cab_E,Cab_F,Cab_G,Cab_T,Cab_U,Emb_C,Emb_Q,Emb_S
0,0.0,0.0,1.0,1.0,0.271174,0.125,0.0,0.006452,0.014151,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,0.001124,1.0,0.0,0.0,0.472229,0.125,0.0,0.003226,0.139136,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.002247,1.0,1.0,0.0,0.321438,0.0,0.0,1.0,0.015469,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.003371,1.0,0.0,0.0,0.434531,0.125,0.0,0.035484,0.103644,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.004494,0.0,1.0,1.0,0.434531,0.0,0.0,0.119355,0.015713,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [25]:
# Split
y = train_scaled["Survived"]
X = train_scaled.drop(columns=["Survived"])

In [34]:
# Training a classification model to test performance
from sklearn.linear_model import LogisticRegression


log_reg = LogisticRegression(solver='lbfgs')
log_reg.fit(X,y)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [27]:
# Check MSE
from sklearn.metrics import mean_squared_error
preds = log_reg.predict(X)
lin_mse = mean_squared_error(y,preds)
lin_rmse = np.sqrt(lin_mse)
lin_rmse


0.4355163866123252

In [35]:
# Using cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, X,y, scoring="neg_mean_squared_error",cv=10)
rmse_scores = np.sqrt(-scores)

In [36]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

display_scores(rmse_scores)

Scores:  [0.4472136  0.47140452 0.44971901 0.35156152 0.46204236 0.44971901
 0.47404546 0.46204236 0.39661489 0.46466019]
Mean:  0.4429022937155686
Standard deviation:  0.036858577063674576


In [37]:
mean_log_reg = 0.443

In [39]:
from sklearn.feature_selection import RFECV
rfecv = RFECV(estimator=LogisticRegression(solver='lbfgs'), step=1, cv=10, scoring='accuracy')
rfecv.fit(X, y)
print("Optimal number of features: %d" % rfecv.n_features_)
print('Selected features: %s' % list(X.columns[rfecv.support_]))

Optimal number of features: 14
Selected features: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cab_C', 'Cab_D', 'Cab_E', 'Cab_F', 'Cab_G', 'Cab_U', 'Emb_S']


In [40]:
x_best = X['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cab_C', 'Cab_D', 'Cab_E', 'Cab_F', 'Cab_G', 'Cab_U', 'Emb_S']


KeyError: ('Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cab_C', 'Cab_D', 'Cab_E', 'Cab_F', 'Cab_G', 'Cab_U', 'Emb_S')