- In this assignment, you are going to build a model that predicts which passengers survived the Titanic disaster. To submit, send a link to a Jupyter notebook containing solutions to the following tasks.
  - Download the Titanic data from Kaggle. The train.csv file is sufficient for this assignment.
  - Split your data into training and test sets.
  - Predict the survival based on the test data you split by creating your model.
  - Is your model's performance satisfactory? Explain.
  - Try to improve your model's performance by adding or subtracting some variables.
- Explore the advantages and disadvantages of Logistic Regression and discuss with your mentor.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Kaggle Titanic training set; the relative path means the file must sit in the working directory.
df = pd.read_csv('train.csv')

# Preview the first rows to see the raw columns.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Inspect dtypes and non-null counts to find columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Turn the raw frame into an all-numeric feature table.
# NOTE: this cell rebinds `df`, so re-run the loading cell before re-running this one.

# Missing ages are filled with the mean age truncated to a whole year; ages are then
# stored as integers (fractional ages are truncated as well).
mean_age = int(df['Age'].mean())
# Missing embarkation ports are filled with the most frequent port.
most_common_port = df['Embarked'].value_counts().index[0]

df = (
    df
    # Identifier / free-text columns are not used as predictors.
    .drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
    .assign(
        Age=lambda frame: frame['Age'].fillna(mean_age).astype('int64'),
        Embarked=lambda frame: frame['Embarked'].fillna(most_common_port),
        # 1 for male, 0 for female (same encoding as get_dummies with drop_first=True).
        Sex=lambda frame: (frame['Sex'] == 'male').astype('uint8'),
    )
)

# One-hot encode Embarked, dropping the first level so only the Q and S indicators remain.
# dtype is pinned to uint8 because newer pandas versions return bool columns by default.
embarked_dummies = pd.get_dummies(df['Embarked'], drop_first=True, dtype='uint8')
df = pd.concat([df.drop(columns='Embarked'), embarked_dummies], axis=1)

In [5]:
# Preview the frame again after preprocessing.
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0,3,1,22,1,0,7.25,0,1
1,1,1,0,38,1,0,71.2833,0,0
2,1,3,0,26,0,0,7.925,0,1
3,1,1,0,35,1,0,53.1,0,1
4,0,3,1,35,0,0,8.05,0,1


In [6]:
# Re-check dtypes and non-null counts after preprocessing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    uint8  
 3   Age       891 non-null    int64  
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Q         891 non-null    uint8  
 8   S         891 non-null    uint8  
dtypes: float64(1), int64(5), uint8(3)
memory usage: 44.5 KB


In [7]:
# Separate the target (Survived) from the predictors (every remaining column).
y = df['Survived']
X = df.drop(columns=['Survived'])

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Baseline model: logistic regression on all features.
# max_iter is raised to 1000 - presumably so the solver converges on the unscaled features; TODO confirm.
lr = LogisticRegression(max_iter=1000)

# Fit on the training split only; as the cell's last expression, the estimator is displayed.
lr.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [10]:
# Score the baseline model on the split it was fitted on and on the held-out split.
train_accuracy, test_accuracy = (
    lr.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

In [11]:
# Report both baseline scores, one per line.
print(
    "Train accuracy: {}".format(train_accuracy),
    "Test accuracy: {}".format(test_accuracy),
    sep="\n",
)

Train accuracy: 0.8003355704697986
Test accuracy: 0.8135593220338984


In [12]:
# Sweep the regularisation parameter C of an L2-penalised logistic regression over
# several orders of magnitude and record train/test accuracy for each value.
# NOTE(review): picking C from the test-split score makes that score optimistic;
# a validation split or cross-validation would give a cleaner estimate.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Collect one record per C and build the frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.
accuracy_records = []
for c in C_values:
    lr = LogisticRegression(max_iter=1000, penalty='l2', C=c, random_state=0, solver='lbfgs')
    lr.fit(X_train, y_train)
    accuracy_records.append({
        'C_values': c,
        'Train Accuracy': lr.score(X_train, y_train),
        'Test Accuracy': lr.score(X_test, y_test),
    })

accuracy_values = pd.DataFrame(
    accuracy_records,
    columns=['C_values', 'Train Accuracy', 'Test Accuracy'],
)
display(accuracy_values)

Unnamed: 0,C_values,Train Accuracy,Test Accuracy
0,0.001,0.67953,0.661017
1,0.01,0.728188,0.715254
2,0.1,0.810403,0.823729
3,1.0,0.800336,0.813559
4,10.0,0.798658,0.813559
5,100.0,0.800336,0.813559
6,1000.0,0.798658,0.813559


`C = 0.1` gives the best train and test accuracy among the values tried, so it is used for the feature-selection experiments below.

In [13]:
def select_columns(columns):
    """Fit a logistic regression on a subset of feature columns and score it.

    Reads the notebook-level ``X`` and ``y``, selects the requested columns of
    ``X`` by position, splits with the same settings used elsewhere in the
    notebook (``test_size=0.33``, ``random_state=42``) and fits an
    L2-penalised model with ``C=0.1``.

    Parameters
    ----------
    columns : list of int
        Positional indices into the columns of ``X``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``'Columns'`` (comma-separated
        names of the selected features), ``'Train Accuracy'`` and
        ``'Test Accuracy'``.
    """
    X_selected = X.iloc[:, columns]
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.33, random_state=42)

    lr = LogisticRegression(max_iter=1000, penalty='l2', C=0.1, random_state=0, solver='lbfgs')
    lr.fit(X_train, y_train)

    # Build the one-row result directly; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(
        [{
            'Columns': ', '.join(list(X_selected.columns)),
            'Train Accuracy': lr.score(X_train, y_train),
            'Test Accuracy': lr.score(X_test, y_test),
        }],
        columns=['Columns', 'Train Accuracy', 'Test Accuracy'],
    )

In [14]:
# Feature subsets to evaluate, given as positional indices into X's columns
# (0 Pclass, 1 Sex, 2 Age, 3 SibSp, 4 Parch, 5 Fare, 6 Q, 7 S).
# The order is significant: it determines the row labels of df_selected.
column_subsets = [
    # All eight features, then each way of leaving one feature out.
    [0, 1, 2, 3, 4, 5, 6, 7],
    [0, 1, 2, 3, 4, 5, 6],
    [0, 1, 2, 3, 4, 5, 7],
    [0, 1, 2, 3, 4, 6, 7],
    [0, 1, 2, 3, 5, 6, 7],
    [0, 1, 2, 4, 5, 6, 7],
    [0, 1, 3, 4, 5, 6, 7],
    [0, 2, 3, 4, 5, 6, 7],
    [1, 2, 3, 4, 5, 6, 7],
    # Six features.
    [0, 1, 2, 3, 4, 5],
    [0, 1, 2, 3, 4, 7],
    [0, 1, 2, 3, 6, 7],
    [0, 1, 2, 5, 6, 7],
    [0, 1, 4, 5, 6, 7],
    [0, 3, 4, 5, 6, 7],
    [2, 3, 4, 5, 6, 7],
    # Five features.
    [0, 1, 2, 3, 4],
    [0, 1, 2, 3, 7],
    [0, 1, 2, 6, 7],
    [0, 1, 5, 6, 7],
    [0, 4, 5, 6, 7],
    [3, 4, 5, 6, 7],
    # Four features.
    [0, 1, 2, 3],
    [0, 1, 2, 7],
    [0, 1, 6, 7],
    [0, 5, 6, 7],
    [4, 5, 6, 7],
    # Three features.
    [0, 1, 2],
    [0, 1, 7],
    [0, 6, 7],
    [5, 6, 7],
    # Two features.
    [0, 1],
    [0, 7],
    [6, 7],
]

# Score every subset and stack the one-row results in a single concat
# (DataFrame.append was removed in pandas 2.0); ignore_index renumbers rows 0..n-1.
df_selected = pd.concat(
    [select_columns(subset) for subset in column_subsets],
    ignore_index=True,
)

print("Sorted first by Train Accuracy, then by Test Accuracy:")
display(df_selected.sort_values(['Train Accuracy', 'Test Accuracy'], ascending=False).head())
print("Sorted first by Test Accuracy, then by Train Accuracy:")
display(df_selected.sort_values(['Test Accuracy', 'Train Accuracy'], ascending=False).head())

Sorted first by Train Accuracy, then by Test Accuracy:


Unnamed: 0,Columns,Train Accuracy,Test Accuracy
24,"Pclass, Sex, Q, S",0.813758,0.776271
28,"Pclass, Sex, S",0.813758,0.776271
13,"Pclass, Sex, Parch, Fare, Q, S",0.812081,0.783051
3,"Pclass, Sex, Age, SibSp, Parch, Q, S",0.810403,0.827119
10,"Pclass, Sex, Age, SibSp, Parch, S",0.810403,0.827119


Sorted first by Test Accuracy, then by Train Accuracy:


Unnamed: 0,Columns,Train Accuracy,Test Accuracy
17,"Pclass, Sex, Age, SibSp, S",0.808725,0.830508
16,"Pclass, Sex, Age, SibSp, Parch",0.798658,0.830508
22,"Pclass, Sex, Age, SibSp",0.79698,0.830508
3,"Pclass, Sex, Age, SibSp, Parch, Q, S",0.810403,0.827119
10,"Pclass, Sex, Age, SibSp, Parch, S",0.810403,0.827119


### Normalization

In [15]:
# Normalizer rescales each sample (row) to unit norm; it does not rescale the
# individual feature columns.
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

# Same split settings as the baseline so the scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X_normalized,
    y,
    test_size=0.33,
    random_state=42,
)

lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

train_accuracy_norm, test_accuracy_norm = (
    lr.score(X_train, y_train),
    lr.score(X_test, y_test),
)

# Baseline scores (computed in an earlier cell) first, then the normalized-input scores.
print(
    "Train accuracy: {}".format(train_accuracy),
    "Test accuracy: {}\n".format(test_accuracy),
    "Train accuracy (Normalized): {}".format(train_accuracy_norm),
    "Test accuracy (Normalized): {}".format(test_accuracy_norm),
    sep="\n",
)

Train accuracy: 0.8003355704697986
Test accuracy: 0.8135593220338984

Train accuracy (Normalized): 0.674496644295302
Test accuracy (Normalized): 0.6915254237288135


### Standardization

In [16]:
# Split BEFORE fitting the scaler so the held-out rows never influence the
# mean/variance used for standardization (fitting on all of X leaks test-set
# statistics into training).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/std from the training rows only
X_test = scaler.transform(X_test)        # apply the training statistics to the test rows

# Full feature matrix scaled with the training statistics, kept under the original name.
X_scaled = scaler.transform(X)

lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

train_accuracy_std = lr.score(X_train, y_train)
test_accuracy_std = lr.score(X_test, y_test)

# Baseline scores (computed in an earlier cell) first, then the standardized-input scores.
print("Train accuracy: {}".format(train_accuracy))
print("Test accuracy: {}\n".format(test_accuracy))
print("Train accuracy (Standardized): {}".format(train_accuracy_std))
print("Test accuracy (Standardized): {}".format(test_accuracy_std))

Train accuracy: 0.8003355704697986
Test accuracy: 0.8135593220338984

Train accuracy (Standardized): 0.7986577181208053
Test accuracy (Standardized): 0.8169491525423729
