In [125]:
import pandas as pd
import numpy as np

In [126]:
# Import train and test datasets

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [127]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [128]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [129]:
# Check Null values
print(train.isnull().sum())
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [130]:
# Drop useless columns for model

train.drop(columns=['Name','Ticket', 'Fare', 'Cabin'], axis=1,inplace=True)
test.drop(columns=['Name','Ticket', 'Fare', 'Cabin'], axis=1,inplace=True)

In [131]:
# Null values
print(train.isnull().sum())
print(test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Embarked         2
dtype: int64
PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Embarked        0
dtype: int64


In [132]:
# Filling null age values with Mean age in train dataset

train_mean = train['Age'].mean()
train['Age'] = train['Age'].fillna(train_mean)

In [133]:
# Filling null age values with Mean age in test dataset

test_mean = test['Age'].mean()
test['Age'] = test['Age'].fillna(test_mean)

In [134]:
# Check the most commom value for 'Embarked' to use it to fill null values

train['Embarked'].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [135]:
# Fill null values in Embarked column with 'S', which is the mode

train['Embarked'] = train['Embarked'].fillna('S')

In [136]:
# Now we process only text columns
print(train['Sex'].value_counts())

print(train['Embarked'].value_counts())

Sex
male      577
female    314
Name: count, dtype: int64
Embarked
S    646
C    168
Q     77
Name: count, dtype: int64


In [137]:
# change Sex column from male/female to 0/1

train['Sex'] = train['Sex'].apply(lambda x: 1 if x=='male' else 0)
test['Sex'] = test['Sex'].apply(lambda x: 1 if x=='male' else 0)

In [138]:
# Now I will use OneHotEncoder to process the other text column: 'Embarked'

from sklearn.preprocessing import OneHotEncoder

# create the Encoder
ohe = OneHotEncoder(handle_unknown='ignore')

# fit data
ohe = ohe.fit(train[['Embarked']])

# transform data
ohe.transform(train[['Embarked']]).toarray()

# create dataframe for the new values for the Embarked column
ohe_df = pd.DataFrame(ohe.transform(train[['Embarked']]).toarray(), columns=ohe.get_feature_names_out())

In [139]:
# add new columns from the OHE to the train dataset

train = pd.concat([train, ohe_df], axis=1)
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,22.0,1,0,S,0.0,0.0,1.0
1,2,1,1,0,38.0,1,0,C,1.0,0.0,0.0
2,3,1,3,0,26.0,0,0,S,0.0,0.0,1.0
3,4,1,1,0,35.0,1,0,S,0.0,0.0,1.0
4,5,0,3,1,35.0,0,0,S,0.0,0.0,1.0


In [140]:
# Repeat same process to test data

ohe_df = pd.DataFrame(ohe.transform(test[['Embarked']]).toarray(), columns=ohe.get_feature_names_out())

In [141]:
# add new columns from the OHE to the train dataset

test = pd.concat([test, ohe_df], axis=1)
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,892,3,1,34.5,0,0,Q,0.0,1.0,0.0
1,893,3,0,47.0,1,0,S,0.0,0.0,1.0
2,894,2,1,62.0,0,0,Q,0.0,1.0,0.0
3,895,3,1,27.0,0,0,S,0.0,0.0,1.0
4,896,3,0,22.0,1,1,S,0.0,0.0,1.0


In [142]:
# Exclude old Embarked column from both datasets

train = train.drop(['Embarked'], axis=1)
test = test.drop(['Embarked'], axis=1)

In [143]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,1,22.000000,1,0,0.0,0.0,1.0
1,2,1,1,0,38.000000,1,0,1.0,0.0,0.0
2,3,1,3,0,26.000000,0,0,0.0,0.0,1.0
3,4,1,1,0,35.000000,1,0,0.0,0.0,1.0
4,5,0,3,1,35.000000,0,0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.000000,0,0,0.0,0.0,1.0
887,888,1,1,0,19.000000,0,0,0.0,0.0,1.0
888,889,0,3,0,29.699118,1,2,0.0,0.0,1.0
889,890,1,1,1,26.000000,0,0,1.0,0.0,0.0


In [144]:
# Set feature and target and split the train dataset in X and Y

from sklearn.model_selection import train_test_split

X = train.drop(['PassengerId', 'Survived'], axis=1)
y = train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [145]:
# Random Forest

from sklearn import tree

RandomForest = tree.DecisionTreeClassifier(random_state=42)

RandomForest = RandomForest.fit(X_train,y_train)

RandomForest_predict = RandomForest.predict(X_val)
RandomForest_predict

array([0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0], dtype=int64)

In [146]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)

knn = knn.fit(X_train, y_train)

knn_predict = knn.predict(X_val)
knn_predict

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0], dtype=int64)

In [147]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

LogisticRegression = LogisticRegression(random_state=42)

LogisticRegression = LogisticRegression.fit(X_train, y_train)

LR_predict = LogisticRegression.predict(X_val)
LR_predict

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0], dtype=int64)

In [148]:
# Accuracy

from sklearn.metrics import accuracy_score

print(f"Random Forest: {accuracy_score(y_val,RandomForest_predict)}")
print(f"KNN: {accuracy_score(y_val,knn_predict)}")
print(f"Logistic Regression: {accuracy_score(y_val,LR_predict)}")

Random Forest: 0.7388059701492538
KNN: 0.753731343283582
Logistic Regression: 0.8097014925373134


In [149]:
# Confusion matrix

from sklearn.metrics import confusion_matrix

In [150]:
# Confusion matrix for RandomForest

confusion_matrix(y_val, RandomForest_predict)

array([[126,  31],
       [ 39,  72]], dtype=int64)

In [151]:
# Confusion matrix for Knn

confusion_matrix(y_val, knn_predict)

array([[136,  21],
       [ 45,  66]], dtype=int64)

In [152]:
# Confusion matrix for Logistic Regression

confusion_matrix(y_val, LR_predict)

array([[136,  21],
       [ 30,  81]], dtype=int64)

In [154]:
# drop PassengerId to get both X_train and test datasets equal

X_test = test.drop('PassengerId', axis=1)

In [156]:
# Predict test data

y_pred = LogisticRegression.predict(X_test)

In [158]:
test['Survived'] = y_pred

In [162]:
output_data = test[['PassengerId','Survived']]
output_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [163]:
output_data.to_csv('submission2.csv', index=False)