In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt 
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Cleaning Train Data

In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
df_train['Sex'].unique()

array(['male', 'female'], dtype=object)

In [5]:
df_train.drop(columns = ['Name'], inplace = True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [6]:
df_train.drop(columns = ['Ticket'], inplace = True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,7.25,,S
1,2,1,1,female,38.0,1,0,71.2833,C85,C
2,3,1,3,female,26.0,0,0,7.925,,S
3,4,1,1,female,35.0,1,0,53.1,C123,S
4,5,0,3,male,35.0,0,0,8.05,,S


In [7]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
categorical_list = ['Cabin', 'Embarked']
print(categorical_list)

['Cabin', 'Embarked']


In [9]:
for item in categorical_list:
    most_common = df_train[item].mode()[0]  
    df_train[item].fillna(most_common, inplace=True)  

In [10]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [11]:
numerical_list = ['Age']
for item in numerical_list:
    mean_value = df_train[item].mean()
    df_train[item].fillna(mean_value, inplace=True)
df_train.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [12]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Cabin        891 non-null    object 
 9   Embarked     891 non-null    object 
dtypes: float64(2), int64(5), object(3)
memory usage: 69.7+ KB


# Features Engineering Train Data

In [13]:
unique_sex_val = df_train['Sex'].unique()
print(unique_sex_val)
mapping_sex = {}
for i, value in enumerate(unique_sex_val): 
    mapping_sex[value] = i
print(mapping_sex)

['male' 'female']
{'male': 0, 'female': 1}


In [14]:
df_train['Sex'] = df_train['Sex'].map(mapping_sex)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,0,22.0,1,0,7.25,B96 B98,S
1,2,1,1,1,38.0,1,0,71.2833,C85,C
2,3,1,3,1,26.0,0,0,7.925,B96 B98,S
3,4,1,1,1,35.0,1,0,53.1,C123,S
4,5,0,3,0,35.0,0,0,8.05,B96 B98,S


In [15]:
unique_cabin_val = df_train['Cabin'].unique()
print(unique_cabin_val)
mapping_cabin = {}
for i, value in enumerate(unique_cabin_val): 
    mapping_cabin[value] = i
print(mapping_cabin)
df_train['Cabin'] = df_train['Cabin'].map(mapping_cabin).astype(int)
df_train.head()

['B96 B98' 'C85' 'C123' 'E46' 'G6' 'C103' 'D56' 'A6' 'C23 C25 C27' 'B78'
 'D33' 'B30' 'C52' 'B28' 'C83' 'F33' 'F G73' 'E31' 'A5' 'D10 D12' 'D26'
 'C110' 'B58 B60' 'E101' 'F E69' 'D47' 'B86' 'F2' 'C2' 'E33' 'B19' 'A7'
 'C49' 'F4' 'A32' 'B4' 'B80' 'A31' 'D36' 'D15' 'C93' 'C78' 'D35' 'C87'
 'B77' 'E67' 'B94' 'C125' 'C99' 'C118' 'D7' 'A19' 'B49' 'D' 'C22 C26'
 'C106' 'C65' 'E36' 'C54' 'B57 B59 B63 B66' 'C7' 'E34' 'C32' 'B18' 'C124'
 'C91' 'E40' 'T' 'C128' 'D37' 'B35' 'E50' 'C82' 'E10' 'E44' 'A34' 'C104'
 'C111' 'C92' 'E38' 'D21' 'E12' 'E63' 'A14' 'B37' 'C30' 'D20' 'B79' 'E25'
 'D46' 'B73' 'C95' 'B38' 'B39' 'B22' 'C86' 'C70' 'A16' 'C101' 'C68' 'A10'
 'E68' 'B41' 'A20' 'D19' 'D50' 'D9' 'A23' 'B50' 'A26' 'D48' 'E58' 'C126'
 'B71' 'B51 B53 B55' 'D49' 'B5' 'B20' 'F G63' 'C62 C64' 'E24' 'C90' 'C45'
 'E8' 'B101' 'D45' 'C46' 'D30' 'E121' 'D11' 'E77' 'F38' 'B3' 'D6'
 'B82 B84' 'D17' 'A36' 'B102' 'B69' 'E49' 'C47' 'D28' 'E17' 'A24' 'C50'
 'B42' 'C148']
{'B96 B98': 0, 'C85': 1, 'C123': 2, 'E46': 3, '

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,0,22.0,1,0,7.25,0,S
1,2,1,1,1,38.0,1,0,71.2833,1,C
2,3,1,3,1,26.0,0,0,7.925,0,S
3,4,1,1,1,35.0,1,0,53.1,2,S
4,5,0,3,0,35.0,0,0,8.05,0,S


In [16]:
unique_embarked_val = df_train['Embarked'].unique()
print(unique_embarked_val)
mapping_embarked = {}
for i, value in enumerate(unique_embarked_val): 
    mapping_embarked[value] = i
print(mapping_embarked)
df_train['Embarked'] = df_train['Embarked'].map(mapping_embarked)
df_train.head()

['S' 'C' 'Q']
{'S': 0, 'C': 1, 'Q': 2}


Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,0,3,0,22.0,1,0,7.25,0,0
1,2,1,1,1,38.0,1,0,71.2833,1,1
2,3,1,3,1,26.0,0,0,7.925,0,0
3,4,1,1,1,35.0,1,0,53.1,2,0
4,5,0,3,0,35.0,0,0,8.05,0,0


In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    int64  
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Cabin        891 non-null    int32  
 9   Embarked     891 non-null    int64  
dtypes: float64(2), int32(1), int64(7)
memory usage: 66.3 KB


# Cleaning Test Data

In [18]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [19]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [20]:
df_test.drop(columns=['Name'],inplace=True)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


In [21]:
categorical_list = ['Cabin']
for item in categorical_list:
    most_common = df_test[item].mode()[0]  
    df_test[item].fillna(most_common, inplace=True)  
df_test.isnull().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           0
Embarked        0
dtype: int64

In [22]:
numerical_list = ['Age']
for item in numerical_list:
    mean_value = df_test[item].mean()
    df_test[item].fillna(mean_value, inplace=True)
df_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin          0
Embarked       0
dtype: int64

In [23]:
numerical_list = ['Fare']
for item in numerical_list:
    mean_value = df_test[item].mean()
    df_test[item].fillna(mean_value, inplace=True)
df_test.isnull().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [24]:
df_test.drop(columns = ['Ticket'], inplace = True)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,7.8292,B57 B59 B63 B66,Q
1,893,3,female,47.0,1,0,7.0,B57 B59 B63 B66,S
2,894,2,male,62.0,0,0,9.6875,B57 B59 B63 B66,Q
3,895,3,male,27.0,0,0,8.6625,B57 B59 B63 B66,S
4,896,3,female,22.0,1,1,12.2875,B57 B59 B63 B66,S


# Features Engineering Test Data

In [25]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    object 
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Cabin        418 non-null    object 
 8   Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 29.5+ KB


In [26]:
unique_sex_val = df_test['Sex'].unique()
print(unique_sex_val)
mapping_sex = {}
for i, value in enumerate(unique_sex_val): 
    mapping_sex[value] = i
print(mapping_sex)

['male' 'female']
{'male': 0, 'female': 1}


In [27]:
df_test['Sex'] = df_test['Sex'].map(mapping_sex)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,0,34.5,0,0,7.8292,B57 B59 B63 B66,Q
1,893,3,1,47.0,1,0,7.0,B57 B59 B63 B66,S
2,894,2,0,62.0,0,0,9.6875,B57 B59 B63 B66,Q
3,895,3,0,27.0,0,0,8.6625,B57 B59 B63 B66,S
4,896,3,1,22.0,1,1,12.2875,B57 B59 B63 B66,S


In [28]:
unique_cabin_val = df_test['Cabin'].unique()
print(unique_cabin_val)
mapping_cabin = {}
for i, value in enumerate(unique_cabin_val): 
    mapping_cabin[value] = i
print(mapping_cabin)
df_test['Cabin'] = df_test['Cabin'].map(mapping_cabin)
df_test.head()

['B57 B59 B63 B66' 'B45' 'E31' 'B36' 'A21' 'C78' 'D34' 'D19' 'A9' 'D15'
 'C31' 'C23 C25 C27' 'F G63' 'B61' 'C53' 'D43' 'C130' 'C132' 'C101'
 'C55 C57' 'B71' 'C46' 'C116' 'F' 'A29' 'G6' 'C6' 'C28' 'C51' 'E46' 'C54'
 'C97' 'D22' 'B10' 'F4' 'E45' 'E52' 'D30' 'B58 B60' 'E34' 'C62 C64' 'A11'
 'B11' 'C80' 'F33' 'C85' 'D37' 'C86' 'D21' 'C89' 'F E46' 'A34' 'D' 'B26'
 'C22 C26' 'B69' 'C32' 'B78' 'F E57' 'F2' 'A18' 'C106' 'B51 B53 B55'
 'D10 D12' 'E60' 'E50' 'E39 E41' 'B52 B54 B56' 'C39' 'B24' 'D28' 'B41'
 'C7' 'D40' 'D38' 'C105']
{'B57 B59 B63 B66': 0, 'B45': 1, 'E31': 2, 'B36': 3, 'A21': 4, 'C78': 5, 'D34': 6, 'D19': 7, 'A9': 8, 'D15': 9, 'C31': 10, 'C23 C25 C27': 11, 'F G63': 12, 'B61': 13, 'C53': 14, 'D43': 15, 'C130': 16, 'C132': 17, 'C101': 18, 'C55 C57': 19, 'B71': 20, 'C46': 21, 'C116': 22, 'F': 23, 'A29': 24, 'G6': 25, 'C6': 26, 'C28': 27, 'C51': 28, 'E46': 29, 'C54': 30, 'C97': 31, 'D22': 32, 'B10': 33, 'F4': 34, 'E45': 35, 'E52': 36, 'D30': 37, 'B58 B60': 38, 'E34': 39, 'C62 C64': 40,

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,0,34.5,0,0,7.8292,0,Q
1,893,3,1,47.0,1,0,7.0,0,S
2,894,2,0,62.0,0,0,9.6875,0,Q
3,895,3,0,27.0,0,0,8.6625,0,S
4,896,3,1,22.0,1,1,12.2875,0,S


In [29]:
unique_embarked_val = df_test['Embarked'].unique()
print(unique_embarked_val)
mapping_embarked = {}
for i, value in enumerate(unique_embarked_val): 
    mapping_embarked[value] = i
print(mapping_embarked)
df_test['Embarked'] = df_test['Embarked'].map(mapping_embarked)
df_test.head()

['Q' 'S' 'C']
{'Q': 0, 'S': 1, 'C': 2}


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,892,3,0,34.5,0,0,7.8292,0,0
1,893,3,1,47.0,1,0,7.0,0,1
2,894,2,0,62.0,0,0,9.6875,0,0
3,895,3,0,27.0,0,0,8.6625,0,1
4,896,3,1,22.0,1,1,12.2875,0,1


In [30]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Sex          418 non-null    int64  
 3   Age          418 non-null    float64
 4   SibSp        418 non-null    int64  
 5   Parch        418 non-null    int64  
 6   Fare         418 non-null    float64
 7   Cabin        418 non-null    int64  
 8   Embarked     418 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 29.5 KB


# Training

In [31]:
X = df_train.drop('Survived', axis=1).values
y = df_train['Survived'].values

In [32]:
from sklearn.model_selection import train_test_split
# Set the random_state parameter to any integer. This will ensure reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((712, 9), (712,), (179, 9), (179,))

In [33]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=350, max_depth=17, random_state=1)
model.fit(X_train, y_train)
model_predict_score = model.score(X_test,y_test)
print(f"Accuracy: {model_predict_score * 100}%")

Accuracy: 82.12290502793296%


# Prediction

In [34]:
model_pre = RandomForestClassifier(n_estimators=350, max_depth=10, random_state=1)
model_pre.fit(X_train, y_train)
predictions = model_pre.predict(df_test)

In [35]:
submission = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
submission.to_csv('submission.csv', index=False)
print("Your submission is saved successfully !")

Your submission is saved successfully !
