In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix , accuracy_score
from imblearn.over_sampling import SMOTE
import numpy as np

In [5]:
titanic = pd.read_csv('titanic_train.csv')


In [6]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
1,734,0,2,"Berriman, Mr. William John",male,23.0,0,0,28425,13.0,,S
2,383,0,3,"Tikkanen, Mr. Juho",male,32.0,0,0,STON/O 2. 3101293,7.925,,S
3,705,0,3,"Hansen, Mr. Henrik Juul",male,26.0,1,0,350025,7.8542,,S
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,,S


In [7]:
titanic['Embarked'].value_counts()

Embarked
S    525
C    125
Q     60
Name: count, dtype: int64

In [8]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            140
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         2
dtype: int64

In [9]:
titanic['Cabin'].value_counts()

Cabin
C23 C25 C27    4
C22 C26        3
F2             3
G6             3
B96 B98        3
              ..
A14            1
D49            1
C87            1
D56            1
C62 C64        1
Name: count, Length: 117, dtype: int64

Fill the NULL values

In [10]:
titanic['Age'].fillna(titanic['Age'].median(), inplace=True)
titanic['Embarked'].fillna(titanic['Embarked'].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  titanic['Age'].fillna(titanic['Age'].median(), inplace=True)


In [11]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          553
Embarked         0
dtype: int64

In [12]:
titanic['Sex'] = titanic['Sex'].map({'male':0 , 'female':1})
titanic['Embarked'] = titanic['Embarked'].map({'C':0 , 'Q':1, 'S':2})


In [13]:
titanic['FamilySize'] = titanic['SibSp'] + titanic['Parch'] + 1
titanic['Isalone'] = (titanic['FamilySize'] == 1).astype(int) #This creates a Boolean Series where each value is: True-> 1, False-> 0
titanic['HasCabin'] = titanic['Cabin'].notnull().astype(int) #This creates a Boolean Series where each value is: True-> 1, False-> 0
titanic['Title'] = titanic['Name'].str.extract(' ([A-Za-z]+)\.', expand=False).map({
    'Mr': 0,
    'Miss': 1,
    'Mrs': 2,
    'Master': 3,
    'Rare':4
}).fillna(4) #This creates a new column 'Title' with the extracted title from the name
titanic['Pclass_fair'] = titanic['Pclass'] * titanic['Fare']
titanic['Age_fair'] = titanic['Age'] * titanic['Fare']



In [14]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,Isalone,HasCabin,Title,Pclass_fair,Age_fair
0,332,0,1,"Partner, Mr. Austen",0,45.5,0,0,113043,28.5,C124,2,1,1,1,0.0,28.5,1296.75
1,734,0,2,"Berriman, Mr. William John",0,23.0,0,0,28425,13.0,,2,1,1,0,0.0,26.0,299.0
2,383,0,3,"Tikkanen, Mr. Juho",0,32.0,0,0,STON/O 2. 3101293,7.925,,2,1,1,0,0.0,23.775,253.6
3,705,0,3,"Hansen, Mr. Henrik Juul",0,26.0,1,0,350025,7.8542,,2,2,0,0,0.0,23.5626,204.2092
4,814,0,3,"Andersson, Miss. Ebba Iris Alfrida",1,6.0,4,2,347082,31.275,,2,7,0,0,1.0,93.825,187.65


In [16]:
x = titanic.drop(['Survived', 'PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
y = titanic['Survived']

In [17]:
titanic['Survived'].value_counts()

Survived
0    444
1    268
Name: count, dtype: int64

In [18]:
smote = SMOTE(random_state=42)
x_resampled, y_resampled = smote.fit_resample(x, y)
x_train, x_test, y_train, y_test = train_test_split(x_resampled, y_resampled, test_size=0.2, random_state=42)




In [23]:
y_resampled.value_counts()

Survived
0    444
1    444
Name: count, dtype: int64

In [2]:
from sklearn.model_selection import RandomizedSearchCV
params_distributions = {
                'n_distributions': [100,200,300],
                'max_depth' : [10,20,30],
                'min_samples_split' : [2,5],
                'min_sample_leaf':[1,2]
            }

In [None]:
rf = RandomForestClassifier(random_state=42)
random_search = RandomizedSearchCV(rf,params_distributions , n_iter=10,cv=3,scoring='accuracy')
random_search.fit(x_train,y_train)

In [28]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
importances = rf.feature_importances_
print(accuracy_score(y_test, y_pred) * 100)


[[70 14]
 [13 81]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84        84
           1       0.85      0.86      0.86        94

    accuracy                           0.85       178
   macro avg       0.85      0.85      0.85       178
weighted avg       0.85      0.85      0.85       178

84.8314606741573
