# **TITANIC MACHINE LEARNING FROM DISASTERS MODEL**

# Importing the libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Importing the training dataset

In [2]:
dataset_train = pd.read_csv('/kaggle/input/titanic/train.csv')

In [3]:
dataset_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Creating a new type of variable from parch and sibsp

In [4]:
dataset_train['family_size'] = dataset_train['SibSp'] + dataset_train['Parch'] + 1 
dataset_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1


In [5]:
dataset_train[['family_size', 'Survived']].groupby(['family_size'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,family_size,Survived
3,4,0.724138
2,3,0.578431
1,2,0.552795
6,7,0.333333
0,1,0.303538
4,5,0.2
5,6,0.136364
7,8,0.0
8,11,0.0


# Creating another new type from family_size

In [6]:
dataset_train['alone'] = 0
dataset_train.loc[dataset_train['family_size'] == 1, 'alone'] = 1
dataset_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size,alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,1,1


In [7]:
dataset_train[['alone', 'Survived']].groupby(['alone'], as_index=False).mean().sort_values(by='Survived', ascending=False)

Unnamed: 0,alone,Survived
0,0,0.50565
1,1,0.303538


In [8]:
X_train = dataset_train.iloc[:, [2,4,5,9,11,13]].values
y_train = dataset_train.iloc[:, 1].values

# Checking Missing values in our training dataset


In [9]:
print(dataset_train.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
family_size      0
alone            0
dtype: int64


# Inserting new Values at the place of missing data in training set

In [10]:
# For Age
imputer_1 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_1.fit(X_train[:, [2]])
X_train[:, [2]] = imputer_1.transform(X_train[:, [2]])

# For Embarked
imputer_2 = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_2.fit(X_train[:, [4]])
X_train[:, [4]] = imputer_2.transform(X_train[:, [4]])

# Encoding categorical data in training set

In [11]:
# Encoding P Class
ct_1 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X_train = np.array(ct_1.fit_transform(X_train))
X_train = X_train[: ,1:]

# Encoding Embarked
ct_2 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [5])], remainder='passthrough')
X_train = np.array(ct_2.fit_transform(X_train))
X_train = X_train[: ,[0,1,3,4,5,6,7,8]]

# Encoding Gender
le_train = LabelEncoder()
X_train[:, 4] = le_train.fit_transform(X_train[:, 4])

# Now Doing the above whole preprocessing on our test dataset

# Importing the test dataset

In [12]:
dataset_test= pd.read_csv('/kaggle/input/titanic/test.csv')

In [13]:
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [14]:
dataset_test['family_size'] = dataset_test['SibSp'] + dataset_test['Parch'] + 1 
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3


In [15]:
dataset_test['alone'] = 0
dataset_test.loc[dataset_train['family_size'] == 1, 'alone'] = 1
dataset_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,family_size,alone
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,1,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,2,0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,1,1
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,1,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,3,1


In [16]:
X_test = dataset_test.iloc[:, [1,3,4,8,10,12]].values

# Checking Missing values in our test dataset

In [17]:
print(dataset_test.isnull().sum())

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
family_size      0
alone            0
dtype: int64


# Inserting new Values at the place of missing data in test set

In [18]:
# For Age
imputer_3 = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_3.fit(X_test[:, [2]])
X_test[:, [2]] = imputer_3.transform(X_test[:, [2]])

# For Fare
imputer_4 = SimpleImputer(missing_values=np.nan, strategy='median')
imputer_4.fit(X_test[:, [3]])
X_test[:, [3]] = imputer_4.transform(X_test[:, [3]])

# Encoding categorical data in test dataset

In [19]:
# Encoding P Class
ct_3 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X_test = np.array(ct_3.fit_transform(X_test))
X_test = X_test[: ,1:]

# Encoding Embarked
ct_4 = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [5])], remainder='passthrough')
X_test = np.array(ct_4.fit_transform(X_test))
X_test = X_test[: ,[0,1,3,4,5,6,7,8]]

# Encoding Gender
le_test = LabelEncoder()
X_test[:, 4] = le_test.fit_transform(X_test[:, 4])

# Splitting the dataset into the Training set and Test set

In [20]:
from sklearn.model_selection import train_test_split
X_1, X_2, y_1, y_2 = train_test_split(X_train, y_train, test_size = 0.20)

# Applying Feature Scaling

In [21]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_1[:, [5,6]] = sc_X.fit_transform(X_1[:, [5,6]])
X_2[:, [5,6]] = sc_X.transform(X_2[:, [5,6]])
X_test[:, [5,6]] = sc_X.transform(X_test[:, [5,6]])

 # Now traing our Machine learning model on training dataset and fitting it over test data set to predict survival


In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')
classifier.fit(X_1,y_1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

# Prediction for Training Set

In [23]:
y_pred_train = classifier.predict(X_2)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

print('Confusion Matrix :')
print(confusion_matrix(y_2, y_pred_train)) 
print('Accuracy Score :',accuracy_score(y_2, y_pred_train))
print('Report : ')
print(classification_report(y_2, y_pred_train))

Confusion Matrix :
[[93 15]
 [15 56]]
Accuracy Score : 0.8324022346368715
Report : 
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       108
           1       0.79      0.79      0.79        71

    accuracy                           0.83       179
   macro avg       0.82      0.82      0.82       179
weighted avg       0.83      0.83      0.83       179



# Prediction for test Set

In [24]:
y_pred_test = classifier.predict(X_test)

output = pd.DataFrame({'PassengerId': dataset_test.PassengerId, 'Survived': y_pred_test})
output.to_csv('my_submission_4.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
