# Importing Libraries

In [1]:
import pandas as pd#for data manipulation
import matplotlib.pyplot as plt # to plot graphs
import numpy as np #
import seaborn as sns

from sklearn import linear_model #for regression
from sklearn.ensemble import RandomForestClassifier #ensemble model
from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler, LabelEncoder #preprocessing
from sklearn.model_selection import train_test_split #training and testing

from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import RidgeCV, LassoCV


import warnings
warnings.filterwarnings("ignore")

## Data Preparation

In [2]:
# loading the datasets
titanic_train = pd.read_csv("train.csv")
titanic_test = pd.read_csv("test.csv")
gender_data = pd.read_csv("gender_submission.csv")

# extract the necessary columns (age, sex, pclass and survived)
titanic_train.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [3]:
gender_data

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [4]:
titanic_train["Age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [5]:
titanic_test.info()
# titanic_test = titanic_test[["Age", "Sex", "Pclass", "Survived"]]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [6]:
# extract the necessary columns (age, sex, pclass and survived)
titanic_train = titanic_train[["Age", "Sex", "Pclass", "Survived"]]

titanic_test = titanic_test [["Age", "Sex", "Pclass", "PassengerId"]]


In [7]:
# titanic_train["Age"]

In [8]:
#using mean age value to fill the missing calues in the age column
titanic_train['Age'] = titanic_train['Age'].fillna(titanic_train.Age.mean())
titanic_test['Age'] = titanic_test['Age'].fillna(titanic_test.Age.mean())

## Transforming categorical features (especially sex) to numerical

In [9]:
# Transforming categorical features (especially sex) to numerical
genderEncodedLabels=LabelEncoder() 
gender=genderEncodedLabels.fit_transform(titanic_train["Sex"])
gender[:]
titanic_train['Sex']=gender

In [10]:
# Transforming categorical features (especially sex) to numerical
genderEncodedLabels=LabelEncoder() 
gender=genderEncodedLabels.fit_transform(titanic_test["Sex"])
gender[:]
titanic_test['Sex']=gender

In [11]:
titanic_test

Unnamed: 0,Age,Sex,Pclass,PassengerId
0,34.50000,1,3,892
1,47.00000,0,3,893
2,62.00000,1,2,894
3,27.00000,1,3,895
4,22.00000,0,3,896
...,...,...,...,...
413,30.27259,1,3,1305
414,39.00000,0,1,1306
415,38.50000,1,3,1307
416,30.27259,1,3,1308


In [12]:
np.arange(1,20)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [13]:
# Merging test data with gender data
test_df = titanic_test.merge(gender_data, on="PassengerId", how="right")
test_df

Unnamed: 0,Age,Sex,Pclass,PassengerId,Survived
0,34.50000,1,3,892,0
1,47.00000,0,3,893,1
2,62.00000,1,2,894,0
3,27.00000,1,3,895,0
4,22.00000,0,3,896,1
...,...,...,...,...,...
413,30.27259,1,3,1305,0
414,39.00000,0,1,1306,1
415,38.50000,1,3,1307,0
416,30.27259,1,3,1308,0


## Building Random Forest model

In [14]:
X_train = titanic_train.iloc[:,:-1] #independent variables
y_train = titanic_train.iloc[:,-1:] #dependent variable

In [15]:
# X_test

In [16]:
# Looking for the optimal estimators
random_forest = RandomForestClassifier(random_state=1)
n = np.arange(1,30)
param_grid = {'n_estimators' : n}

rf_cv = GridSearchCV(random_forest, param_grid=param_grid, cv=5)

rf_cv.fit(X_train, y_train)
print('Best value of n_estimators:',rf_cv.best_params_)
print('Best score:',rf_cv.best_score_*100)

Best value of n_estimators: {'n_estimators': 27}
Best score: 80.92084614901763


In [17]:
titanic_test

Unnamed: 0,Age,Sex,Pclass,PassengerId
0,34.50000,1,3,892
1,47.00000,0,3,893
2,62.00000,1,2,894
3,27.00000,1,3,895
4,22.00000,0,3,896
...,...,...,...,...
413,30.27259,1,3,1305
414,39.00000,0,1,1306
415,38.50000,1,3,1307
416,30.27259,1,3,1308


In [18]:
X_test = test_df.iloc[:,:-2] #independent variables
y_test = test_df.iloc[:,-1:] #dependent variable

In [19]:
X_test

Unnamed: 0,Age,Sex,Pclass
0,34.50000,1,3
1,47.00000,0,3
2,62.00000,1,2
3,27.00000,1,3
4,22.00000,0,3
...,...,...,...
413,30.27259,1,3
414,39.00000,0,1
415,38.50000,1,3
416,30.27259,1,3


In [20]:
y_test

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0


In [21]:
# Fitting the model with the optimal estimator
rfg = RandomForestClassifier(n_estimators=27, random_state=1)
rfg.fit(X_train,y_train)
rfg_pred = rfg.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, rfg_pred))

Mean Squared Error: 0.1937799043062201


In [22]:
# accuracy score
print("score:", accuracy_score(y_test, rfg_pred))

score: 0.80622009569378


In [23]:
# survival prediction
rfg_pred 

array([0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [24]:
test_df['PassengerId']

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [25]:
# creating the results dataframe
results = pd.DataFrame({
        "PassengerId": test_df['PassengerId'],
        "Survived": rfg_pred
    })
results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,1
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [26]:
# saving the dataframe to a csv file
results.to_csv('titanic.csv', index=False)