# Titanic Competition on Kaggle

### Import libraries and dataset

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv("train.csv",index_col='PassengerId')
test_data = pd.read_csv("test.csv",index_col='PassengerId')
data

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Drop Name and Ticket Columns

In [2]:
# Drop Columns
columns_to_delete = ['Name','Ticket']
data = data.drop(columns_to_delete,axis=1)
test_data = test_data.drop(columns_to_delete,axis=1)
data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.2500,,S
2,1,1,female,38.0,1,0,71.2833,C85,C
3,1,3,female,26.0,0,0,7.9250,,S
4,1,1,female,35.0,1,0,53.1000,C123,S
5,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,,S
888,1,1,female,19.0,0,0,30.0000,B42,S
889,0,3,female,,1,2,23.4500,,S
890,1,1,male,26.0,0,0,30.0000,C148,C


### Show the number of nan values in each columns

In [3]:
data.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### Remove Cabin column as it has many nan values

In [4]:
data = data.drop(['Cabin'],axis=1)
test_data = test_data.drop(['Cabin'],axis=1)
data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,1,0,7.2500,S
2,1,1,female,38.0,1,0,71.2833,C
3,1,3,female,26.0,0,0,7.9250,S
4,1,1,female,35.0,1,0,53.1000,S
5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S
888,1,1,female,19.0,0,0,30.0000,S
889,0,3,female,,1,2,23.4500,S
890,1,1,male,26.0,0,0,30.0000,C


### Fill nan values in dataset with the mean of the column

In [5]:
data.fillna(data.mean(),inplace=True)
test_data.fillna(data.mean(),inplace=True)

### Round the age column values

In [6]:
data['Age'] = data['Age'].round()
test_data['Age'] = test_data['Age'].round()

### Fill nan values in Embarked column with the mode

In [7]:
data['Embarked'].fillna(data.Embarked.mode()[0],inplace=True)
test_data['Embarked'].fillna(test_data.Embarked.mode()[0],inplace=True)

### Encode Sex column then add it to the dataframe

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['Sex'])], remainder='passthrough')

X = np.array(ct.fit_transform(test_data))
test_data.insert(2,'Male',X[:,0])
test_data.insert(3,'Female',X[:,1])
test_data = test_data.drop(['Sex'],axis=1)

X = np.array(ct.fit_transform(data))

data.insert(3,'Male',X[:,0])
data.insert(4,'Female',X[:,1])
data = data.drop(['Sex'],axis=1)
data

Unnamed: 0_level_0,Survived,Pclass,Male,Female,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,0,1,22.0,1,0,7.2500,S
2,1,1,1,0,38.0,1,0,71.2833,C
3,1,3,1,0,26.0,0,0,7.9250,S
4,1,1,1,0,35.0,1,0,53.1000,S
5,0,3,0,1,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
887,0,2,0,1,27.0,0,0,13.0000,S
888,1,1,1,0,19.0,0,0,30.0000,S
889,0,3,1,0,30.0,1,2,23.4500,S
890,1,1,0,1,26.0,0,0,30.0000,C


### Encode Embarked column then add it to the dataframe

In [9]:
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), ['Embarked'])], remainder='passthrough')

X = np.array(ct.fit_transform(test_data))

test_data.insert(len(test_data.columns),'C',X[:,0])
test_data.insert(len(test_data.columns),'Q',X[:,1])
test_data.insert(len(test_data.columns),'S',X[:,2])
test_data = test_data.drop(['Embarked'],axis=1)

X = np.array(ct.fit_transform(data))

data.insert(len(data.columns),'C',X[:,0])
data.insert(len(data.columns),'Q',X[:,1])
data.insert(len(data.columns),'S',X[:,2])
data = data.drop(['Embarked'],axis=1)
data

Unnamed: 0_level_0,Survived,Pclass,Male,Female,Age,SibSp,Parch,Fare,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,0,1,22.0,1,0,7.2500,0,0,1
2,1,1,1,0,38.0,1,0,71.2833,1,0,0
3,1,3,1,0,26.0,0,0,7.9250,0,0,1
4,1,1,1,0,35.0,1,0,53.1000,0,0,1
5,0,3,0,1,35.0,0,0,8.0500,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,0,1,27.0,0,0,13.0000,0,0,1
888,1,1,1,0,19.0,0,0,30.0000,0,0,1
889,0,3,1,0,30.0,1,2,23.4500,0,0,1
890,1,1,0,1,26.0,0,0,30.0000,1,0,0


### Split dataset into train and test data

In [10]:
X = data.iloc[:,1:]
y = data.iloc[:,0]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

## Train the Kmeans clustering model on the training set

In [11]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2, init = 'k-means++', random_state = 0)
y_predict = kmeans.fit_predict(X_train)

### Get accuracy score of the Kmeans clustering model after training

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_predict)

0.6348314606741573

## Train the Hierarical Agglomerative clustering model on the training set

In [13]:
from sklearn.cluster import AgglomerativeClustering
hc = AgglomerativeClustering(n_clusters = 2, affinity = 'euclidean', linkage = 'ward')
y_predict = hc.fit_predict(X_train)

### Get accuracy score of the Kmeans clustering model after training

In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_predict)

0.6235955056179775

## Train the Random Forest classification model on the training set

In [15]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 50, criterion = 'entropy', random_state = 0,max_features=2)
clf = classifier.fit(X_train, y_train)

### Get accuracy score of the Random Forest clustering model after training

In [16]:
y_predict = clf.predict(X_train)
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_predict)

0.976123595505618

In [17]:
accuracy_score(y_test,clf.predict(X_test))
# Conclusion, model has high accuracy on the training data so it means that it is overfitted

0.8435754189944135

## Train the Naive Bayes classification model on the training set

In [18]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
clf = classifier.fit(X_train, y_train)

### Get accuracy score of the Naive Bayes classification model after training

In [19]:
y_predict = clf.predict(X_train)
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_predict)

0.7893258426966292

In [20]:
y_predict = clf.predict(X_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_predict)

0.7988826815642458

In [21]:
from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[91, 19],
       [17, 52]], dtype=int64)

# Generate Output CSV File

In [21]:
y_predict = clf.predict(test_data)
df = pd.DataFrame(y_predict,columns=['Survived'])
df.index = test_data.index
df.to_csv("output.csv")