# Titanic Challenge (Kaggle)

In [0]:
from google.colab import files
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [118]:
# Open the Colab upload dialog twice: once for the training CSV, once for the test CSV.
# NOTE(review): the captured output shows the files being saved as "train (1).csv" /
# "test (1).csv", so a later read of 'train.csv' / 'test.csv' may pick up an older
# upload rather than this one -- confirm which copy is intended.
train_upload = files.upload()
test_upload = files.upload()

Saving train.csv to train (1).csv


Saving test.csv to test (1).csv


In [0]:
# Column names applied to the CSVs; header=0 makes pandas discard each file's own header row.
train_columns = ['passengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
# The test file has the same layout minus the target column.
test_columns = [column for column in train_columns if column != 'Survived']

titanic_train = pd.read_csv('train.csv', sep=',', names=train_columns, header=0)
titanic_test = pd.read_csv('test.csv', sep=',', names=test_columns, header=0)

In [0]:
# Preview the first rows of the training data.
titanic_train.head()

In [0]:
# (rows, columns) of the training data.
titanic_train.shape

In [0]:
# Summary statistics of the training columns.
titanic_train.describe()

In [0]:
# Column names of the training frame as a plain list.
# NOTE(review): not referenced by any other cell visible in this notebook -- confirm it is still needed.
col_names = titanic_train.columns.tolist()

In [0]:
# Count the missing values per column in each dataset and print both reports,
# separated by a line holding a single space.
missing_train = titanic_train.isnull().sum()
missing_test = titanic_test.isnull().sum()

print(missing_train, " ", missing_test, sep="\n")

## Data Pre-processing

In [0]:
### Fill NaN Age data with the mean value

age_mean_train = titanic_train['Age'].mean()
age_mean_test = titanic_test['Age'].mean()

titanic_train['Age'] = titanic_train['Age'].fillna(age_mean_train)
titanic_test['Age'] = titanic_test['Age'].fillna(age_mean_test)

### Delete columns that are not used as features; Cabin is dropped because of its missing data
### (passengerId is kept in the test set: it is needed to label the predictions)

titanic_train = titanic_train.drop(columns=['passengerId','Name','Ticket','Cabin'])
titanic_test = titanic_test.drop(columns=['Name','Ticket','Cabin'])

### Convert categorical variables to integer codes
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a selected column: that form is a chained
# assignment and may not update the parent frame under pandas copy-on-write.

sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'Q': 0, 'S': 1, 'C': 2}

titanic_train['Sex'] = titanic_train['Sex'].map(sex_codes)
titanic_test['Sex'] = titanic_test['Sex'].map(sex_codes)

titanic_train['Embarked'] = titanic_train['Embarked'].map(embarked_codes)
titanic_test['Embarked'] = titanic_test['Embarked'].map(embarked_codes)

### Handle the remaining missing data

# Training rows that still have a missing value are discarded.
titanic_train = titanic_train.dropna(axis=0, how='any')

# Test rows must NOT be discarded: every passengerId needs a prediction in the
# submission. Remaining gaps are filled with statistics of the training set.
titanic_test['Fare'] = titanic_test['Fare'].fillna(titanic_train['Fare'].median())
titanic_test['Embarked'] = titanic_test['Embarked'].fillna(titanic_train['Embarked'].mode().iloc[0])

assert not titanic_test.isnull().values.any(), "test set still contains missing values"

### Save the preprocessed train and test dataframes to CSV

In [0]:
# Wrap the preprocessed frames and write each one to a semicolon-separated CSV,
# keeping the index and the header row.
df_titanic_train = pd.DataFrame(titanic_train)
df_titanic_test = pd.DataFrame(titanic_test)

csv_options = dict(index=True, header=True, sep=';', decimal='.')

df_titanic_train.to_csv('train_preprocessed.csv', **csv_options)
df_titanic_test.to_csv('test_preprocessed.csv', **csv_options)

## Split data for sklearn algorithms

In [0]:
### X axis will contain the entry data and y with Exit data

### X holds the input features and y the target (Survived)

# `columns=` is used because DataFrame.drop no longer accepts the axis as a
# positional argument in recent pandas versions.
X = titanic_train.drop(columns=['Survived']).to_numpy()
y = titanic_train['Survived'].to_numpy()

# A fixed random_state makes the split (and every score derived from it)
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

## Classification Methods

In [0]:
# Maps each classifier's display name to the score recorded for it below.
scoring_methods = dict()

In [0]:
# Show the scores recorded so far (empty until the classifier cells below have run).
print(scoring_methods)

### Logistic Regression

In [125]:
# Fit a logistic regression on the training split.
# max_iter is raised from the default because the features are not scaled and
# the solver may otherwise stop before converging.
log_regression = LogisticRegression(max_iter = 1000)
log_regression.fit(X_train,y_train)
y_prediction = log_regression.predict(X_test)

# Record the accuracy on the held-out split: scoring on the data the model was
# fitted on would overstate its performance and bias the model comparison.
scoring_methods['Logistic Regression'] = log_regression.score(X_test,y_test)



### Support Vector Machines

In [0]:
# Fit a support vector classifier (default settings) on the training split.
svc = SVC()
svc.fit(X_train,y_train)
y_prediction = svc.predict(X_test)

# Record the accuracy on the held-out split: scoring on the data the model was
# fitted on would overstate its performance and bias the model comparison.
scoring_methods['Support Vector Machines'] = svc.score(X_test,y_test)

### K-nearest Neighbors

In [0]:
def kscoring(kp):
  """Fit a k-nearest-neighbours classifier and return its held-out accuracy.

  Uses the notebook-level X_train / y_train split for fitting and
  X_test / y_test for scoring.

  Args:
    kp: number of neighbours (`n_neighbors`) for the classifier.

  Returns:
    Mean accuracy of the fitted classifier on (X_test, y_test).
  """
  knn = KNeighborsClassifier(n_neighbors = kp)
  knn.fit(X_train,y_train)

  # Score on the held-out split; accuracy on the training split would favour
  # small k and overstate the model's performance.
  return knn.score(X_test,y_test)

In [0]:
# Candidate neighbour counts: the odd values 3, 5, ..., 201.
k_possible = list(range(3, 203, 2))
k_score = [kscoring(ksc) for ksc in k_possible]

# Best score and the k that produced it (first occurrence if there are ties).
best_k_score = max(k_score)
best_k = k_possible[k_score.index(best_k_score)]

print("The highest score obtained by Knn is {kscore}, using a k value of {kvalue}".format(kscore = best_k_score, kvalue = best_k))

# NOTE(review): k is selected using the same split the score is reported on, so
# this figure is optimistic; a separate validation split or cross-validation
# would give a fairer comparison with the other classifiers.
scoring_methods['K-nearest neighbors'] = best_k_score

## Predictions using Support Vector Machines method

Support Vector Machines is the method chosen to produce the survival predictions, because it obtained the highest score, ahead of Logistic Regression and K-nearest Neighbors.

In [0]:
id_survival = titanic_test['passengerId']

# Take the feature columns by name, in the order of the training frame, so the
# prediction does not depend on the test frame happening to share that order.
feature_columns = titanic_train.columns.drop('Survived')

# The model was fitted on a plain array, so it is given a plain array here too.
svc_prediction = svc.predict(titanic_test[feature_columns].to_numpy())
svc_prediction_dataframe = pd.DataFrame({'passengerId':id_survival , 'Survived': svc_prediction})

svc_prediction_dataframe.shape

In [0]:
# Write the submission file in the layout Kaggle expects: comma-separated, a
# header row, and exactly two columns named PassengerId and Survived. The index
# is therefore not written and the id column is renamed on the way out.
svc_prediction_dataframe.rename(columns={'passengerId': 'PassengerId'}).to_csv('titanic_submissions.csv', index=False, header=True, sep=',')