# Titanic Challenge (Kaggle)
Predict survival on the Titanic and get familiar with ML.



## Importing the required libraries

In [0]:
# Upload files from the computer to Colab
from google.colab import files

import pandas as pd
import numpy as np

# Import classification modules from ScikitLearn library
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

## Uploading and reading the given datasets *( train.csv and test.csv )*

In [0]:
# Two separate upload prompts: the first result is kept as the train upload,
# the second as the test upload. The next cell reads 'train.csv' and 'test.csv',
# so the uploaded files are expected to carry those names.
train_upload = files.upload()
test_upload = files.upload()

In [0]:
# Column names applied to both files; header=0 discards each file's own header row
# so these names replace it. The test file has the same columns minus 'Survived'.
train_columns = ['passengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']
test_columns = [column for column in train_columns if column != 'Survived']

titanic_train = pd.read_csv('train.csv', sep = ',', names = train_columns, header = 0)
titanic_test = pd.read_csv('test.csv', sep = ',', names = test_columns, header = 0)

## Data description

In [0]:
# Dataset dimensions (number of rows, number of columns)
titanic_train.shape

In [0]:
# First rows of the training dataset (head() called with its default row count)
titanic_train.head()

In [0]:
# Descriptive statistics of the training dataset (describe() with default arguments)
titanic_train.describe()

In [0]:
# Number of missing values per column: training set first, then the test set,
# separated by a line holding a single space.
print(titanic_train.isnull().sum(), " ", titanic_test.isnull().sum(), sep = "\n")

## Data Pre-processing

In [0]:
### Fill NaN Age data with the mean value

# Each dataset is filled with its own mean age.
age_mean_train = titanic_train['Age'].mean()
age_mean_test = titanic_test['Age'].mean()

titanic_train['Age'] = titanic_train['Age'].replace(np.nan,age_mean_train)
titanic_test['Age'] = titanic_test['Age'].replace(np.nan,age_mean_test)

### Delete unused columns, and Cabin because of its missing data

# passengerId is kept in the test set: it is needed later to label the predictions.
titanic_train = titanic_train.drop(['passengerId','Name','Ticket','Cabin','Fare'], axis = 1)
titanic_test = titanic_test.drop(['Name','Ticket','Cabin','Fare'], axis = 1)

### Convert categorical variables to integer codes

# The result is assigned back to the column. Calling replace(..., inplace=True) on a
# column selection is a chained assignment: it is deprecated in pandas 2.x and does
# not modify the parent DataFrame when Copy-on-Write is enabled.
sex_codes = {'female': 0, 'male': 1}
embarked_codes = {'Q': 0, 'S': 1, 'C': 2}

titanic_train['Sex'] = titanic_train['Sex'].replace(sex_codes)
titanic_test['Sex'] = titanic_test['Sex'].replace(sex_codes)

titanic_train['Embarked'] = titanic_train['Embarked'].replace(embarked_codes)
titanic_test['Embarked'] = titanic_test['Embarked'].replace(embarked_codes)

### Delete rows with missing data

# NOTE(review): any test row removed here will have no prediction in the final
# output, since the predictions are built from titanic_test — confirm none are dropped.
titanic_train = titanic_train.dropna(axis = 0, how = 'any')
titanic_test = titanic_test.dropna(axis = 0, how = 'any')

### Save the preprocessed train and test dataframes as CSV files

In [0]:
# DataFrame objects built from the preprocessed train and test data
df_titanic_train = pd.DataFrame(data = titanic_train)
df_titanic_test = pd.DataFrame(data = titanic_test)

# Both files share one layout: index column and header row included,
# ';' as field separator and '.' as decimal mark.
csv_layout = dict(index = True, header = True, sep = ';', decimal = '.')

df_titanic_train.to_csv('train_preprocessed.csv', **csv_layout)
df_titanic_test.to_csv('test_preprocessed.csv', **csv_layout)

## Split data for sklearn algorithms

In [0]:
## X holds the input features and y the target values (the Survived column)

# `columns=` is used because pandas 2.0 no longer accepts `axis` as a positional
# argument of drop(); the previous drop(['Survived'], 1) raises TypeError there.
X = np.array(titanic_train.drop(columns = ['Survived']))
y = np.array(titanic_train['Survived'])

# train_test_split returns the four pieces in this order: X_train, X_test, y_train, y_test.
# 25% of the training data is held out as the test sample and 75% is used to fit the models;
# random_state fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state = 12)

## Classification Methods

In [0]:
# Dictionary collecting the score of each classification method, keyed by method name
scoring_methods = dict()

### Logistic Regression

In [0]:
# Logistic regression with default settings, fitted on the training split
# (fit() returns the estimator itself, so it can be chained).
log_regression = LogisticRegression().fit(X_train,y_train)
y_prediction = log_regression.predict(X_test)

# Score on the held-out split, stored for the comparison between methods
scoring_methods['Logistic Regression'] = log_regression.score(X_test,y_test)

### Support Vector Machines

In [0]:
# Base estimator whose hyper-parameters are tuned by the grid search
svc = SVC()

# Candidate values explored by the search: the C parameter and the kernel type
parameters = dict(
    C = [0.001,0.01,0.1,1,10,100],
    kernel = ('linear','rbf'),
)

# 5-fold cross-validated grid search fitted on the training split
# (fit() returns the search object itself, so it can be chained).
clf = GridSearchCV(svc, parameters, cv=5).fit(X_train,y_train)

# Score on the held-out split, stored for the comparison between methods
scoring_methods['Support Vector Machines'] = clf.score(X_test,y_test)

In [0]:
# Estimator selected by the grid search
clf.best_estimator_

In [0]:
# get_params must be called: without the parentheses the cell only displays the
# bound method object instead of the parameter dictionary.
# `svc` is the base estimator handed to GridSearchCV, so these are its initial
# settings rather than the ones selected by the search.
svc.get_params()

In [0]:
# Score of the fitted grid search on the training split, to compare with its score on the test split
print(clf.score(X_train,y_train))

In [0]:
# Scores collected so far, keyed by method name
scoring_methods

### K-nearest Neighbors

In [0]:
# Helper used to try several k values and find the one that gives the highest score
def kscoring(kp):
  """Fit a k-nearest-neighbours classifier and return its score on the test split.

  The classifier is trained on the notebook-level ``X_train`` / ``y_train``
  and evaluated on ``X_test`` / ``y_test``.

  Parameters
  ----------
  kp : int
      Number of neighbours passed to ``KNeighborsClassifier``.

  Returns
  -------
  The value of ``knn.score(X_test, y_test)`` for the fitted classifier.
  """
  knn = KNeighborsClassifier(n_neighbors = kp)
  knn.fit(X_train,y_train)

  # score() runs the prediction on X_test itself, so the separate predict() call
  # whose result was never used has been removed.
  return knn.score(X_test,y_test)

In [0]:
# Odd k values from 3 to 101 to try as the number of neighbours
k_possible = list(range(3, 103, 2))

# Score every candidate, keeping the scores in the same order as k_possible
k_score = []
for ksc in k_possible:
  k_score.append(kscoring(ksc))

# index() returns the first occurrence, so on a tie the smallest k is reported
best_k_score = max(k_score)
best_k_value = k_possible[k_score.index(best_k_score)]

print("The highest score obtained by Knn is {kscore}, using a k value of {kvalue}".format(kscore = best_k_score, kvalue = best_k_value))

scoring_methods['K-nearest neighbors'] = best_k_score

In [0]:
# Show the scores of all the classification methods used
print(scoring_methods)

## Predictions using Support Vector Machines method

Chosen method to get the survival predictions. This is the method with the highest test score.

In [0]:
# Passenger ids, kept so each prediction can be matched to its passenger
id_survival = titanic_test['passengerId']

# NOTE(review): the section heading names Support Vector Machines, but the
# predictions below come from the logistic regression model — confirm which
# model is intended.
# The model was fitted on a plain NumPy array, so the test features are converted
# the same way; passing a DataFrame here makes scikit-learn warn about feature
# names that were not seen during fit.
test_features = np.array(titanic_test.drop('passengerId', axis = 1))

log_regression_prediction = log_regression.predict(test_features)
log_regression_prediction_dataframe = pd.DataFrame({'passengerId':id_survival , 'Survived': log_regression_prediction})

# Dimensions of the prediction table (rows, columns)
log_regression_prediction_dataframe.shape

In [0]:
# Write the predictions to the submission CSV file.
# The file is written in the layout Kaggle expects for this competition:
# comma-separated, no index column, and a header row reading PassengerId,Survived.
# `columns` fixes the column order and `header` supplies the output names for it.
log_regression_prediction_dataframe.to_csv(
    'titanic_submissions.csv',
    columns=['passengerId', 'Survived'],
    header=['PassengerId', 'Survived'],
    index=False,
    sep=',',
    decimal='.',
)