<h1 style='text-align:center'>Titanic - Machine Learning from Disaster.</h1>
<br>

![](http://media.giphy.com/media/1Nk9bIidJVTy0/giphy.gif)

Objective: Predict whether a passenger survived, based on the other features.

In [None]:
### Importing Packages

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training set from data/train.csv
train=pd.read_csv('data/train.csv')
# Preview the first rows
train.head()

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame
train.info()

In [None]:
#### Share of missing values per column, as a whole-number percentage
(train.isnull().sum() / len(train) * 100).round(0)

In [None]:
# Discard the columns the model does not use. Cabin is discarded as well,
# since more than 75% of its values are missing.
unused_columns = ['Ticket', 'Name', 'Embarked', 'SibSp', 'Parch', 'Cabin']
train.drop(columns=unused_columns, inplace=True)
# PassengerId is an identifier, not a feature: make it the row index.
train.set_index('PassengerId', inplace=True)
train.head()

In [None]:
# Re-check dtypes and non-null counts after dropping columns and re-indexing
train.info()

In [None]:
### Class balance of the target, as whole-number percentages
(train['Survived'].value_counts(normalize=True) * 100).round(0)

In [None]:
# Summary statistics of the numeric columns
train.describe()

In [None]:
# Number of passengers with no recorded Age
train['Age'].isna().sum()

In [None]:
#### Median Age, kept in a variable so it can be used to fill missing ages
age_median = train['Age'].median()
age_median

In [None]:
def age_missing_fill(X):
    """Return the notebook-level ``age_median`` when ``X`` is NaN, else ``X`` unchanged.

    Intended for element-wise use (e.g. via ``Series.apply``) on the Age column.
    Relies on the global ``age_median`` being defined before it is called with NaN.
    """
    return age_median if np.isnan(X) else X

In [None]:
# Fill every missing Age with the stored median, element by element
train['Age'] = train['Age'].apply(age_missing_fill)

In [None]:
# Contingency table: passenger counts by Sex (rows) and Survived (columns)
pd.crosstab(train['Sex'], train['Survived'])

In [None]:
# Box plot of Fare to eyeball the spread and any extreme values
train.boxplot(column=['Fare'])
plt.show()

#### Detecting Outliers

In [None]:
# Tukey fences for Fare: anything beyond 1.5 * IQR from the quartiles is
# treated as an outlier in the cells that follow.
q25, q75 = np.percentile(train.Fare, [25, 75])
iqr = q75 - q25
lower_whisker = q25 - 1.5 * iqr
upper_whisker = q75 + 1.5 * iqr

In [None]:
def check(x, ul, ll):
    """Return ``x`` if it lies within ``[ll, ul]`` (both ends inclusive), else ``None``.

    Parameters
    ----------
    x  : value to test
    ul : upper limit
    ll : lower limit

    A NaN ``x`` fails both comparisons and therefore yields ``None``.
    """
    if not (ul >= x >= ll):
        return None
    return x

In [None]:
# A fare is an outlier when it falls outside [lower_whisker, upper_whisker];
# missing fares fail the range test and are counted as outliers too.
fare_is_outlier = ~train.Fare.between(lower_whisker, upper_whisker)
print("Percentage of Outliers in Fare:", int(fare_is_outlier.sum()) / len(train) * 100)

# Quiz 5) How should the outliers be handled here?

In [None]:
# Summary statistics of Fare before any outlier treatment
train['Fare'].describe()

In [None]:
# Average Fare within each passenger class
train.groupby('Pclass')['Fare'].mean()

In [None]:
def transform_fare(x):
    """Cap ``x`` at the notebook-level ``upper_whisker``.

    Values above ``upper_whisker`` are replaced by it; everything else
    (including NaN, which never compares greater) is returned unchanged.
    """
    return upper_whisker if x > upper_whisker else x

# Cap fares above the IQR upper whisker (only the right tail is altered).
# transform_fare is passed directly; wrapping it in a lambda added nothing.
train['Fare'] = train['Fare'].apply(transform_fare)
# The title now describes what is plotted: the capped Fare distribution
# (the previous title referred to an unrelated "Trip duration" variable).
train['Fare'].plot.hist(bins=100, title='Frequency distribution of Fare after capping at the upper whisker');

In [None]:
# Summary statistics of Fare after capping, for comparison with the earlier summary
train['Fare'].describe()

#### Heat Map

In [None]:
# Correlate numeric columns only. The frame has not been one-hot encoded yet,
# so non-numeric columns (presumably Sex) are still present, and
# DataFrame.corr() raises on them in pandas >= 2.0. select_dtypes keeps this
# working on older pandas versions as well.
plt.subplots(figsize = (15,8))
sns.heatmap(train.select_dtypes(include=np.number).corr(), annot=True,cmap="PiYG")
plt.title("Correlations Among Features", fontsize = 20);

In [None]:
# One-hot encode the remaining non-numeric columns (presumably just Sex at this point)
train = pd.get_dummies(train)

In [None]:
# Inspect the encoded frame and its column order
train.head()

In [None]:
# Select the target by name instead of by position, so the feature/target
# split cannot silently pick the wrong column if the column order changes.
X = train.drop(columns='Survived').values
y = train['Survived'].values

In [None]:
# Hold out part of the data for evaluation; random_state fixes the shuffle so the
# split is reproducible. No test_size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 45)

In [None]:
# Report the dimensions of each piece of the split
for label, part in (
    ("Shape of X_train", X_train),
    ("Shape of X_test", X_test),
    ("Shape of y_train", y_train),
    ("Shape of y_test", y_test),
):
    print(label, part.shape)

In [None]:
# Fit a default logistic regression on the training split (fit returns the
# estimator itself), then predict labels for the held-out split.
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [None]:
# Fraction of held-out passengers whose label was predicted correctly
print("Accuracy of the model:", accuracy_score(y_test, y_pred))

In [None]:
# Count of each class label in the held-out target, as context for the accuracy figure
np.bincount(y_test)

In [None]:
# Confusion matrix: rows are the true classes, columns are the predicted classes
confusion_matrix(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 on the held-out split
print(classification_report(y_test, y_pred))