In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the plotting and
# data libraries used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
# Apply seaborn's 'whitegrid' theme to the figures drawn in later cells.
sns.set_style(style='whitegrid')

In [2]:
# Load the training split and tag every row with its origin so the combined
# frame can be separated again after cleaning.
train = pd.read_csv('train.csv').assign(source='train')

In [3]:
# Load the test split and tag every row with its origin so the combined
# frame can be separated again after cleaning.
test = pd.read_csv('test.csv').assign(source='test')

In [4]:
# One-column DataFrame holding the PassengerId values of the test split.
# NOTE(review): the variable name is misspelt ('passngerID'); it is left
# unchanged here in case later cells reference it.
passngerID = test[['PassengerId']]

In [5]:
# Stack train and test so cleaning and encoding are applied to both at once.
# ignore_index=True gives the combined frame a fresh, unique 0..n-1 index;
# without it the test rows reuse the training row labels, which makes
# label-based selection and assignment on `data` ambiguous.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

# Data Exploration

In [None]:
# Preview the first five rows of the combined train + test frame.
data.head(n=5)

### Visualizing null values.

In [None]:
# Draw the boolean "is missing" mask as an image: one column per feature,
# one row per passenger, with row labels and the colour bar hidden.
missing_mask = data.isna()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

- Fare column has only one null value.<br/>
- Age column has many null values.<br/>
- Cabin column has a majority of null values.<br/>
- Survived column has null values for the test data.

In [None]:
# Print column dtypes and non-null counts for the combined frame.
data.info()

### Is data balanced?

In [None]:
# Count of passengers per Survived value (class balance of the target).
sns.countplot(x='Survived', data=data)

### Which gender had more survivors?

In [None]:
# Survival counts split by sex.
sns.countplot(data=data, x='Survived', hue='Sex')
# Anchor the legend at axes coordinates (1.1, 0.9), i.e. to the right of the
# plot area. The trailing semicolon suppresses the echoed return value; the
# original trailing comma turned the cell's value into a 1-tuple instead.
plt.legend(loc=(1.1, 0.9));

### Does first class have a higher survival rate?

In [None]:
# Survival counts split by passenger class.
sns.countplot(x='Survived', hue='Pclass', data=data)

### The distribution of passengers' age.

In [None]:
# Histogram of the known ages using 35 bins.
# sns.distplot is deprecated in seaborn; sns.histplot is its replacement for a
# plain count histogram (the original call had the KDE curve disabled).
known_ages = data['Age'].dropna()
sns.histplot(known_ages, bins=35)

### The distribution of number of siblings.

In [None]:
# Number of passengers for each SibSp value.
sns.countplot(data=data, x='SibSp')

### Number of passengers in each class.

In [None]:
# Count passengers per class using every row.
# The original plotted data.dropna(), which removes any row that has a missing
# value in *any* column, so the bars covered only the rows with no missing
# value at all rather than all passengers.
sns.countplot(data=data, x='Pclass')

### Proportion of each gender in different classes.

In [None]:
# Passenger counts per class, split by sex.
sns.countplot(x='Pclass', hue='Sex', data=data)

### Ticket fare for each class.

In [None]:
# Fare distribution per passenger class.
# Only rows lacking a Fare are excluded; a bare data.dropna() would also throw
# away every row with a missing value in an unrelated column, leaving only a
# fraction of the passengers in the plot.
fare_known = data.dropna(subset=['Fare'])
sns.boxplot(data=fare_known, x='Pclass', y='Fare')

In [None]:
# Summary statistics for the numeric columns of the combined frame.
data.describe()

# Data cleaning

### Fill missing values in Age with the median age for the corresponding class

In [8]:
# Per-class age lookup table: index is Pclass, single column 'Age'.
# Despite the variable name, the statistic stored here is the *median* age.
class_mean_age = data.groupby('Pclass')[['Age']].median()

In [9]:
# Boolean mask marking the rows whose Age is missing.
null_age = data['Age'].isna()

In [10]:
# Fill each missing Age with the per-class value stored in class_mean_age.
# Mapping Pclass through the 'Age' column of the lookup table yields one scalar
# per row (the original lambda returned a one-element Series per row, which
# made apply() build a DataFrame). `.values` assigns positionally, so the
# result does not depend on index alignment between the two sides.
data.loc[null_age, 'Age'] = data.loc[null_age, 'Pclass'].map(class_mean_age['Age']).values

In [11]:
# Number of Age values still missing after imputation.
data['Age'].isna().sum()

0

### Fill the missing value in Fare with the mean fare for the corresponding class.

In [12]:
# Per-class mean fare lookup table: index is Pclass, single column 'Fare'.
class_mean_fare = data.groupby('Pclass')[['Fare']].mean()

In [13]:
# Boolean mask marking the rows whose Fare is missing.
null_fare = data['Fare'].isna()

In [14]:
# Fill each missing Fare with the mean fare of the passenger's class.
# Mapping Pclass through the 'Fare' column of the lookup table yields one
# scalar per row (the original lambda returned a one-element Series per row,
# which made apply() build a DataFrame). `.values` assigns positionally, so the
# result does not depend on index alignment between the two sides.
data.loc[null_fare, 'Fare'] = data.loc[null_fare, 'Pclass'].map(class_mean_fare['Fare']).values

In [15]:
# Number of Fare values still missing after imputation.
data['Fare'].isna().sum()

0

### Fill the missing values in Embarked with the most common port for corresponding class.

In [16]:
# Most frequent embarkation port within each passenger class
# (Series indexed by Pclass).
embarked_mode_by_class = data.groupby('Pclass')['Embarked'].agg(
    lambda ports: ports.mode().iloc[0]
)
# Rows that have no embarkation port recorded.
missing_embarked = data['Embarked'].isnull()
# Look up the class-specific port for those rows instead of hard-coding 'S'.
# `.values` assigns positionally, so the result does not depend on index
# alignment between the two sides.
data.loc[missing_embarked, 'Embarked'] = (
    data.loc[missing_embarked, 'Pclass'].map(embarked_mode_by_class).values
)

In [17]:
# Number of Embarked values still missing after imputation.
data['Embarked'].isna().sum()

0

# Feature Engineering

### Create a new feature with the title of each passenger.

In [18]:
def _extract_title(full_name):
    """Return the text between ', ' and the first '.' of a passenger name."""
    start = full_name.find(',') + 2  # skip the comma and the character after it
    stop = full_name.find('.')
    return full_name[start:stop]


# Derive the Title column from every passenger's Name.
data['Title'] = data['Name'].apply(_extract_title)

### Drop unused columns from data.

In [19]:
# Remove the free-text / sparse columns that are not used as features.
unused_columns = ['Name', 'Ticket', 'Cabin']
data = data.drop(columns=unused_columns)

### Convert categorical features (Pclass, Embarked, Sex, Title) to numerical features and drop one dummy variable for each.

In [20]:
# One-hot encode the categorical columns, dropping the first level of each so
# that the indicator columns of a feature are not redundant.
# dtype is pinned so the indicators are numeric 0/1 regardless of the pandas
# version in use (recent releases default to boolean True/False, which would
# also be written as True/False in the exported CSV files).
data = pd.get_dummies(
    data,
    columns=['Pclass', 'Embarked', 'Sex', 'Title'],
    drop_first=True,
    dtype='uint8',
)

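## Predicting missing Age values using Random Forest Regressor (disabled: the cells in this section are commented out; Age is imputed with the per-class median above).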

In [None]:
# train_age = data.loc[data.Age.isnull() == False].reset_index(drop =True)
# test_age = data.loc[data.Age.isnull()].drop(columns = ['Age']).reset_index(drop =True)

### Normalizing 'Fare' feature values.

In [None]:
# sc = MinMaxScaler()

In [None]:
# train_age[['Fare','SibSp','Parch']] = sc.fit_transform(train_age[['Fare','SibSp','Parch']])
# test_age[['Fare','SibSp','Parch']] = sc.transform(test_age[['Fare','SibSp','Parch']])

In [None]:
# features = train_age.drop(columns=['PassengerId','Survived','Age','source'])
# response = train_age.Age

In [None]:
# rf = RandomForestRegressor(n_estimators= 1000)

In [None]:
# rf.fit(features,response)

In [None]:
# age_pred = pd.DataFrame(rf.predict(test_age.drop(columns=['PassengerId','Survived','source'])),dtype='int64' ,columns=['Age'])

In [None]:
# test_age_filled = pd.concat([age_pred, test_age], axis=1)

In [None]:
# filled_age = pd.concat([train_age, test_age_filled], axis=0, ignore_index=True)

In [None]:
# filled_age[['Fare','SibSp','Parch']] = sc.inverse_transform(filled_age[['Fare','SibSp','Parch']])

In [None]:
# data = filled_age

## Splitting Data back to train/test sets.

In [28]:
# Separate the combined frame again using the 'source' tag added at load time.
is_train = data['source'] == 'train'
is_test = data['source'] == 'test'

# Training rows: keep the Survived target, drop the identifier and the tag.
train = data.loc[is_train].drop(columns=['PassengerId', 'source']).reset_index(drop=True)
# Test rows: keep PassengerId, drop the tag and the Survived column.
test = data.loc[is_test].drop(columns=['source', 'Survived']).reset_index(drop=True)

In [29]:
# Store the training target as 64-bit integers.
train = train.astype({'Survived': 'int64'})

### Normalizing numeric features (Age, SibSp, Parch, and Fare).

In [30]:
# Numeric columns that are rescaled with MinMaxScaler in the next cell.
feature_to_normalize = [
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

In [31]:
# Fit one MinMaxScaler per feature on the training data only, then apply the
# same fitted scaler to both splits. The scalers are kept in `norm`, keyed by
# column name.
norm = {}
for feature in feature_to_normalize:
    scaler = MinMaxScaler().fit(train[[feature]])
    train[feature] = scaler.transform(train[[feature]])
    test[feature] = scaler.transform(test[[feature]])
    norm[feature] = scaler

In [None]:
# Display the mapping of column name -> fitted scaler built in the previous cell.
norm

### Exporting modified train/test data to external file.

In [32]:
# Write the processed train and test frames to CSV without the row index.
exports = (
    (train, 'train_modified.csv'),
    (test, 'test_modified.csv'),
)
for frame, path in exports:
    frame.to_csv(path, index=False)