# Titanic Dataset Problem

In [None]:
### import main packages

import pandas as pd
import numpy as np
from scipy.stats import skew

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the train/test CSVs from the working directory and report their (rows, columns).
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df.shape, test_df.shape, sep='\n')

In [None]:
# Summary statistics of the training frame (describe() reports numeric columns by default).
train_df.describe()

In [None]:
# Same summary for the test frame, for side-by-side comparison with the training frame.
test_df.describe()

##### Fare has a lot of outliers

In [None]:
# Class balance of the target.
# The column is named via x=/data= keywords: from seaborn 0.12 on, the first positional
# argument is interpreted as `data`, not as the variable to count.
sns.countplot(x='Survived', data=train_df)
plt.title('Target Count')
plt.xlabel('Survived')
plt.ylabel('Frequency of each target category')

In [None]:
# One histogram per numeric column of the training frame.
train_df.hist(figsize=(20,10))

In [None]:
### percentage of missing values per training column
train_df.isnull().sum().div(train_df.shape[0]).mul(100).round(2)

In [None]:
### percentage of missing values per test column
# Normalise by the test frame's own row count; dividing by the training row count
# (as before) misreports the share whenever the two frames differ in size.
round(test_df.isnull().sum()/test_df.shape[0]*100, 2)

#### Can drop 'Cabin' because it has missing values >= 70%
'Age' will be handled by filling with the mean; 'Embarked' is categorical, so it is filled with its most frequent value instead

In [None]:
# Impute with statistics learned from the training data only, so both frames receive the
# same fill values and the test distribution does not influence preprocessing.
age_fill = train_df['Age'].mean()
fare_fill = train_df['Fare'].mean()

train_df['Age'] = train_df['Age'].fillna(age_fill)
test_df['Age'] = test_df['Age'].fillna(age_fill)
test_df['Fare'] = test_df['Fare'].fillna(fare_fill)

In [None]:
# Frequency of each embarkation port.
# Keyword form: from seaborn 0.12 on, the first positional argument is interpreted as `data`.
sns.countplot(x='Embarked', data=train_df)

In [None]:
# Fill missing ports with the most frequent port rather than a hard-coded 'S'.
# mode() ignores NaN; [0] takes the first value should several categories tie.
# (The former bare `.unique()` call was not the cell's last expression, so its
# result was never displayed; it has been removed.)
embarked_mode = train_df['Embarked'].mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_mode)

In [None]:
# Remaining missing-value count per training column after imputation.
train_df.isnull().sum()

In [None]:
# Remaining missing-value count per test column after imputation.
test_df.isnull().sum()

In [None]:
# Drop the 'Cabin' column from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell re-runnable:
# a second execution no longer raises KeyError for the already-missing column.
train_df = train_df.drop(columns=['Cabin'], errors='ignore')
test_df = test_df.drop(columns=['Cabin'], errors='ignore')

# Normality Check

In [None]:
from scipy.stats import skew

In [None]:
# Split column names by dtype.
# `np.object` was only an alias of the builtin `object`; it was deprecated in NumPy 1.20
# and removed in 1.24, so the builtin is used directly.
numerical_features   = train_df.select_dtypes(include=np.number).columns
categorical_features = train_df.select_dtypes(include=object).columns
print(numerical_features)
print(categorical_features)

In [None]:
# Sample skewness of each numeric column, returned as an array in `numerical_features` order;
# nan_policy='omit' ignores missing values.
skew(train_df[numerical_features], nan_policy='omit') ## find out threshold for skewness

In [None]:
# Distribution of the raw fare values.
sns.distplot(train_df['Fare'])

In [None]:
# Distribution of the fare after log1p, i.e. log(1 + x) applied element-wise.
log_fare = np.log1p(train_df['Fare'])
sns.distplot(log_fare)

In [None]:
# Replace Fare with log(1 + Fare) in both frames.
# NOTE: the column is overwritten, so running this cell twice applies the transform twice.
train_df['Fare'] = np.log1p(train_df['Fare'])
test_df['Fare'] = np.log1p(test_df['Fare'])

In [None]:
# Age looks reasonably balanced, so no normality transform is applied to it.
sns.distplot(train_df['Age'])

In [None]:
# Age distribution in the test frame, for comparison with the training frame.
sns.distplot(test_df['Age'])

categorical_features 
Index(['Name', 'Sex', 'Ticket', 'Embarked'], dtype='object')

 - drop 'Name' and 'Ticket' because they are nominal text features
 - 'Sex' - encode it: replace male with '1' and female with '0'
 - 'Embarked' - needs encoding as well, but how?
 
 
 Types of encoding
   - Label encoding Ex: {'excellent':3, 'good':2, 'bad':1}
   - One hot encoding (OHE) Ex: two categories fit in a single 0/1 column, {'male':1, 'female':0}; three categories get one indicator bit each, {'male':001, 'female':010, 'other':100}

In [None]:
# Drop the free-text 'Name' and 'Ticket' columns from both frames.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell re-runnable:
# a second execution no longer raises KeyError for the already-missing columns.
train_df = train_df.drop(columns=['Name', 'Ticket'], errors='ignore')
test_df = test_df.drop(columns=['Name', 'Ticket'], errors='ignore')

In [None]:
# Encode Sex as 1 (male) / 0 (female).
# Values that are not keys of the mapping pass through unchanged, so re-running the cell
# keeps the existing 0/1 codes; the previous `.map(dict)` turned them into NaN on a
# second run. astype(int) fails loudly on anything that is not 'male'/'female'/0/1
# (including NaN) rather than letting it flow into the model inputs.
sex_codes = {'male': 1, 'female': 0}
train_df['Sex'] = train_df['Sex'].map(lambda s: sex_codes.get(s, s)).astype(int)
test_df['Sex'] = test_df['Sex'].map(lambda s: sex_codes.get(s, s)).astype(int)

In [None]:
# Distinct Embarked values in the training frame (the categories to be one-hot encoded).
train_df.Embarked.unique()

In [None]:
### OHE using pandas get_dummies
ohe_columns = ['Pclass', 'Embarked']
train_df_ = pd.get_dummies(train_df, columns=ohe_columns)
test_df_  = pd.get_dummies(test_df, columns=ohe_columns)

# Each frame is encoded on its own, so a category present in only one of them would give
# the two frames different dummy columns. Align the test frame to the training layout:
# categories unseen in test become all-zero columns, categories unseen in train are dropped.
feature_columns = [col for col in train_df_.columns if col != 'Survived']
test_df_ = test_df_.reindex(columns=feature_columns, fill_value=0)

In [None]:
# Preview the one-hot encoded training frame.
train_df_.head()

In [None]:
# Preview the one-hot encoded test frame.
test_df_.head()

### Bivariate analysis
- Correlation check/multi-collinearity check
- Pair-plot
- Scatter plot
- Outliers handling

In [None]:
### correlation
# Restrict to numeric columns before calling corr(): `train_df` still carries the string
# column 'Embarked', and since pandas 2.0 DataFrame.corr() raises on non-numeric data
# instead of silently skipping it.
plt.figure(figsize=(9,5))
numeric_train = train_df.drop('PassengerId', axis = 1).select_dtypes(include=np.number)
sns.heatmap(numeric_train.corr(), annot=True)

###  - Fare & Pclass have a comparatively high correlation
  - drop??

#####  What to do if two features are strongly correlated?
  - remove one of the features

 Is it good if any feature is strongly correlated with target variable?
 yes

 What if among features A, B, ....Z, A is correlated with B, B is correlated with E, E is correlated with F, 
 F is correlated with A ?
 - multi-collinearity
 - handling through VIF check ; if VIF > 5, remove that feature

In [None]:
# Indicator columns for the age bands (.., 18], (18, 40], (40, 60] and (60, ..).
# The 40-60 band previously tested `x < 40`, which flagged every passenger younger
# than 40 instead of those between 40 and 60.
train_age = train_df_['Age']
train_df_['age_0_18']  = (train_age <= 18).astype(int)
train_df_['age_18_40'] = ((train_age > 18) & (train_age <= 40)).astype(int)
train_df_['age_40_60'] = ((train_age > 40) & (train_age <= 60)).astype(int)
train_df_['age_gr_60'] = (train_age > 60).astype(int)

In [None]:
# Indicator columns for the age bands (.., 18], (18, 40], (40, 60] and (60, ..).
# The 40-60 band previously tested `x < 40`, which flagged every passenger younger
# than 40 instead of those between 40 and 60.
test_age = test_df_['Age']
test_df_['age_0_18']  = (test_age <= 18).astype(int)
test_df_['age_18_40'] = ((test_age > 18) & (test_age <= 40)).astype(int)
test_df_['age_40_60'] = ((test_age > 40) & (test_age <= 60)).astype(int)
test_df_['age_gr_60'] = (test_age > 60).astype(int)

In [None]:
# Check the training frame now that the age-band indicator columns exist.
train_df_.head()

In [None]:
# Check the test frame now that the age-band indicator columns exist.
test_df_.head(5)

In [None]:
from sklearn.linear_model import RidgeClassifierCV, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

In [None]:
# Last look at the training frame before the feature matrix is built from it.
train_df_.head()

In [None]:
# Last look at the test frame before the feature matrix is built from it.
test_df_.head()

In [None]:
### separate the model inputs from the target
# PassengerId is an identifier; raw Age and Fare are excluded from the model inputs here.
dropped_features = ['PassengerId', 'Age', 'Fare']

X = train_df_.drop(columns=dropped_features + ['Survived'])
y = train_df_['Survived']

X_test_ = test_df_.drop(columns=dropped_features)

In [None]:
# Shapes of the training and submission feature matrices, one per line.
print(X.shape, X_test_.shape, sep='\n')

In [None]:
## scaling
# Standardise every feature using the mean/std estimated on the full training matrix X.
# NOTE(review): the scaler is fitted before the train/validation split, so validation rows
# contribute to the scaling statistics.
# NOTE: X_test_ is rebound to its scaled array, so re-running this cell alone scales it twice.
scale = StandardScaler().fit(X)
X_ = scale.transform(X)
X_test_ = scale.transform(X_test_)

In [None]:
### hold out 20% of the scaled training rows as a validation set (stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(
    X_, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Import from the public package: the private module sklearn.ensemble.gradient_boosting
# was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
def find_score(clf, X, y, scoring='accuracy'):
    """Return the mean 5-fold cross-validation score of `clf` on (X, y).

    Parameters
    ----------
    clf : estimator passed through to `cross_val_score`.
    X, y : feature matrix and target used for cross-validation.
    scoring : scoring name forwarded to `cross_val_score` (default 'accuracy').
    """
    fold_scores = cross_val_score(clf, X, y, cv=5, scoring=scoring)
    return fold_scores.mean()

In [None]:
# Baseline comparison: mean 5-fold CV accuracy of each candidate on the training split.
# The tree ensembles are seeded (matching the random_state=42 used for the split and the
# CV splitter elsewhere in this notebook) so the reported scores are reproducible.
logreg = LogisticRegression()
logreg_cv = LogisticRegressionCV()
rf = RandomForestClassifier(random_state=42)
gboost = GradientBoostingClassifier(random_state=42)

models = [logreg, logreg_cv, rf, gboost]

for model in models:
    print('Cross-validation of : {0}'.format(model.__class__))
    score = find_score(clf=model, X=X_train, y=y_train, scoring='accuracy')
    print('CV score = {0}'.format(score))
    print('****')

In [None]:
# Randomized hyper-parameter search for the random forest on the training split.
# Both the forest and the search are seeded so best_params_/best_score_ are reproducible.
model = RandomForestClassifier(random_state=42)
n_estimators = [10, 50, 100, 200]
max_depth = [4, 6, 8, 10]
# 'auto' was dropped from the candidates: for classifiers it was an alias of 'sqrt',
# and scikit-learn >= 1.3 rejects it as an invalid parameter value.
max_features = ['sqrt', 'log2']
min_samples_split = [2, 3, 5, 10]
min_samples_leaf = [3, 5, 10]
bootstrap = [True, False]
grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=grid,
    n_jobs=-1,
    cv=cv,
    scoring='accuracy',
    random_state=42,
)
random_result = random_search.fit(X_train, y_train)

In [None]:
# Best sampled parameter combination and its mean cross-validated accuracy, one per line.
print(random_result.best_params_, random_result.best_score_, sep='\n')

In [None]:
# Hard class predictions from the search result on the training and validation splits
# (`X_test` here is the validation split, not the submission data).
train_prediction = random_result.predict(X_train)
test_prediction = random_result.predict(X_test)

In [None]:
###  Model evaluation
# ROC AUC measures how well the model ranks positives above negatives, so it must be fed
# continuous scores. Passing hard 0/1 predictions (as before) collapses the ROC curve to a
# single operating point. Column 1 of predict_proba is the probability of class 1.
train_scores = random_result.predict_proba(X_train)[:, 1]
valid_scores = random_result.predict_proba(X_test)[:, 1]
print('AUC on training set: {}'.format(roc_auc_score(y_train, train_scores)))
print('AUC on validation set: {}'.format(roc_auc_score(y_test, valid_scores)))

In [None]:
# Predict on the scaled submission features, attach the labels to the test frame and
# write the PassengerId/Survived pair to the submission CSV.
prediction = random_result.predict(X_test_)
test_df_['Survived'] = prediction
submission = test_df_.loc[:, ['PassengerId', 'Survived']]
submission.to_csv('submission_rfcv_2.csv', index=False)