In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the Titanic training CSV directly from its GitHub-hosted URL.
data = pd.read_csv(
    'https://raw.githubusercontent.com/insaid2018/Term-1/master/Data/Casestudy/titanic_train.csv'
)
# Show the first rows as a quick sanity check of the load.
data.head()

In [None]:
# Fill missing Embarked values with the most frequent port and missing Age
# values with the median age, then discard the Cabin column.
# NOTE(review): the mode and median are computed on the full frame, i.e. before
# the train/test split performed further down - confirm that is acceptable.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Age'] = data['Age'].fillna(data['Age'].median())
# Rebind instead of using inplace=True, and tolerate an already-missing column,
# so that re-running this cell does not raise KeyError.
data = data.drop(columns='Cabin', errors='ignore')

In [None]:
# Summarise column dtypes and non-null counts of `data`.
data.info()

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Feature frame: everything except the identifier, the target and the two
# free-text columns.
X = data.drop(columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
X.head()

In [None]:
# Target vector: the Survived column.
y = data.loc[:, 'Survived']
y.head()

In [None]:
# One-hot encode the two categorical columns; every other column passes through unchanged.
X = pd.get_dummies(X, columns=['Sex', 'Embarked'])
X.head()

In [None]:
# Standardise the two continuous columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test-row statistics influence the scaling - confirm this is
# acceptable or fit on the training rows only.
scaler = StandardScaler()

scaled = scaler.fit_transform(data[['Age', 'Fare']])
# fit_transform returns a bare array; reuse data's index so that a later
# column-wise concat aligns on row labels instead of relying on both frames
# happening to share a default RangeIndex.
data2 = pd.DataFrame(data=scaled, columns=['Age', 'Fare'], index=data.index)
data2.head()

In [None]:
# Swap the raw Age/Fare columns for their standardised versions.
# Rebinding with errors='ignore' (rather than an in-place drop) keeps the cell
# re-runnable: a second execution no longer raises KeyError.
X = X.drop(columns=['Age', 'Fare'], errors='ignore')

X1 = pd.concat(objs=[X, data2], axis=1)
# A mismatch between the two indexes would make the outer-join concat add rows.
assert len(X1) == len(X), "X and data2 are not row-aligned"
X1.head(20)

In [None]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X1, y, test_size=0.2, random_state=42
)
# Print the shape of each of the four pieces, one per line.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep='\n')

In [None]:
# Randomised hyper-parameter search over a random forest.
# Later cells plot `random_seach` and report on `y_random_pred_train` /
# `y_random_pred_test`, so the search is built, fitted and used for prediction
# here; without this those cells fail with NameError on a fresh kernel.
param_dist = {
    "max_depth": list(range(2, 5)),
    "min_samples_split": sp_randint(2, 11),
    "min_samples_leaf": sp_randint(1, 11),
    "bootstrap": [True, False],
    "n_estimators": [100, 400, 700, 1000, 1500],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2", None],
}

# NOTE(review): the base estimator mirrors the class_weight used for `rfc`, and
# n_iter / cv are scikit-learn's documented defaults written out explicitly -
# adjust if a different search budget was intended.
random_seach = RandomizedSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    n_jobs=-1,        # the candidate fits are independent, so run them in parallel
    random_state=42,  # reproducible sampling of candidates
)
random_seach.fit(X_train, y_train)

# predict() delegates to the refitted best estimator.
y_random_pred_train = random_seach.predict(X_train)
y_random_pred_test = random_seach.predict(X_test)

random_seach.best_params_




In [None]:
# Fit three classifiers on the training split.
# The decision tree and the random forest are randomised algorithms; fixing
# random_state makes their fitted models (and every metric derived from them)
# identical across kernel restarts.
logreg = LogisticRegression()
dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(
    n_estimators=400,
    max_depth=5,
    criterion='gini',
    class_weight='balanced',
    random_state=42,
)

logreg.fit(X_train, y_train)
dtc.fit(X_train, y_train)
rfc.fit(X_train, y_train)

In [None]:
# Pair every importance with its feature name and rank them; the bare array
# gives no indication of which value belongs to which column.
pd.Series(
    rfc.feature_importances_, index=X_train.columns, name='importance'
).sort_values(ascending=False)

In [None]:
# Class predictions for both splits, one (train, test) pair per fitted model.
y_log_pred_train, y_log_pred_test = (
    logreg.predict(part) for part in (X_train, X_test)
)

y_dtc_pred_train, y_dtc_pred_test = (
    dtc.predict(part) for part in (X_train, X_test)
)

y_rfc_pred_train, y_rfc_pred_test = (
    rfc.predict(part) for part in (X_train, X_test)
)

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it when it is still available; otherwise provide a drop-in replacement
# built on its successor so the plotting calls work on either version.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot a confusion matrix for a fitted estimator.

        Forwards to ``ConfusionMatrixDisplay.from_estimator``; keyword
        arguments such as ``cmap`` and ``ax`` are passed through unchanged.
        Returns the resulting ``ConfusionMatrixDisplay``.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

In [None]:
# Confusion matrix of the logistic-regression predictions on the training split.
# The original `cm.columns()` raised TypeError (an Index is not callable); the
# axes are labelled explicitly instead. scikit-learn places the true classes on
# the rows and the predicted classes on the columns.
class_labels = sorted(y_train.unique())
cm = pd.DataFrame(
    confusion_matrix(y_train, y_log_pred_train, labels=class_labels),
    index=[f'actual {label}' for label in class_labels],
    columns=[f'predicted {label}' for label in class_labels],
)
cm

In [None]:
# Confusion-matrix grid: one row per model, training split on the left and
# test split on the right. Each panel is titled so it can be identified, and
# the figure height grows with the number of rows so the panels do not overlap.
fitted_models = {
    'Logistic regression': logreg,
    'Decision tree': dtc,
    'Random forest': rfc,
    'Randomized search': random_seach,
}
data_splits = (('train', X_train, y_train), ('test', X_test, y_test))

fig, ax1 = plt.subplots(
    nrows=len(fitted_models),
    ncols=len(data_splits),
    sharex=False,
    figsize=(12, 4 * len(fitted_models)),
)

for row, (model_name, model) in enumerate(fitted_models.items()):
    for col, (split_name, features, target) in enumerate(data_splits):
        plot_confusion_matrix(
            estimator=model, X=features, y_true=target, cmap='viridis', ax=ax1[row, col]
        )
        ax1[row, col].set_title(f'{model_name} - {split_name}')

fig.tight_layout()


In [None]:
# Precision / recall / F1 of the logistic regression on each split.
train_report, test_report = (
    classification_report(truth, predicted)
    for truth, predicted in ((y_train, y_log_pred_train), (y_test, y_log_pred_test))
)
print(
    '                    Training Report          ',
    train_report,
    '                    Testing Report           ',
    test_report,
    sep='\n',
)

In [None]:
# Precision / recall / F1 of the decision tree on each split.
train_report, test_report = (
    classification_report(truth, predicted)
    for truth, predicted in ((y_train, y_dtc_pred_train), (y_test, y_dtc_pred_test))
)
print(
    '                    Training Report          ',
    train_report,
    '                    Testing Report           ',
    test_report,
    sep='\n',
)

In [None]:
# Precision / recall / F1 of the random forest on each split.
train_report, test_report = (
    classification_report(truth, predicted)
    for truth, predicted in ((y_train, y_rfc_pred_train), (y_test, y_rfc_pred_test))
)
print(
    '                    Training Report          ',
    train_report,
    '                    Testing Report           ',
    test_report,
    sep='\n',
)

In [None]:
# Precision / recall / F1 for the `y_random_pred_*` predictions on each split.
train_report, test_report = (
    classification_report(truth, predicted)
    for truth, predicted in ((y_train, y_random_pred_train), (y_test, y_random_pred_test))
)
print(
    '                    Training Report          ',
    train_report,
    '                    Testing Report           ',
    test_report,
    sep='\n',
)

In [None]:
# Preview `data` again; the next cells derive a Title column from Name.
data.head()

In [None]:
# Extract the honorific from Name: the run of letters that follows a space and
# is terminated by a period (e.g. " Mr.").
# The pattern is a raw string so that `\.` reaches the regex engine as written;
# in a normal string literal it is an invalid escape sequence, which newer
# Python versions warn about.
data['Title'] = data['Name'].str.extract(pat=r' ([A-Za-z]+)\.', expand=False)

In [None]:
# Preview `data` to check the newly added Title column.
data.head()

In [None]:
# List the distinct titles extracted from Name.
data['Title'].unique()

In [None]:
# Collapse the rare honorifics into the common ones. Titles that are not listed
# (Mr, Mrs, Miss, Master, Dr) are left unchanged by replace().
# NOTE(review): the targets are kept exactly as originally written; 'Mme' and
# 'Countess' map to 'Miss' and 'Jonkheer' to 'Master' - confirm these are intended.
title_map = {
    'Don': 'Mr',
    'Rev': 'Mr',
    'Mme': 'Miss',
    'Ms': 'Miss',
    'Major': 'Mr',
    'Lady': 'Mrs',
    'Sir': 'Mr',
    'Mlle': 'Miss',
    'Col': 'Mr',
    'Capt': 'Mr',
    'Countess': 'Miss',
    'Jonkheer': 'Master',
}
# Assign the result back: calling replace(..., inplace=True) on a column
# selection is chained in-place mutation, which pandas warns about and which
# does not update `data` once Copy-on-Write is active.
data['Title'] = data['Title'].replace(title_map)

In [None]:
# List the distinct titles again, after the replacement above.
data['Title'].unique()

In [None]:
# Show name, sex and age for the passengers whose title is 'Dr'.
data.loc[data['Title'].eq('Dr'), ['Name', 'Sex', 'Age']]

In [None]:
# Extract the title from Name once more into a separate Title1 column, using
# the same pattern as for Title.
# Raw string: `\.` is an invalid escape sequence in a normal string literal.
data['Title1'] = data['Name'].str.extract(pat=r' ([A-Za-z]+)\.', expand=False)
data.head()

In [None]:
# Remove the Title1 column again.
# Rebinding with errors='ignore' (instead of an in-place drop) lets the cell be
# re-run without raising KeyError once the column is already gone.
data = data.drop(columns='Title1', errors='ignore')
data.head()