# Importing Libraries

In [4]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

# Importing Datasets

In [5]:
train=pd.read_csv("train_dataset.csv")

In [6]:
test=pd.read_csv("test_dataset.csv")

# Extracting values

In [7]:
train.shape

(534, 11)

In [8]:
train.columns

Index(['Unnamed: 0', 'Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilyCount',
       'Title', 'Has Cabin', 'IsAlone', 'Survived'],
      dtype='object')

In [9]:
train=train.set_index('Unnamed: 0')
train.shape

(534, 10)

In [10]:
X_train=train.iloc[:,0:9]
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilyCount,Title,Has Cabin,IsAlone
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
711,1,0,29.699118,26.55,0,0,1,1,0
466,2,0,29.699118,0.0,0,0,1,0,0
213,2,0,30.0,13.0,0,0,1,0,0
496,1,1,54.0,78.2667,2,1,2,1,1
583,1,0,36.0,40.125,2,0,1,1,0


In [11]:
y_train=train.Survived

In [12]:
test=test.set_index('Unnamed: 0')

In [13]:
X_test=test.iloc[:,0:9]
y_test=test.Survived

# LogisticRegression Model

In [14]:
lr=LogisticRegression(max_iter=200)
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
acc_log =accuracy_score(y_test,y_pred)
acc_log


0.7932960893854749

# Hyperparameter Tuning to improve Accuracy

In [20]:
import warnings
warnings.filterwarnings('ignore')

In [21]:
from sklearn.model_selection import GridSearchCV
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [22]:
LRModel_grid = GridSearchCV(LogisticRegression(), param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)
g_res = LRModel_grid.fit(X_train, y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


In [23]:
g_res.best_params_

{'C': 0.08858667904100823,
 'max_iter': 100,
 'penalty': 'l2',
 'solver': 'liblinear'}

In [24]:
g_res.best_score_

0.8426966292134832

In [30]:
lr = g_res.best_estimator_

# Using Cross validation to check performance of the model

In [27]:
#define cross-validation method to use
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [32]:
scores = cross_val_score(lr, X_train, y_train, scoring='neg_mean_absolute_error',cv=cv, n_jobs=-1)

#view mean absolute error
mean(absolute(scores))

0.15698812019566735

In [33]:
#define cross-validation method to use
cv = KFold(n_splits=5, random_state=1, shuffle=True) 

#use LOOCV to evaluate model
scores = cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error',cv=cv, n_jobs=-1)

#view RMSE
sqrt(mean(absolute(scores)))

0.3941916500338954

Both MAE and RMSE are quite low meaning our model's performane is quite good

# Pickle the model

In [35]:
# save the model to disc
import pickle

In [36]:
filename = "LogisticRegression model.pkl"
pickle.dump(lr,open(filename,'wb'))