# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from numpy import mean
from numpy import absolute
from numpy import sqrt

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

# Importing Datasets

In [3]:
train=pd.read_csv("train_dataset.csv")

In [4]:
test=pd.read_csv("test_dataset.csv")

# Extracting Data

In [5]:
train=train.set_index('Unnamed: 0')
X_train=train.iloc[:,0:9]
y_train=train.Survived

In [6]:
test=test.set_index('Unnamed: 0')
X_test=test.iloc[:,0:9]
y_test=test.Survived

# Taking a look at the data

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(534, 9)
(534,)
(179, 9)
(179,)


In [8]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilyCount,Title,Has Cabin,IsAlone
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
711,1,0,29.699118,26.55,0,0,1,1,0
466,2,0,29.699118,0.0,0,0,1,0,0
213,2,0,30.0,13.0,0,0,1,0,0
496,1,1,54.0,78.2667,2,1,2,1,1
583,1,0,36.0,40.125,2,0,1,1,0


In [9]:
y_train.head()

Unnamed: 0
711    0
466    0
213    0
496    1
583    0
Name: Survived, dtype: int64

# Stochastic Gradient Descent

In [10]:
sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)
acc_sgd = accuracy_score(y_test,y_pred)
acc_sgd

0.7039106145251397

# Hyperparameter Tuning to improve Accuracy

In [13]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], # learning rate
    'max_iter': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], # number of epochs
    'loss': ['hinge','log','modified_huber','squared_hinge','perceptron'],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
}
# Create a based model
sgd = SGDClassifier()

In [21]:
grid_search = GridSearchCV(estimator = sgd, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


GridSearchCV(cv=3, estimator=SGDClassifier(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0,
                                   1000.0],
                         'loss': ['hinge', 'log', 'modified_huber',
                                  'squared_hinge', 'perceptron'],
                         'max_iter': [100, 200, 300, 400, 500, 600, 700, 800,
                                      900, 1000],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none']},
             verbose=2)

In [22]:
grid_search.best_params_

{'alpha': 0.01, 'loss': 'log', 'max_iter': 300, 'penalty': 'l1'}

In [23]:
grid_search.best_score_

0.8314606741573035

In [26]:
sgd = grid_search.best_estimator_

# Cross validating the model

In [27]:
#define cross-validation method to use
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [28]:
scores = cross_val_score(sgd, X_train, y_train, scoring='neg_mean_absolute_error',cv=cv, n_jobs=-1)

#view mean absolute error
mean(absolute(scores))

0.1926275331935709

In [29]:
#define cross-validation method to use
cv = KFold(n_splits=5, random_state=1, shuffle=True) 

#use LOOCV to evaluate model
scores = cross_val_score(sgd, X_train, y_train, scoring='neg_mean_squared_error',cv=cv, n_jobs=-1)

#view RMSE
sqrt(mean(absolute(scores)))

0.4475289232990514

Both MAE and RMSE are quite high meaning our model's performane is not as good

# Pickle the model

In [30]:
# save the model to disc
import pickle
filename = " SGD model.pkl"
pickle.dump(sgd,open(filename,'wb'))