## Implementation of Gradient Descent algorithm with Ridge Regression for shrinkage the coefficient

In [6]:
import os
import math
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from statsmodels.regression.linear_model import OLS
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split

In [12]:
print(os.listdir('datasets'))

['glass.csv', 'dataset_4.txt', 'dataset_3.txt', 'dataset_2.txt', 'dataset_1.txt', 'ign.csv', 'spam.csv']


In [43]:
data = pd.read_csv('datasets/dataset_4.txt')
df_x = data.iloc[:, 50:65]
df_y = data.iloc[:,-1]

print ('Shape before expansion: ', df_x.shape)
num_predictors = np.shape(df_x)[1]
data_expanded = pd.DataFrame({})

# Loop
for column in df_x.columns:
    # For categorical variables, is the number of unique items < 20 or data type is an object
    if (len(df_x[column].unique()) < 20 or (df_x[column].dtype == np.dtype('object'))):
        encoded_col = pd.get_dummies(df_x[column])
        data_expanded = pd.concat([data_expanded, encoded_col], axis=1) 
    else:
        data_expanded = pd.concat([data_expanded, df_x[column]], axis=1) 
        
data_expanded.head()

Shape before expansion:  (2517, 15)


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,...,0.0,1.0,2.0,3.0,4.0,5.0,9.0,10.0,11.0,ETH7
0,2707.0,672.0,929.0,99.0,0.0,0.0,45.0,55.0,97.0,1.0,...,1,0,0,0,0,0,0,0,0,0.0
1,2147.0,591.0,640.0,99.0,0.0,0.0,49.0,51.0,94.0,2.0,...,0,1,0,0,0,0,0,0,0,0.0
2,2160.0,683.0,900.0,89.0,0.0,11.0,48.0,52.0,99.0,0.0,...,1,0,0,0,0,0,0,0,0,0.0
3,13801.0,3736.0,6388.0,99.0,0.0,0.0,48.0,52.0,97.0,0.0,...,1,0,0,0,0,0,0,0,0,1.0
4,1673.0,418.0,462.0,99.0,0.0,0.0,49.0,51.0,7.0,93.0,...,1,0,0,0,0,0,0,0,0,0.0


In [44]:
df_y.head()

0     4.0
1     7.0
2     5.0
3    13.0
4    10.0
Name: TARGET_D, dtype: float64

In [45]:
x = data_expanded.iloc[:, 0:-1].values
y = df_y.values.reshape(len(df_y.values), 1)

# Split into train and test set
n = x.shape[0]
n_train = int(np.round(n*0.75))
indices = np.random.permutation(range(n))
train_indices = indices[0:n_train]
test_indices = indices[n_train:]

# The first 25% are in the training set, the rest is in the test set (assuming the data has been ran. shuffeled) 
X_train = x[train_indices,:]
y_train = y[train_indices]
X_test = x[test_indices,:]
y_test = y[test_indices]

In [46]:
class GradientDescent:
    def __init__(self):
        pass
    
    def fit(self, X, y, learning_rate=0.0001, iters = 1000):
        self.X = X
        self.y = y
        self.learning_rate = learning_rate
        self.iters = iters
        self.m, self.b, self.error = linear_regression()
        
    def linear_regression(self):
        N = float(len(self.y))
        m, b = 0, 0
        for i in range(self.iters):
            y_temp = (m * self.X) + b
            y_diff = self.y - y_temp
            MSE = sum([error**2 for error in y_diff]) / N
            m_gradient = -(2/N) * sum(self.X * y_diff)
            b_gradient = -(2/N) * sum(y_diff)
            m -= (self.learning_rate * m_gradient)
            b -= (self.learning_rate * b_gradient)
        return m, b, MSE

    def score(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test
        y = self.m*self.X_test + self.b
        return sum((error**2 for error in (self.y - y))) / float(len(self.y))

In [47]:
class RidgeRegression:
    def __init__(self):
        pass
    
    def fit(self, X, y, alpha=1.0):
        self.X = X
        self.y = y
        self.alpha = alpha
        self.model = self.ridge()
        
    def ridge(self):
        n = np.shape(self.X)
        self.X = np.concatenate((self.X, self.alpha * np.identity(n)), axis=1)
#         n, p = np.shape(self.X)
#         self.X = np.concatenate((self.X, np.sqrt(10.0**self.alpha) * np.identity(p)), axis=0)
        print('self.X.new', self.X)
        y_temp = np.zeros((n + np.shape(self.X)[1], 1))
#         self.y = np.concatenate((self.y, np.zeros(p)), axis=0)
        print('y.temp', y_temp)
        for c in range(n):
            y_temp[c] = self.y[c]
        self.model = LinearRegression() # GradientDescent()
        y_reshape = self.y.reshape(-1, 1)
        print('y.reshape', y.reshape)
        self.model.fit(self.X, self.y.reshape(-1, 1))
#         self.model.fit(self.X, self.y)
        return self.model
        
    def score(self, X_test, y_test):
        self.X_test = X_test
        self.y_test = y_test
        n = np.shape(self.X_test)[0]
        self.X_test = np.concatenate((self.X_test, self.alpha * np.identity(n)), axis=1)
#         n, p = np.shape(self.X_test)
#         self.X_test = np.concatenate((self.X_test, np.sqrt(10.0**self.alpha) * np.identity(p)), axis=0)
        y_temp = np.zeros((n + np.shape(self.X_test)[1], 1))
#         self.y_test = np.concatenate((y_test, np.zeros(p)), axis=0)
        for c in range(n):
            y_temp[c] = self.y_test[c]
        return self.model.score(self.X_test, self.y_test.reshape(-1, 1))
#         return self.model.score(self.X_test, self.y_test)

In [49]:
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_train, y_train)
train_rsquared = ridge_reg.score(X_train, y_train)
test_rsquared = ridge_reg.score(X_test, Y_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').