In [2]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [5]:
data = pd.read_csv('datasets/diamonds_processed.csv', index_col=0)
data.head(4)

Unnamed: 0_level_0,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3,1668,0,0,0,1,0,0,0,0,1,0,0,0,-0.517691,0.380171,-0.650849,-0.389734,-0.438761,-0.369501
3,825,0,0,1,0,0,0,0,0,1,0,0,0,-0.832918,-0.615218,0.247402,-0.863097,-0.843331,-0.902957
5,2271,0,0,1,0,0,0,1,0,0,0,0,0,-0.559721,0.806767,-1.549101,-0.479048,-0.519675,-0.412755
4,6499,0,0,0,1,0,0,0,0,1,0,0,0,0.449007,0.593469,0.696528,0.583787,0.658074,0.69741


In [6]:
from sklearn.linear_model import LinearRegression

def linear_model(x_train, y_train):
    
    print("Linear Regression ")
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    
    return linear_regression

In [8]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    print("Lasso Regression")
    lasso_regression = Lasso(alpha=0.8, max_iter=10000)
    lasso_regression.fit(x_train, y_train)
    
    return lasso_regression

In [9]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    print("Ridge Regression")
    ridge_regression = Ridge(alpha=0.9)
    ridge_regression.fit(x_train,y_train)
    
    return ridge_regression

In [12]:
def build_and_train_model(data, target_name, reg_fnc):
    X = data.drop(target_name, axis=1)
    Y = data[target_name]
    
    x_train, x_test, y_train, y_test = \
        train_test_split(X, Y, test_size=0.2, random_state=0)
    
    model = reg_fnc(x_train, y_train)
    
    score = model.score(x_train, y_train)
    print("Training Score is : ", score)
    
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    print("Testing Score is : ", r_score)
    
    return {'model' : model,
           'x_train' : x_train, 'x_test' : x_test,
           'y_train' : y_train, 'y_test' : y_test,
           'y_pred' : y_pred
           }


In [13]:
linear_reg = build_and_train_model(data, "price", linear_model)

Linear Regression 
Training Score is :  0.8773658172060331
Testing Score is :  0.8922238968248872


In [16]:
lasso_reg = build_and_train_model(data, "price", lasso_model)

Lasso Regression
Training Score is :  0.8773016091090944
Testing Score is :  0.8920602307581188


In [17]:
ridge_reg = build_and_train_model(data, "price", ridge_model)

Ridge Regression
Training Score is :  0.8773629334622932
Testing Score is :  0.8921929040094341


In [18]:
linear_reg['model']

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [20]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    
    print("Stochastic Gradient Descent Regression")
    sgd_regression = SGDRegressor(max_iter=2000)
    sgd_regression.fit(x_train, y_train)
    
    return sgd_regression

In [21]:
sgd_reg = build_and_train_model(data, "price", sgd_model)

Stochastic Gradient Descent Regression
Training Score is :  0.8763871983682885
Testing Score is :  0.8906955105884247
