In [4]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [5]:
data = pd.read_csv('datasets/diamonds_processed.csv', index_col = 0)

data.head()

Unnamed: 0_level_0,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
7,13060,0,0,1,0,0,0,0,1,0,0,0,0,0.543531,-0.010463,-0.649443,0.672804,0.748388,0.703836
2,9515,0,0,0,0,1,0,0,0,1,0,0,0,1.504355,0.336188,0.235172,1.326341,1.397924,1.413741
6,945,0,0,1,0,0,0,0,0,1,0,0,0,-0.929733,0.266858,-0.649443,-1.010276,-1.037833,-0.991245
7,918,0,0,1,0,0,0,0,0,1,0,0,0,-1.01514,-0.218453,-1.091751,-1.171422,-1.164132,-1.179588
3,486,0,0,1,0,0,0,0,0,0,1,0,0,-1.057843,0.544179,-1.887904,-1.287805,-1.254345,-1.223051


In [6]:
from sklearn.linear_model import LinearRegression

def linear_model(x_train, y_train):
    
    print('Linear Regression ')
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    
    return linear_regression  

In [7]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    
    print('Lasso Regression ')
    lasso_regression = Lasso(alpha = 0.8, max_iter=10000)
    lasso_regression.fit(x_train, y_train)
    
    return lasso_regression  

In [8]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    
    print('Ridge Regression ')
    ridge_regression = Ridge(alpha = 0.9)
    ridge_regression.fit(x_train, y_train)
    
    return ridge_regression  

In [9]:
def build_and_train_model(data, target_name, reg_fn):
    
    x = data.drop(target_name, axis=1)
    y = data[target_name]
    
    x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)
    
    model = reg_fn(x_train, y_train)
    
    score = model.score(x_train, y_train)
    print('Training score: ',score)
    
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    print("Testing score: ", r_score)
    
    return {
        'model': model,
        'x_train': x_train, 'x_test': x_test,
        'y_train': y_train, 'y_test': y_test,
        'y_pred': y_pred
    }
    

In [10]:
linear_reg = build_and_train_model(data, "price", linear_model)

Linear Regression 
Training score:  0.8851728566150202
Testing score:  0.8912331731687105


In [11]:
lasso_reg = build_and_train_model(data, "price", lasso_model)

Lasso Regression 
Training score:  0.88512163460472
Testing score:  0.8909577607451442


In [12]:
ridge_reg = build_and_train_model(data, "price", ridge_model)

Ridge Regression 
Training score:  0.885186798316436
Testing score:  0.8911688255480525


In [13]:
linear_reg['model']

LinearRegression()

In [14]:
# regression using sgd

from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    print('SGD regression')
    sgd_regression = SGDRegressor(max_iter = 2000)
    
    sgd_regression.fit(x_train, y_train)
    
    return sgd_regression

In [15]:
sgd_reg = build_and_train_model(data, "price", sgd_model)

SGD regression
Training score:  0.8843091342520767
Testing score:  0.8898640071396412
