# Model Training

In [1]:
import pandas as pd

In [2]:
# Load the gemstone dataset from the notebook-relative data directory.
df = pd.read_csv(filepath_or_buffer="./data/gemstone.csv")

In [3]:
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
# Remove the "id" column from the frame.
# NOTE: this rebinds df, so re-running the cell without reloading the CSV
# raises KeyError because "id" has already been dropped.
df = df.drop(columns=["id"])

In [6]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [7]:
# Independent and dependent features
# X holds every column except the target; y keeps the target as a
# single-column DataFrame (2-D), not a Series.
X = df.drop(columns=["price"])
y = df.loc[:, ["price"]]

In [18]:
# Categorical and numerical features
# Columns with object dtype are treated as categorical; all remaining
# columns are treated as numerical.

categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

In [19]:
# Define the custom ranking for each ordinal variable.
# The position of a label in its list is the integer code that
# OrdinalEncoder assigns to it (first entry -> 0, last entry -> len - 1).
# NOTE(review): lists presumably run from lowest to highest grade — confirm.
cut_categories = [
    "Fair",
    "Good",
    "Very Good",
    "Premium",
    "Ideal",
]
color_categories = [
    "J",
    "I",
    "H",
    "G",
    "F",
    "E",
    "D",
]
clarity_categories = [
    "I1",
    "SI2",
    "SI1",
    "VS2",
    "VS1",
    "VVS2",
    "VVS1",
    "IF",
]

In [20]:
# Automate Feature Engineering 

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

# Pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [21]:
# Build Pipelines

# Numerical columns: fill missing values with the column median, then
# standardize with StandardScaler.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent label,
# map labels to integer ranks, then standardize the ranks.
# `steps` is a list (not a tuple) to match Pipeline's documented signature
# and the num_pipeline definition.
# The `categories` lists are matched to input columns by position, so their
# order must follow the order of the categorical columns passed in
# (cut, color, clarity).
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(
                categories=[cut_categories, color_categories, clarity_categories]
            ),
        ),
        ("scaler", StandardScaler()),
    ]
)


# Combine both pipelines: each one is applied only to its own column subset,
# and the outputs are concatenated (numerical columns first, then categorical).

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_cols),
        ("cat_pipeline", cat_pipeline, categorical_cols),
    ]
)

In [22]:
preprocessor

In [23]:
# train test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [26]:
preprocessor.fit_transform(X_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.87410007,
         0.93674681,  1.35074594],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -1.13764403,
        -0.91085333,  0.68445511],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -0.13177198,
        -0.91085333,  0.01816428],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -0.13177198,
        -0.29498662,  0.01816428],
       [-1.03877378, -0.66724861, -0.64189666, ..., -1.13764403,
        -0.29498662,  2.01703677],
       [-1.03877378, -0.01941373,  0.92190185, ..., -1.13764403,
        -0.29498662, -1.31441737]])

In [28]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split.
# NOTE: X_train / X_test are overwritten with their transformed versions, so
# this cell is not idempotent — re-run the train/test split before re-running it.
# NOTE(review): the rebuilt frames get a fresh 0..n-1 index, while y_train /
# y_test presumably keep their original row labels — confirm nothing downstream
# aligns X and y by index.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
test_matrix = preprocessor.transform(X_test)

X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(test_matrix, columns=feature_names)

In [29]:
X_train.head(2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.8741,0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,-0.910853,0.684455


In [30]:
X_test.head(2)

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.629077,0.25823,-0.12063,-0.600482,-0.581521,-0.572248,0.8741,1.552614,-0.648127
1,2.605374,-2.148014,-0.12063,2.126042,2.198832,1.959219,-1.137644,-0.294987,-1.314417


In [31]:
# Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [32]:
# Fit a LinearRegression on the training split; its coef_ and intercept_
# are inspected in the cells that follow.
regression = LinearRegression()
regression.fit(X_train, y_train)

In [33]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,   464.67990411,
          652.10059539]])

In [34]:
regression.intercept_

array([3976.8787389])

In [35]:
# Model evaluation

import numpy as np
def evaluate_model(true, predicted):
    """Compute regression error metrics for a set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions to compare against ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error (square root of the MSE) and the R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [37]:
# Candidate linear regressors, all with default hyperparameters.
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "ElasticNet" : ElasticNet()
}

# Parallel lists: r2_list[k] is the R2 score of the model named model_list[k].
model_list = []
r2_list = []


# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every iteration.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make prediction
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # NOTE: the header says "Training", but the metrics below are computed
    # from predictions on X_test / y_test, i.e. the held-out split.
    print('Model Training Performance')
    print('RMSE : ',rmse)
    print('MAE : ',mae)
    print('R2 SCORE : ', r2_square*100)

    r2_list.append(r2_square)

    print("="*35)
    print("\n")

LinearRegression
Model Training Performance
RMSE :  1014.6296630375463
MAE :  675.0758270067483
R2 SCORE :  93.62906819996049


Lasso
Model Training Performance
RMSE :  1014.659130275064
MAE :  676.2421173665508
R2 SCORE :  93.62869814082755


Ridge
Model Training Performance
RMSE :  1014.6343233534411
MAE :  675.1077629781329
R2 SCORE :  93.62900967491632


ElasticNet
Model Training Performance
RMSE :  1533.3541245902313
MAE :  1060.9432977143008
R2 SCORE :  85.44967219374031




In [38]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [39]:
r2_list

[0.9362906819996049,
 0.9362869814082755,
 0.9362900967491632,
 0.8544967219374031]

In [40]:
# Sort the scores in ascending order while keeping model_list aligned.
# A bare r2_list.sort() would reorder only the scores, so model_list[k] would
# no longer name the model that produced r2_list[k].
# Slice assignment keeps both lists updated in place, as the original sort did.
ranked_scores = sorted(zip(r2_list, model_list))
r2_list[:] = [score for score, _ in ranked_scores]
model_list[:] = [name for _, name in ranked_scores]

In [41]:
r2_list

[0.8544967219374031,
 0.9362869814082755,
 0.9362900967491632,
 0.9362906819996049]