In [1]:
import pandas as pd

### Model Training

In [2]:
df = pd.read_csv('data/gemstone.csv')

In [3]:
df = df.drop('id', axis=1)

In [5]:
## Independent and dependent features
X = df.drop('price', axis=1)
y = df[['price']]

In [8]:
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(include='object').columns

In [25]:
cat_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [44]:
cut_categories = ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [13]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [14]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]
)

In [45]:
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder', OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories])),
        ('scaler', StandardScaler())
    ]
)

In [46]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ]
)

In [47]:
preprocessor

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [28]:
X_train.shape, y_train.shape

((135501, 9), (135501, 1))

In [29]:
X_test.shape, y_test.shape

((58072, 9), (58072, 1))

In [49]:
preprocessor.fit_transform(X_train)

array([[-0.82314374, -1.12998781, -0.64189666, ...,  0.87410007,
        -0.93674681,  1.35074594],
       [ 0.94502267, -1.77782269,  0.92190185, ..., -1.13764403,
         0.91085333,  0.68445511],
       [ 1.9584839 ,  0.16568195,  0.40063568, ..., -0.13177198,
         0.91085333,  0.01816428],
       ...,
       [ 0.92345966,  0.90606467,  0.40063568, ..., -0.13177198,
         0.29498662,  0.01816428],
       [-1.03877378, -0.66724861, -0.64189666, ..., -1.13764403,
         0.29498662,  2.01703677],
       [-1.03877378, -0.01941373,  0.92190185, ..., -1.13764403,
         0.29498662, -1.31441737]])

In [51]:
X_train_pre = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test_pre = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [52]:
X_train_pre

Unnamed: 0,num_pipeline__carat,num_pipeline__depth,num_pipeline__table,num_pipeline__x,num_pipeline__y,num_pipeline__z,cat_pipeline__cut,cat_pipeline__color,cat_pipeline__clarity
0,-0.823144,-1.129988,-0.641897,-0.780451,-0.835103,-0.876024,0.874100,-0.936747,1.350746
1,0.945023,-1.777823,0.921902,1.073226,1.166389,0.946633,-1.137644,0.910853,0.684455
2,1.958484,0.165682,0.400636,1.703116,1.755063,1.742237,-0.131772,0.910853,0.018164
3,-0.995648,-0.574701,-0.641897,-1.122391,-1.161138,-1.165334,0.874100,-0.320880,2.017037
4,-0.995648,0.258230,0.400636,-1.176382,-1.152082,-1.136403,-1.137644,1.526720,-0.648127
...,...,...,...,...,...,...,...,...,...
135496,-0.629077,-1.500179,1.964434,-0.546492,-0.518125,-0.644575,-1.137644,-0.936747,-0.648127
135497,2.411307,0.443325,2.485700,1.919078,1.872797,1.930288,-1.137644,-0.320880,-0.648127
135498,0.923460,0.906065,0.400636,0.992240,0.921862,1.047891,-0.131772,0.294987,0.018164
135499,-1.038774,-0.667249,-0.641897,-1.212375,-1.197364,-1.252127,-1.137644,0.294987,2.017037


In [53]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [56]:
regression = LinearRegression() 
regression.fit(X_train_pre, y_train)

In [57]:
regression.coef_

array([[ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
         -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
          652.10059539]])

In [58]:
regression.intercept_

array([3976.8787389])

In [60]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = mean_squared_error(true, predicted, squared=False)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

### Train multiple models

In [80]:
models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'ElasticNet': ElasticNet(),
}

model_list = []
trained_model_list = []
r2_list = []


for model in models.keys():
    models[model].fit(X_train_pre, y_train)

    y_pred = models[model].predict(X_train_pre)

    model_list.append(model)
    trained_model_list.append(models[model])

    mae, mse, rmse, r2_square = evaluate_model(y_train, y_pred)

    print(model)
    print('Model Training performance')
    print('MAE : ', mae)
    print('MSE : ', mse)
    print('RMSE : ', rmse)
    print('R2_square : ', r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training performance
MAE :  676.2365633491881
MSE :  1030237.2102839048
RMSE :  1015.0060149003575
R2_square :  93.68963478144956


Lasso
Model Training performance
MAE :  677.3140714902532
MSE :  1030498.6090540952
RMSE :  1015.1347738374916
R2_square :  93.68803367280087


Ridge
Model Training performance
MAE :  676.2681278690862
MSE :  1030237.3485450144
RMSE :  1015.0060830088726
R2_square :  93.68963393457848


ElasticNet
Model Training performance
MAE :  1066.523086670448
MSE :  2369280.2040170627
RMSE :  1539.2466352138188
R2_square :  85.48778548941242




In [81]:
model_list

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']

In [82]:
trained_model_list

[LinearRegression(), Lasso(), Ridge(), ElasticNet()]

In [3]:
import os
from datetime import datetime

In [2]:
os.getcwd()

'c:\\Users\\hp\\Desktop\\ml_project\\notebooks'

In [6]:
os.path.join(os.getcwd(), "logs")

'c:\\Users\\hp\\Desktop\\ml_project\\notebooks\\logs'

In [17]:
LOG_FILE = f"{datetime.now().strftime('%m-%d-%Y %H:%M:%S')}.log"

In [19]:
logs_path = os.path.join(os.getcwd(),"logs",LOG_FILE)

In [20]:
os.path.join(logs_path, LOG_FILE)

'c:\\Users\\hp\\Desktop\\ml_project\\notebooks\\logs\\10-31-2023 19:32:05.log\\10-31-2023 19:32:05.log'