## Model Training


In [1]:
import pandas as pd
# Read the raw gemstone records from the project-relative data folder,
# then preview the first rows of the loaded frame.
gemstone_csv = './data/gemstone.csv'
ds = pd.read_csv(gemstone_csv)
ds.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [2]:
# Split the frame into model inputs and the regression target:
# 'id' and 'price' are removed from the inputs, and 'price' alone becomes y.
y = ds['price']
x = ds.drop(columns=['id', 'price'])

# Partition the feature columns by dtype (numeric vs. everything else)
cat_col = x.select_dtypes(exclude='number').columns
num_col = x.select_dtypes(include='number').columns
print(f'Numerical columns are {num_col}\nCategorical columns are {cat_col}')

# Ordered category levels, one list per categorical column. The position of a
# label inside its list is the rank it will be given when the list is used as
# an explicit category ordering.
# NOTE(review): cut and clarity look like they are listed from lowest to
# highest grade, while color is listed D..J - confirm that color is meant to
# run in the same direction as the other two.
Rcolor = ["D", "E", "F", "G", "H", "I", "J"]
Rclarity = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]
Rcut = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
x.head()

Numerical columns are Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')
Categorical columns are Index(['cut', 'color', 'clarity'], dtype='object')


Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77


#### Pipeline

In [3]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

# Numeric branch: fill missing values with the column median, then standardise.
Pnum = Pipeline(steps=[
    ('Imputer', SimpleImputer(strategy='median')),
    ('Scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent label, map
# labels to their rank, then standardise the resulting codes.
# The category lists are matched to columns by position, so their order here
# (cut, color, clarity) has to stay in step with the order of cat_col.
Pcat = Pipeline(steps=[
    ('Imputer', SimpleImputer(strategy='most_frequent')),
    ('Ordinal Encoder', OrdinalEncoder(categories=[Rcut, Rcolor, Rclarity])),
    ('Scaler', StandardScaler()),
])

# Send each column group through its own branch.
com = ColumnTransformer(transformers=[
    ('Num Pipeline', Pnum, num_col),
    ('Cat Pipeline', Pcat, cat_col),
])

# Hold out 35% of the rows for evaluation; the split is seeded so it is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=72)
print(x_train.dtypes)

# Learn imputation/encoding/scaling statistics on the training rows only and
# reuse the fitted transformer for the test rows, so no test information leaks
# into preprocessing.
# The DataFrames rebuilt below get a fresh 0..n-1 index (assuming the
# transformer returns a plain array), while y_train / y_test keep their
# original labels: row order still matches, index labels do not.
train_matrix = com.fit_transform(x_train)
x_train = pd.DataFrame(train_matrix, columns=com.get_feature_names_out())
test_matrix = com.transform(x_test)
x_test = pd.DataFrame(test_matrix, columns=com.get_feature_names_out())
x_train.head()

carat      float64
cut         object
color       object
clarity     object
depth      float64
table      float64
x          float64
y          float64
z          float64
dtype: object


Unnamed: 0,Num Pipeline__carat,Num Pipeline__depth,Num Pipeline__table,Num Pipeline__x,Num Pipeline__y,Num Pipeline__z,Cat Pipeline__cut,Cat Pipeline__color,Cat Pipeline__clarity
0,0.450875,-1.779633,0.92727,0.70568,0.660597,0.502496,-0.133678,0.301044,-0.648087
1,1.704848,0.349266,-1.163906,1.571437,1.549985,1.596352,0.872705,0.301044,0.685142
2,-0.565276,0.441827,-0.641112,-0.45768,-0.4466,-0.416343,0.872705,-0.314768,0.018527
3,0.775178,-0.020977,0.404476,0.904083,0.887481,0.896284,-0.133678,-1.546391,0.018527
4,-1.062541,0.256705,-0.641112,-1.242272,-1.281536,-1.247673,0.872705,-0.93058,1.351756


#### Model Training

In [4]:
# For Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def evaluate(true, predicted):
    """Score predictions against ground truth with three regression metrics.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, r2)``: mean absolute error, mean squared error and the
        R2 score, in that order.
    """
    scorers = (mean_absolute_error, mean_squared_error, r2_score)
    return tuple(scorer(true, predicted) for scorer in scorers)

In [5]:
# Training with Different Techniques
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet

# Fixed seed for every estimator that has a stochastic component, so that a
# fresh Restart & Run All reproduces the same scores. 72 is the value already
# used for the train/test split.
MODEL_SEED = 72

# Candidate regressors, keyed by the label shown in the printed report.
# Apart from the seed (and the explicit k for KNN) every estimator keeps its
# library defaults.
models = {'LR':LinearRegression(),
          'LR with lasso':Lasso(),
          'LR with ridge':Ridge(),
          'LR with EN':ElasticNet(),
          'Random Forest':RandomForestRegressor(random_state=MODEL_SEED),
          'SVM':SVR(),
          'Gradient Boosting':GradientBoostingRegressor(random_state=MODEL_SEED),
          'xgb':XGBRegressor(random_state=MODEL_SEED),
          'lgb':LGBMRegressor(random_state=MODEL_SEED),
          'dt':DecisionTreeRegressor(random_state=MODEL_SEED),
          'knn':KNeighborsRegressor(n_neighbors=5)}

# Parallel accumulators: one entry per model, appended in training order so
# that position i refers to the same model in all three lists.
trained_models = []  # fitted estimator objects
model_list = []      # report labels (keys of `models`)
r2_list = []         # R2 on the held-out split, as a raw score (not a percentage)

for model_name, model in models.items():
    # Fit on the preprocessed training split and keep the fitted estimator,
    # so it can be reused later without retraining.
    model.fit(x_train, y_train)
    trained_models.append(model)

    # Score on the held-out split
    y_pred = model.predict(x_test)
    mae, mse, r2 = evaluate(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # print() joins its arguments with a space, which is why each metric line
    # in the report starts with a leading blank.
    print('Model Performance\n', f'MSE {mse}\n', f'MAE {mae}\n', f'R2 Score {r2*100}')
    r2_list.append(r2)
    print('='*50, '\n')

LR
Model Performance
 MSE 1039943.0547572924
 MAE 675.7579660838348
 R2 Score 93.61560123663088

LR with lasso
Model Performance
 MSE 1035606.4699096036
 MAE 676.36431205316
 R2 Score 93.64222431643528

LR with ridge
Model Performance
 MSE 1039933.8959274681
 MAE 675.7809183892473
 R2 Score 93.61565746434599

LR with EN
Model Performance
 MSE 2367267.9724820615
 MAE 1064.1135042096678
 R2 Score 85.46691316708193

Random Forest
Model Performance
 MSE 371342.41821568384
 MAE 311.8963779206211
 R2 Score 97.72026163856057

SVM
Model Performance
 MSE 3912878.2657386116
 MAE 877.0793463545724
 R2 Score 75.97813164219285

Gradient Boosting
Model Performance
 MSE 375104.0822075823
 MAE 331.79539615701293
 R2 Score 97.69716810201717



  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


xgb
Model Performance
 MSE 340929.9415154402
 MAE 298.8505614161433
 R2 Score 97.9069693412062

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003736 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1223
[LightGBM] [Info] Number of data points in the train set: 125822, number of used features: 9
[LightGBM] [Info] Start training from score 3974.428089
lgb
Model Performance
 MSE 328711.1117911174
 MAE 298.0531426449982
 R2 Score 97.98198294990807

dt
Model Performance
 MSE 714399.8384792189
 MAE 425.4401583248464
 R2 Score 95.614170002412

knn
Model Performance
 MSE 453303.53003143874
 MAE 351.84991808238993
 R2 Score 97.21708752866375

