### Model training

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("cubic_zirconia.csv")
df.head(3)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,2,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984
2,3,0.9,Very Good,E,VVS2,62.2,60.0,6.04,6.12,3.78,6289


In [3]:
df = df.drop("Unnamed: 0", axis=1)

In [4]:
df.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,0.3,Ideal,E,SI1,62.1,58.0,4.27,4.29,2.66,499
1,0.33,Premium,G,IF,60.8,58.0,4.42,4.46,2.7,984


In [5]:
## Independent and dependent features

X = df.drop("price", axis=1)
y = df["price"]

In [6]:
# Define which column should be ordinal-encoded and which should be scaled

categorical_feature = X.select_dtypes(include="object").columns
numerical_feature = X.select_dtypes(exclude="object").columns

In [7]:
categorical_feature

Index(['cut', 'color', 'clarity'], dtype='object')

In [8]:
cut_categories = ["Fair", "Good", "Very Good", "Premium", "Ideal"]
color_categories = ["D", "E", "F", "G", "H", "I", "J"]
clarity_categories = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS1", "VVS2", "IF"]

### Pipeline creation

In [9]:
from sklearn.impute import SimpleImputer ## For handling missing values
from sklearn.preprocessing import StandardScaler ## For handling feature scaling
from sklearn.preprocessing import OrdinalEncoder ## For Categorical Ordinal Encoding

# Pipelines

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [10]:
## Numerical Pipeline

numerical_pipeline = Pipeline(
    steps = [("imputer",SimpleImputer(strategy="median")),
             ("scaler", StandardScaler())]
)


## Categorical Pipeline

categorical_pipeline = Pipeline(
    steps= [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal_encoder", (OrdinalEncoder(categories=[cut_categories, color_categories, clarity_categories]))),
        ("scaler", StandardScaler())
    ]
)

In [11]:
preprocessor = ColumnTransformer([
    ("numerical_pipeline", numerical_pipeline, numerical_feature),
    ("categorical_pipeline", categorical_pipeline, categorical_feature)
])

### Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=20)

In [13]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns = preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=preprocessor.get_feature_names_out())

In [14]:
X_train.head(2)

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,0.41915,-1.251625,2.036898,0.635523,0.629143,0.467346,-0.828347,0.223394,-1.236431
1,0.690926,-0.820594,0.244281,0.954662,0.832461,0.769698,0.072466,0.808115,-0.050149


In [15]:
X_test.head(2)

Unnamed: 0,numerical_pipeline__carat,numerical_pipeline__depth,numerical_pipeline__table,numerical_pipeline__x,numerical_pipeline__y,numerical_pipeline__z,categorical_pipeline__cut,categorical_pipeline__color,categorical_pipeline__clarity
0,1.903466,1.262725,-0.652027,1.637266,1.501719,1.717981,-0.828347,-1.53077,-1.236431
1,2.676982,0.616178,0.692435,2.12484,2.001544,2.116535,0.072466,0.808115,-0.64329


### Model Training

In [16]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [21]:
regress = LinearRegression()

model = regress.fit(X_train, y_train)

In [22]:
regress.coef_

array([ 5205.77118173,  -113.32491185,   -71.15310183, -1034.22228376,
          18.38728851,   -12.24665064,   128.85539149,  -559.92880478,
         815.30157511])

In [23]:
regress.intercept_

3953.238368355995

In [24]:
regress.score(X_test, y_test)

0.9071631629908168

In [28]:
### check for the metrics score
import numpy as np
def evaluate_model(actual, pred):
    mse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    score = r2_score(y_test, y_pred)

    return score, mae, mse

y_pred = model.predict(X_test)

evaluate_model(y_test, y_pred)

(0.9071631629908168, 805.6305993140015, 1216.9983421766688)

In [32]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge": Ridge(),
    "ElasticNet" : ElasticNet()
}

model_list = []
r2_score_list = []
for i in models:
    model = models[i]
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    #evaluate model

    score, mae, mse = evaluate_model(y_test, y_pred)

    print(i)
    model_list.append(i)
    r2_score_list.append(score)

    print("Model Evaluation Score")
    print("PMSE", score)
    print("Mean Absolute Error", mae)
    print("Mean Squared Error", mse)

    print("="* 30)
    print("\n")

LinearRegression
Model Evaluation Score
PMSE 0.9071631629908168
Mean Absolute Error 805.6305993140015
Mean Squared Error 1216.9983421766688


Lasso
Model Evaluation Score
PMSE 0.9071414796792513
Mean Absolute Error 807.0188172346014
Mean Squared Error 1217.1404571732924


Ridge
Model Evaluation Score
PMSE 0.9071625523603883
Mean Absolute Error 805.8528904266923
Mean Squared Error 1217.0023445480392


ElasticNet
Model Evaluation Score
PMSE 0.8391941720350486
Mean Absolute Error 1064.5482654084396
Mean Squared Error 1601.6982872031358




In [33]:
print(model_list)
print(r2_score_list)

['LinearRegression', 'Lasso', 'Ridge', 'ElasticNet']
[0.9071631629908168, 0.9071414796792513, 0.9071625523603883, 0.8391941720350486]
