In [2]:
import pandas as pd 
import numpy as np 
import seaborn as sns

## Model Training

In [3]:
data = pd.read_csv("data/gemstone.csv")

In [4]:
## Drop Id colum not use full
data.drop("id",axis=1,inplace=True)

In [5]:
## Saprate Independent and Dependent Feature
x = data.iloc[:,:-1]
y = data.iloc[:,-1]

In [6]:
## saprate numerical and categorical features
categorical_features = x.select_dtypes(include="object").columns
numerical_features = x.select_dtypes(exclude="object").columns
print(categorical_features)
print(numerical_features)

Index(['cut', 'color', 'clarity'], dtype='object')
Index(['carat', 'depth', 'table', 'x', 'y', 'z'], dtype='object')


In [7]:
# Define the custom ranking for each ordinal variable
cut_categories = ['Fair', 'Good', 'Very Good','Premium','Ideal']
color_categories = ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']

In [8]:
from sklearn.impute import SimpleImputer ## handeling missing values
from sklearn.preprocessing import  StandardScaler ## handaling feature scalling
from sklearn.preprocessing import OrdinalEncoder ## Ordinal Encoding
## Piplines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [9]:
## Numerical pipline
num_pipline = Pipeline(
    steps = [
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler",StandardScaler())
    ]
)

## Catigorical Pipline
cato_pipline = Pipeline(
    steps = [
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("ordinal",OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
        ("Scaler",StandardScaler())
    ]
)

preprossor = ColumnTransformer([

    ("num_pipline",num_pipline,numerical_features),
    ("cato_pipline",cato_pipline,categorical_features)

])

In [10]:
## Train Test Split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=42)

In [11]:
# Traisform data
X_train = preprossor.fit_transform(X_train)
X_test = preprossor.transform(X_test)

## Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

In [13]:
linear = LinearRegression()

In [14]:
linear.fit(X_train,y_train)

In [15]:
linear.coef_

array([ 6432.97591819,  -132.34206204,   -70.48787525, -1701.38593925,
        -494.17005097,   -76.32351645,    68.80035873,  -464.67990411,
         652.10059539])

In [16]:
linear.intercept_

3976.878738902296

In [17]:
y_predict = linear.predict(X_test)

In [18]:
y_predict

array([ 1616.03275998, 15104.13631181,  1727.49228115, ...,
        1878.27425152,  6295.06951547,  5976.94207688])

In [19]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [20]:
evaluate_model(y_test,y_predict)

(675.0758270067445, 1014.6296630375483, 0.9362906819996045)

In [21]:
## Train multiple models
models = {
    "LinearRegression":LinearRegression(),
    "Ridge":Ridge(),
    "Lesso":Lasso(),
    "Elastic":ElasticNet()
}

model_list = []
r2_list = []


for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    ## Make Prediction
    y_predict = model.predict(X_test)


    mae, rmse, r2_square = evaluate_model(y_test,y_predict)
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])


    print("model Traning performance")
    print("RMSE",rmse*100)
    print("MAE",mae*100)
    print("R2 Score", r2_square*100)
    print("*"*60)


LinearRegression
model Traning performance
RMSE 101462.96630375483
MAE 67507.58270067445
R2 Score 93.62906819996046
************************************************************
Ridge
model Traning performance
RMSE 101463.4323353442
MAE 67510.77629781349
R2 Score 93.6290096749163
************************************************************
Lesso
model Traning performance
RMSE 101465.91302750638
MAE 67624.21173665505
R2 Score 93.62869814082755
************************************************************
Elastic
model Traning performance
RMSE 153335.41245902312
MAE 106094.32977143009
R2 Score 85.44967219374031
************************************************************
