In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(r'C:\diamond_price_prediction\notebooks\data\archive (16)\diamonds.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [7]:
x = df.drop("price",axis = 1)
y = df["price"]

In [12]:
#segregating numerical and categorical data
categorical_cols = x.select_dtypes(include = object).columns
numerical_cols = x.select_dtypes(exclude = object).columns

In [13]:
categorical_cols

Index(['cut', 'color', 'clarity'], dtype='object')

In [14]:
#define the custom ranking for each ordinal varible
cut_categories     = ['Fair','Good','Very Good','Premium','Ideal']
color_categories   = ['D','E','F','G','H','I','J']
clarity_categories = ['I1','SI2','SI1','VS2','VS1','VVS2','VVS1','IF']


In [15]:
from sklearn.impute import SimpleImputer        #handling missing values
from sklearn.preprocessing import StandardScaler #scaling the values to bring all columns into same scale
from sklearn.preprocessing import OrdinalEncoder #transforming categorical to numerical
from sklearn.pipeline import Pipeline             #
from sklearn.compose import ColumnTransformer


In [22]:
##numerical pipeline
num_pipeline = Pipeline(steps=[
                              ('imputer',SimpleImputer(strategy='median')),
                              ('scaler',StandardScaler())
                              ])
#categorical pipelines
cat_pipeline = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                               ('ordinalencoder',OrdinalEncoder(categories=[cut_categories,color_categories,clarity_categories])),
                               ('sclaer',StandardScaler())
                               ])
#combining numerical and categorical pipeline
preprocessor = ColumnTransformer([('num_pipeline',num_pipeline,numerical_cols),
                                  ('cat_pipeline',cat_pipeline,categorical_cols)])

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state = 30)


In [25]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train),columns = preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.fit_transform(x_test),columns = preprocessor.get_feature_names_out())

In [26]:
preprocessor.get_feature_names_out()

array(['num_pipeline__carat', 'num_pipeline__depth',
       'num_pipeline__table', 'num_pipeline__x', 'num_pipeline__y',
       'num_pipeline__z', 'cat_pipeline__cut', 'cat_pipeline__color',
       'cat_pipeline__clarity'], dtype=object)

In [28]:
## model training
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error


In [29]:
regression = LinearRegression()
regression.fit(x_train,y_train)


In [30]:
regression.coef_

array([ 5126.99748892,  -122.10399799,   -62.94825835, -1013.10763933,
          37.30474228,   -18.89177019,   138.12242666,  -551.03496211,
         823.91860561])

In [31]:
regression.intercept_

3925.5933312145735

In [37]:
import numpy as np 
def evaluate_model(true,predicted):
    mae = mean_absolute_error(true,predicted)
    mse = mean_squared_error(true,predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true,predicted)
    return mae,mse,rmse,r2_square


In [40]:
models = {'LinearRegression':LinearRegression(),
           'Ridge':Ridge(),
            'Lasso':Lasso(),
             'ElasticNet':ElasticNet()}
trained_model_list = []
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    mae,mse,rmse,r2_square = evaluate_model(y_test,y_pred)
    

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    print("model training performance")

    print("mae",mae)
    print("mse",mse)
    print("rmse",rmse)
    print("r2_square",r2_square * 100)
    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
model training performance
mae 802.3652449342082
mse 1483846.179460619
rmse 1218.132250398379
r2_square 90.74787745723947


Ridge
model training performance
mae 802.4729838194025
mse 1483809.2873901746
rmse 1218.1171074203721
r2_square 90.74810748779205


Lasso
model training performance
mae 803.5381482280383
mse 1483540.2090186856
rmse 1218.006653930382
r2_square 90.74978525338601


ElasticNet
model training performance
mae 1071.9620081851047
mse 2639006.55095974
rmse 1624.5019393524096
r2_square 83.5451865977761


