In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("data/gemstone.csv")
df.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,price
0,0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [3]:
df.drop("id",axis=1,inplace=True)

In [4]:
df.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55,13619
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05,13387
2,0.7,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.5,2772
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71,666
4,1.7,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77,14453


In [5]:
categoricalFeatures = [feature for feature in df.columns if df[feature].dtype == object ]
numericalFeatures = [feature for feature in df.columns if df[feature].dtype != object ]

print("categorical feateres are : ", categoricalFeatures)
print("numerical features are : ",numericalFeatures)

categorical feateres are :  ['cut', 'color', 'clarity']
numerical features are :  ['carat', 'depth', 'table', 'x', 'y', 'z', 'price']


In [6]:
X = df.drop("price",axis=1)
y = df[["price"]]
X

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
0,1.52,Premium,F,VS2,62.2,58.0,7.27,7.33,4.55
1,2.03,Very Good,J,SI2,62.0,58.0,8.06,8.12,5.05
2,0.70,Ideal,G,VS1,61.2,57.0,5.69,5.73,3.50
3,0.32,Ideal,G,VS1,61.6,56.0,4.38,4.41,2.71
4,1.70,Premium,G,VS2,62.6,59.0,7.65,7.61,4.77
...,...,...,...,...,...,...,...,...,...
193568,0.31,Ideal,D,VVS2,61.1,56.0,4.35,4.39,2.67
193569,0.70,Premium,G,VVS2,60.3,58.0,5.75,5.77,3.47
193570,0.73,Very Good,F,SI1,63.1,57.0,5.72,5.75,3.62
193571,0.34,Very Good,D,SI1,62.9,55.0,4.45,4.49,2.81


In [7]:
y

Unnamed: 0,price
0,13619
1,13387
2,2772
3,666
4,14453
...,...
193568,1130
193569,2874
193570,3036
193571,681


In [8]:
df[categoricalFeatures]

Unnamed: 0,cut,color,clarity
0,Premium,F,VS2
1,Very Good,J,SI2
2,Ideal,G,VS1
3,Ideal,G,VS1
4,Premium,G,VS2
...,...,...,...
193568,Ideal,D,VVS2
193569,Premium,G,VVS2
193570,Very Good,F,SI1
193571,Very Good,D,SI1


In [9]:
df["cut"].value_counts()

cut
Ideal        92454
Premium      49910
Very Good    37566
Good         11622
Fair          2021
Name: count, dtype: int64

In [10]:
df["color"].value_counts()

color
G    44391
E    35869
F    34258
H    30799
D    24286
I    17514
J     6456
Name: count, dtype: int64

In [11]:
df["clarity"].value_counts()

clarity
SI1     53272
VS2     48027
VS1     30669
SI2     30484
VVS2    15762
VVS1    10628
IF       4219
I1        512
Name: count, dtype: int64

In [12]:
cutCategory = ["Fair","Good","Very Good","Premium","Ideal"]
cutCategory

['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

In [13]:
colorCategory = ["D","E","F","G","H","I","J"]
clarityCategory = ["I1", "SI2", "SI1", "VS2","VS1","VVS2","VVS1","IF"]

print(colorCategory)
print(clarityCategory)

['D', 'E', 'F', 'G', 'H', 'I', 'J']
['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']


In [14]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [15]:
## Numerical pipeline
numericalPipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
        ]
)

## Categorical Pipeline

categoricalPipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("ordinalEncoder",OrdinalEncoder(categories=[cutCategory,colorCategory,clarityCategory] ) ),
        ("standerScaler",StandardScaler() ) 
    ]
)


In [16]:
preprocessor = ColumnTransformer(
    [ 
    ("numerical pipeline", numericalPipeline, numericalFeatures),
    ("categorical pipeline", categoricalPipeline, categoricalFeatures) 
    ]
)

In [17]:
preprocessor

In [18]:
numericalPipeline

In [19]:
categoricalPipeline

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30,random_state=27)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((135501, 9), (58072, 9), (135501, 1), (58072, 1))

In [22]:
X_train

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
103109,0.61,Premium,I,I1,62.4,59.0,5.44,5.39,3.38
61053,1.01,Ideal,G,SI2,61.6,55.0,6.49,6.47,3.99
163452,2.01,Premium,H,SI2,60.7,60.0,8.11,8.17,4.94
177879,0.30,Ideal,G,IF,61.9,56.0,4.35,4.32,2.68
134049,0.82,Premium,D,SI1,61.2,60.0,6.06,5.99,3.69
...,...,...,...,...,...,...,...,...,...
4848,1.00,Very Good,G,SI2,60.5,61.0,6.37,6.42,3.87
14879,0.60,Ideal,G,VS1,62.8,56.0,5.34,5.37,3.36
36680,1.51,Premium,H,VS2,61.4,58.0,7.41,7.36,4.54
118456,1.01,Ideal,H,VS2,61.8,56.0,6.43,6.45,3.98


In [23]:
categoricalFeatures

['cut', 'color', 'clarity']

In [24]:
numericalFeatures

['carat', 'depth', 'table', 'x', 'y', 'z', 'price']

In [25]:
numericalFeatures.remove("price")

In [26]:
preprocessor.fit_transform(X_train)

array([[-0.39128434,  0.53705837,  0.92384104, ..., -0.13286416,
         1.52762163, -1.98180243],
       [ 0.4747998 , -0.2021371 , -1.15802288, ...,  0.87295885,
         0.29826437, -1.31477755],
       [ 2.64001013, -1.03373201,  1.44430702, ..., -0.13286416,
         0.912943  , -1.31477755],
       ...,
       [ 1.55740497, -0.38693597,  0.40337506, ..., -0.13286416,
         0.912943  ,  0.0192722 ],
       [ 0.4747998 , -0.01733823, -0.6375569 , ...,  0.87295885,
         0.912943  ,  0.0192722 ],
       [ 1.55740497, -0.01733823,  0.40337506, ..., -0.13286416,
        -1.54577152, -1.31477755]])

In [27]:
preprocessor.get_feature_names_out()

array(['numerical pipeline__carat', 'numerical pipeline__depth',
       'numerical pipeline__table', 'numerical pipeline__x',
       'numerical pipeline__y', 'numerical pipeline__z',
       'categorical pipeline__cut', 'categorical pipeline__color',
       'categorical pipeline__clarity'], dtype=object)

In [28]:
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(preprocessor.transform(X_test), columns= preprocessor.get_feature_names_out())

In [29]:
X_train

Unnamed: 0,numerical pipeline__carat,numerical pipeline__depth,numerical pipeline__table,numerical pipeline__x,numerical pipeline__y,numerical pipeline__z,categorical pipeline__cut,categorical pipeline__color,categorical pipeline__clarity
0,-0.391284,0.537058,0.923841,-0.248949,-0.300244,-0.224043,-0.132864,1.527622,-1.981802
1,0.474800,-0.202137,-1.158023,0.698370,0.680476,0.660500,0.872959,0.298264,-1.314778
2,2.640010,-1.033732,1.444307,2.159948,2.224203,2.038066,-0.132864,0.912943,-1.314778
3,-1.062500,0.075061,-0.637557,-1.232357,-1.271884,-1.239091,0.872959,0.298264,2.687372
4,0.063410,-0.571735,1.444307,0.310420,0.244601,0.225479,-0.132864,-1.545772,-0.647753
...,...,...,...,...,...,...,...,...,...
135496,0.453148,-1.218531,1.964773,0.590105,0.635073,0.486491,-1.138687,0.298264,-1.314778
135497,-0.412936,0.906656,-0.637557,-0.339170,-0.318406,-0.253044,0.872959,0.298264,0.686297
135498,1.557405,-0.386936,0.403375,1.528402,1.488662,1.458038,-0.132864,0.912943,0.019272
135499,0.474800,-0.017338,-0.637557,0.644237,0.662315,0.645999,0.872959,0.912943,0.019272


In [30]:
X_test

Unnamed: 0,numerical pipeline__carat,numerical pipeline__depth,numerical pipeline__table,numerical pipeline__x,numerical pipeline__y,numerical pipeline__z,categorical pipeline__cut,categorical pipeline__color,categorical pipeline__clarity
0,-0.456241,1.830650,-0.637557,-0.411347,-0.381971,-0.253044,-2.144510,-0.316414,-1.314778
1,1.622361,-0.756534,0.403375,1.582534,1.534066,1.443537,-0.132864,-0.316414,0.019272
2,-0.174763,0.721857,1.964773,0.039758,-0.018741,0.080472,-0.132864,0.298264,1.353322
3,-1.019195,-0.109738,-0.117091,-1.160180,-1.190157,-1.181088,-0.132864,0.298264,1.353322
4,0.951146,0.629458,-0.637557,1.068275,1.016464,1.081020,0.872959,0.298264,0.019272
...,...,...,...,...,...,...,...,...,...
58067,0.951146,0.259860,-0.637557,1.032187,1.061867,1.066519,0.872959,2.142300,0.019272
58068,-0.845979,-0.479335,-0.637557,-0.844407,-0.826927,-0.862073,0.872959,-0.316414,-1.314778
58069,-0.759370,-1.403330,0.403375,-0.718098,-0.690716,-0.818571,-1.138687,0.912943,2.687372
58070,1.535753,0.352260,1.444307,1.384048,1.434178,1.429037,-0.132864,0.298264,-1.314778


In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [32]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso Regression" : Lasso(),
    "Ridge Regression" : Ridge(),
    "Elastic Regression" : ElasticNet(),
    "SVR" : SVR(),
    "Decision tree Regression" : DecisionTreeRegressor()
}

In [33]:
from sklearn.metrics import r2_score

In [34]:
scoreOfModel = {}

for i in models.keys():
    print(i)
    model = models[i]
    model.fit(X_train,y_train)
    y_predict = model.predict(X_test)
    
    score = r2_score(y_test,y_predict)
    scoreOfModel[i] = score   
    

Linear Regression
Lasso Regression
Ridge Regression
Elastic Regression
SVR


  y = column_or_1d(y, warn=True)


Decision tree Regression


In [35]:
scoreOfModel

{'Linear Regression': 0.9362719761206517,
 'Lasso Regression': 0.9362975259879628,
 'Ridge Regression': 0.9362726555226496,
 'Elastic Regression': 0.8555748927588572,
 'SVR': 0.7695530045828658,
 'Decision tree Regression': 0.9585534993618225}

In [36]:
## Lasso regression we get the highest accuracy 

In [37]:
## we use lasso regression