In [56]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pickle as pk

In [57]:
df = pd.read_csv('diamonds.csv')
df.columns

Index(['Unnamed: 0', 'carat', 'cut', 'color', 'clarity', 'depth', 'table',
       'price', 'x', 'y', 'z'],
      dtype='object')

In [58]:
df = df.drop('Unnamed: 0', axis=1)

In [59]:
df.shape

(53940, 10)

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [61]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [62]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [63]:
df.sample(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
4764,0.9,Good,D,SI1,63.9,57.0,3689,6.1,6.04,3.88
3038,0.3,Good,F,VS2,63.4,59.0,565,4.23,4.25,2.69


In [64]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [65]:
obj_cols = df.select_dtypes(include=['object']).columns
for col in obj_cols:
    print(f"Value counts for column '{col}':")
    print(df[col].value_counts())

Value counts for column 'cut':
cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64
Value counts for column 'color':
color
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: count, dtype: int64
Value counts for column 'clarity':
clarity
SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: count, dtype: int64


In [66]:
le = LabelEncoder()
for cols in df[obj_cols]:
    df[cols] = le.fit_transform(df[cols])
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,5,5,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75


In [67]:
for col in obj_cols:
    print(f"Value counts for column '{col}':")
    print(df[col].value_counts())

Value counts for column 'cut':
cut
2    21551
3    13791
4    12082
1     4906
0     1610
Name: count, dtype: int64
Value counts for column 'color':
color
3    11292
1     9797
2     9542
4     8304
0     6775
5     5422
6     2808
Name: count, dtype: int64
Value counts for column 'clarity':
clarity
2    13065
5    12258
3     9194
4     8171
7     5066
6     3655
1     1790
0      741
Name: count, dtype: int64


In [68]:
x = df.drop(columns=['price'])
y = df['price']

In [69]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.20,random_state=42)

In [70]:
scaler = StandardScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [71]:
# regressor = LogisticRegression()
# regressor.fit(x_train,y_train)

In [72]:
# regressor.score(x_test,y_test)

In [73]:
from sklearn.linear_model import Ridge, Lasso
rd = Ridge()
rd.fit(x_train,y_train)
rd.score(x_test, y_test)

0.8851399931217373

In [74]:
ls = Lasso()
ls.fit(x_train,y_train)
ls.score(x_test, y_test)

0.8851198421699537

In [75]:
rd2 = Ridge(alpha = 2)
rd2.fit(x_train,y_train)
rd2.score(x_test, y_test)

0.8851401832876508

In [76]:
ls2 = Lasso(alpha=2)
ls2.fit(x_train,y_train)
ls2.score(x_test, y_test)

0.8851081603352668

In [77]:
ls3 = Lasso(alpha=3)
ls3.fit(x_train,y_train)
ls3.score(x_test, y_test)

0.8850841681444188

In [78]:
rd.predict(x_test)

array([ 362.55177926, 3356.41156074, 2139.53347262, ...,  618.17620198,
       7788.27506114, 4643.84555054], shape=(10788,))

In [79]:
pred_data = pd.DataFrame([[0.23, '2', '1', '3', 61.5, 55, 3.95, 3.98, 2.75]],
                          columns=['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'])

In [80]:
scaler.transform(pred_data)

array([[-1.20048588, -0.54076197, -0.93662126, -0.48350433, -0.17017294,
        -1.10328299, -1.59135298, -1.53067954, -1.11468327]])

In [81]:
rd.predict(pred_data)



array([-23957.5206224])

In [82]:
pk.dump(rd, open("diamond_regressor.pkl", 'wb'))
pk.dump(scaler, open("diamond_scaler.pkl", 'wb'))