In [None]:
# Load the diamonds dataset from CSV into a DataFrame.
import numpy as np
import pandas as pd

# NOTE(review): absolute path (looks like a Google Colab working directory) -
# confirm the CSV location before running elsewhere.
DATA_PATH = "/content/diamonds.csv"
diamond = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows. A bare last expression uses the notebook's
# rich table display instead of the plain-text dump that print() produces.
diamond.head()

   index  carat      cut color clarity  depth  table  price     x     y     z
0      1   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1      2   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2      3   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3      4   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4      5   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the price column.
diamond.loc[:, "price"].describe()

count    53940.000000
mean      3932.799722
std       3989.439738
min        326.000000
25%        950.000000
50%       2401.000000
75%       5324.250000
max      18823.000000
Name: price, dtype: float64

In [None]:
# Most frequent value(s) of "cut"; missing values are ignored (the default).
diamond["cut"].mode(dropna = True)

0    Ideal
Name: cut, dtype: object

In [None]:
# Pearson correlation matrix between carat and price.
# Interpretation: the coefficient is strongly positive, i.e. the higher the
# carat, the higher the price.
diamond.loc[:, ["carat", "price"]].corr(method = "pearson")

Unnamed: 0,carat,price
carat,1.0,0.921591
price,0.921591,1.0


In [None]:
# Average price for each colour grade (one row per colour).
diamond.groupby("color")[["price"]].mean()

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
D,3169.954096
E,3076.752475
F,3724.886397
G,3999.135671
H,4486.669196
I,5091.874954
J,5323.81802


In [None]:
# Average carat for each cut category (one row per cut).
diamond.groupby("cut")[["carat"]].mean()

Unnamed: 0_level_0,carat
cut,Unnamed: 1_level_1
Fair,1.046137
Good,0.849185
Ideal,0.702837
Premium,0.891955
Very Good,0.806381


In [None]:
# Number of diamonds in each "cut" category, most frequent first.
diamond.loc[:, "cut"].value_counts()

Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: cut, dtype: int64

In [None]:
# Lowest price observed within each colour grade.
diamond.groupby("color")[["price"]].min()

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
D,357
E,326
F,342
G,354
H,337
I,334
J,335


In [None]:
# Highest price observed within each colour grade.
diamond.groupby("color")[["price"]].max()

Unnamed: 0_level_0,price
color,Unnamed: 1_level_1
D,18693
E,18731
F,18791
G,18818
H,18803
I,18823
J,18710


In [None]:
# 85th percentile of "depth": the value below which 85% of the rows fall.
diamond.loc[:, "depth"].quantile(q = 0.85)

62.9

In [None]:
# Count missing values per column. The full boolean mask returned by isna()
# is truncated on display, so it cannot show whether any NaN exists; summing
# each column gives a direct answer (0 means the column has no missing values).
diamond.isna().sum()

Unnamed: 0,index,carat,cut,color,clarity,depth,table,price,x,y,z
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
53935,False,False,False,False,False,False,False,False,False,False,False
53936,False,False,False,False,False,False,False,False,False,False,False
53937,False,False,False,False,False,False,False,False,False,False,False
53938,False,False,False,False,False,False,False,False,False,False,False


In [None]:
# Equal-frequency (quantile) binning of price: pd.qcut splits the values into
# q bins that each hold roughly the same number of rows. The result is kept in
# its own Series named "equalfrequency" rather than in an "equalwidth" column,
# because quantile bins are not equal-width and the pd.cut cell assigns
# diamond["equalwidth"] itself.
labels = ["very low", "low", "moderate", "high", "very high"]
equalfrequency = pd.qcut(diamond["price"], q = 5, labels = labels).rename("equalfrequency")
equalfrequency

0        very low
1        very low
2        very low
3        very low
4        very low
           ...   
53935    moderate
53936    moderate
53937    moderate
53938    moderate
53939    moderate
Name: equalwidth, Length: 53940, dtype: category
Categories (5, object): ['very low' < 'low' < 'moderate' < 'high' < 'very high']

In [None]:
# Equal-width binning of price: four 5000-wide intervals with edges
# 0, 5000, 10000, 15000 and 20000, labelled from "very low" to "very high".
bins = list(range(0, 20001, 5000))
labels = ["very low", "low", "high", "very high"]
diamond["equalwidth"] = pd.cut(x = diamond["price"], bins = bins, labels = labels)
diamond["equalwidth"]

0        very low
1        very low
2        very low
3        very low
4        very low
           ...   
53935    very low
53936    very low
53937    very low
53938    very low
53939    very low
Name: equalwidth, Length: 53940, dtype: category
Categories (4, object): ['very low' < 'low' < 'high' < 'very high']

In [None]:
!pip install category_encoders
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder



In [None]:
# One-hot encoding of "cut": one indicator column per category.
# dtype is pinned to int because the get_dummies default differs between
# pandas versions (uint8 before 2.0, bool from 2.0); this keeps the 0/1
# indicator values regardless of the installed version.
onehot = pd.get_dummies(diamond["cut"], dtype = int)
onehot

Unnamed: 0,Fair,Good,Ideal,Premium,Very Good
0,0,0,1,0,0
1,0,0,0,1,0
2,0,1,0,0,0
3,0,0,0,1,0
4,0,1,0,0,0
...,...,...,...,...,...
53935,0,0,1,0,0
53936,0,1,0,0,0
53937,0,0,0,0,1
53938,0,0,0,1,0


In [None]:
# Ordinal encoding of "cut". The codes follow the quality ranking of the cut
# grades (Fair < Good < Very Good < Premium < Ideal) rather than alphabetical
# order, so a larger code always means a better cut.
mapping = {"Fair" : 1, "Good" : 2, "Very Good" : 3, "Premium" : 4, "Ideal" : 5}
ordinal = diamond["cut"].map(mapping)
ordinal

0        3
1        4
2        2
3        4
4        2
        ..
53935    3
53936    2
53937    5
53938    4
53939    3
Name: cut, Length: 53940, dtype: int64

In [None]:
# Label encoding of "cut": fit the encoder on the column, then transform the
# same column into integer codes.
labelenc = LabelEncoder()
labelenc.fit(diamond["cut"])
labelencoding = labelenc.transform(diamond["cut"])
labelencoding

array([2, 3, 1, ..., 4, 3, 2])

In [None]:
# Count (frequency) encoding: each "cut" value is replaced by the number of
# rows that share that value.
cut_counts = diamond["cut"].value_counts()
countencoding = diamond["cut"].map(cut_counts)
countencoding

0        21551
1        13791
2         4906
3        13791
4         4906
         ...  
53935    21551
53936     4906
53937    12082
53938    13791
53939    21551
Name: cut, Length: 53940, dtype: int64

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
import pandas as pd
import numpy as np

In [None]:
# Standardise carat and depth with StandardScaler and wrap the result in a
# DataFrame. Column labels are given as a flat list: a nested list such as
# [["carat", "depth"]] would build a MultiIndex, making sddataframe["carat"]
# return a DataFrame instead of a Series. The source index is reused so the
# scaled rows stay aligned with diamond.
sd = StandardScaler()
sddata = sd.fit_transform(diamond[["carat", "depth"]])
sddataframe = pd.DataFrame(sddata, columns = ["carat", "depth"], index = diamond.index)
sddataframe

Unnamed: 0,carat,depth
0,-1.198168,-0.174092
1,-1.240361,-1.360738
2,-1.198168,-3.385019
3,-1.071587,0.454133
4,-1.029394,1.082358
...,...,...
53935,-0.164427,-0.662711
53936,-0.164427,0.942753
53937,-0.206621,0.733344
53938,0.130927,-0.523105


In [None]:
# Min-max scale carat and depth with MinMaxScaler and wrap the result in a
# DataFrame. Column labels are given as a flat list: a nested list such as
# [["carat", "depth"]] would build a MultiIndex, making mmdataframe["carat"]
# return a DataFrame instead of a Series. The source index is reused so the
# scaled rows stay aligned with diamond.
mm = MinMaxScaler()
mmdata = mm.fit_transform(diamond[["carat", "depth"]])
mmdataframe = pd.DataFrame(mmdata, columns = ["carat", "depth"], index = diamond.index)
mmdataframe

Unnamed: 0,carat,depth
0,0.006237,0.513889
1,0.002079,0.466667
2,0.006237,0.386111
3,0.018711,0.538889
4,0.022869,0.563889
...,...,...
53935,0.108108,0.494444
53936,0.108108,0.558333
53937,0.103950,0.550000
53938,0.137214,0.500000


In [None]:
# Mean-centre carat: subtract the column mean from every value.
# This overwrites diamond["carat"] in place, so cells that run afterwards see
# the centred values rather than the raw carat weights.
mean_carat = diamond["carat"].mean()
diamond["carat"] = diamond["carat"].sub(mean_carat)
diamond["carat"]

0       -0.56794
1       -0.58794
2       -0.56794
3       -0.50794
4       -0.48794
          ...   
53935   -0.07794
53936   -0.07794
53937   -0.09794
53938    0.06206
53939   -0.04794
Name: carat, Length: 53940, dtype: float64

In [None]:
# Rescale depth so that its median becomes the chosen target value.
# This overwrites diamond["depth"] in place.
actualmedian = diamond["depth"].median()
ownmedian = 58  # target median after rescaling
# Multiplicative factor, derived from the target constant so the value is
# defined in exactly one place.
medianfinal = ownmedian / actualmedian
diamond["depth"] = diamond["depth"] * medianfinal
diamond["depth"]

0        57.718447
1        56.122977
2        53.401294
3        58.563107
4        59.407767
           ...    
53935    57.061489
53936    59.220065
53937    58.938511
53938    57.249191
53939    58.375405
Name: depth, Length: 53940, dtype: float64

In [None]:
# L1 normalisation of the carat column treated as a single vector: the array
# is reshaped to one row so the whole column is normalised together.
from sklearn import preprocessing
diamondarray = np.array(diamond["carat"])
normL1 = preprocessing.normalize(diamondarray.reshape(1, -1), norm = 'l1')
normL1

array([[-2.73733592e-05, -2.83373121e-05, -2.73733592e-05, ...,
        -4.72046534e-06,  2.99115809e-06, -2.31058302e-06]])

In [None]:
# L2 normalisation of the same carat vector, again passed as a single row.
normL2 = preprocessing.normalize(diamondarray.reshape(1, -1), norm = 'l2')
normL2

array([[-0.00515896, -0.00534063, -0.00515896, ..., -0.00088965,
         0.00056373, -0.00043547]])

In [None]:
# Row-wise L2 normalisation: each (carat, depth) row is scaled to unit length.
# Column labels are given as a flat list: a nested list such as
# [["carat", "depth"]] would build a MultiIndex, making normalise["carat"]
# return a DataFrame instead of a Series. The source index is reused so the
# rows stay aligned with diamond.
normalise = preprocessing.normalize(diamond[["carat", "depth"]], norm = 'l2')
normalise = pd.DataFrame(normalise, columns = ["carat", "depth"], index = diamond.index)
normalise

Unnamed: 0,carat,depth
0,-0.009839,0.999952
1,-0.010475,0.999945
2,-0.010635,0.999943
3,-0.008673,0.999962
4,-0.008213,0.999966
...,...,...
53935,-0.001366,0.999999
53936,-0.001316,0.999999
53937,-0.001662,0.999999
53938,0.001084,0.999999
