# Dataframe Test

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib as plt
import matplotlib.pyplot as plt

In [29]:
# Load the test set from ../data; the file is decoded as Windows-1252 (cp1252)
# rather than with pandas' default UTF-8.
test_data = pd.read_csv("../data/test.csv", encoding='cp1252')
# Preview the first five rows.
test_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z
0,0,2.01,Ideal,H,SI1,61.9,57.0,8.14,8.05,5.01
1,1,0.49,Good,D,VS1,57.5,60.0,5.18,5.25,3.0
2,2,1.03,Premium,F,SI1,58.6,62.0,6.65,6.6,3.88
3,3,0.9,Very Good,E,SI1,63.0,56.0,6.11,6.15,3.86
4,4,0.59,Ideal,D,SI1,62.5,55.0,5.35,5.4,3.36


In [4]:
# List the column labels of the loaded frame.
test_data.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
# Count missing values per column.
test_data.isna().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [6]:
# (number of rows, number of columns) of the loaded frame.
test_data.shape

(13485, 10)

- As the `isnull().sum()` output above shows, none of the columns contains null values.

In [7]:
# Summarise each column's dtype and non-null count, plus total memory usage.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       13485 non-null  int64  
 1   carat    13485 non-null  float64
 2   cut      13485 non-null  object 
 3   color    13485 non-null  object 
 4   clarity  13485 non-null  object 
 5   depth    13485 non-null  float64
 6   table    13485 non-null  float64
 7   x        13485 non-null  float64
 8   y        13485 non-null  float64
 9   z        13485 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 1.0+ MB


### 1. Categorical values

#### 1.1. Cut

In [8]:
# Distinct cut labels present in the test set.
test_data["cut"].unique()

array(['Ideal', 'Good', 'Premium', 'Very Good', 'Fair'], dtype=object)

In [9]:
# Ordinal code for each cut label (1-5).
# NOTE(review): presumably 5 is the best grade and 1 the worst — confirm the
# intended ordering against the data dictionary.
cut_map = dict(
    zip(
        ("Ideal", "Premium", "Very Good", "Good", "Fair"),
        (5, 4, 3, 2, 1),
    )
)

In [12]:
# Encode the text cut label as an integer code using `cut_map`.
test_data["cut_num"] = test_data["cut"].map(cut_map)

# Series.map leaves NaN for any label that is missing from the mapping; fail
# loudly here instead of letting NaNs (and a silent int -> float upcast of
# `cut_num`) flow into the exported file.
unmapped_cuts = test_data.loc[test_data["cut_num"].isna(), "cut"].unique()
assert unmapped_cuts.size == 0, f"cut labels missing from cut_map: {list(unmapped_cuts)}"

test_data

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,cut_num
0,0,2.01,Ideal,H,SI1,61.9,57.0,8.14,8.05,5.01,5
1,1,0.49,Good,D,VS1,57.5,60.0,5.18,5.25,3.00,2
2,2,1.03,Premium,F,SI1,58.6,62.0,6.65,6.60,3.88,4
3,3,0.90,Very Good,E,SI1,63.0,56.0,6.11,6.15,3.86,3
4,4,0.59,Ideal,D,SI1,62.5,55.0,5.35,5.40,3.36,5
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.41,Good,F,I1,63.8,57.0,4.72,4.69,3.00,2
13481,13481,2.05,Very Good,H,VS2,63.5,62.0,8.05,7.91,5.07,3
13482,13482,2.22,Premium,I,VS2,60.5,59.0,8.41,8.37,5.08,4
13483,13483,0.51,Ideal,E,SI2,58.3,62.0,5.20,5.19,3.04,5


#### 1.2. Colour

In [21]:
# Distinct colour grades present in the test set.
test_data["color"].unique()

array(['H', 'D', 'F', 'E', 'G', 'I', 'J'], dtype=object)

In [23]:
# One-hot encode `color`. drop_first=True omits the first category level, so
# that level is the implicit baseline (all color_* indicators are zero).
# This rebinds `test_data` without the original `color` column, so re-running
# this cell on the same kernel fails because the column no longer exists.
# NOTE(review): the dummy columns produced depend on the categories present in
# this file — confirm they match the encoding applied to the training data.
test_data = pd.get_dummies(test_data, columns=["color"], drop_first=True)

In [24]:
# Preview the first five rows after one-hot encoding `color`.
test_data.head()

Unnamed: 0,id,carat,cut,depth,table,x,y,z,cut_num,clarity_IF,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,color_E,color_F,color_G,color_H,color_I,color_J
0,0,2.01,Ideal,61.9,57.0,8.14,8.05,5.01,5,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0.49,Good,57.5,60.0,5.18,5.25,3.0,2,0,...,1,0,0,0,0,0,0,0,0,0
2,2,1.03,Premium,58.6,62.0,6.65,6.6,3.88,4,0,...,0,0,0,0,0,1,0,0,0,0
3,3,0.9,Very Good,63.0,56.0,6.11,6.15,3.86,3,0,...,0,0,0,0,1,0,0,0,0,0
4,4,0.59,Ideal,62.5,55.0,5.35,5.4,3.36,5,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Snapshot of `test_data` without the text `cut` column (its numeric copy
# `cut_num` is kept).
# NOTE(review): when the notebook is run top to bottom this cell executes
# before the clarity encoding further down, so `test_data_1` still holds the
# raw `clarity` column at this point.
test_data_1 = test_data.drop(columns=["cut"])
test_data_1

Unnamed: 0,id,carat,depth,table,x,y,z,cut_num,clarity_IF,clarity_SI1,...,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2,color_E,color_F,color_G,color_H,color_I,color_J
0,0,2.01,61.9,57.0,8.14,8.05,5.01,5,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,0.49,57.5,60.0,5.18,5.25,3.00,2,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,1.03,58.6,62.0,6.65,6.60,3.88,4,0,1,...,0,0,0,0,0,1,0,0,0,0
3,3,0.90,63.0,56.0,6.11,6.15,3.86,3,0,1,...,0,0,0,0,1,0,0,0,0,0
4,4,0.59,62.5,55.0,5.35,5.40,3.36,5,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.41,63.8,57.0,4.72,4.69,3.00,2,0,0,...,0,0,0,0,0,1,0,0,0,0
13481,13481,2.05,63.5,62.0,8.05,7.91,5.07,3,0,0,...,0,1,0,0,0,0,0,1,0,0
13482,13482,2.22,60.5,59.0,8.41,8.37,5.08,4,0,0,...,0,1,0,0,0,0,0,0,1,0
13483,13483,0.51,58.3,62.0,5.20,5.19,3.04,5,0,0,...,0,0,0,0,1,0,0,0,0,0


In [27]:
# Check column dtypes and non-null counts of `test_data_1`.
test_data_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13485 entries, 0 to 13484
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            13485 non-null  int64  
 1   carat         13485 non-null  float64
 2   depth         13485 non-null  float64
 3   table         13485 non-null  float64
 4   x             13485 non-null  float64
 5   y             13485 non-null  float64
 6   z             13485 non-null  float64
 7   cut_num       13485 non-null  int64  
 8   clarity_IF    13485 non-null  uint8  
 9   clarity_SI1   13485 non-null  uint8  
 10  clarity_SI2   13485 non-null  uint8  
 11  clarity_VS1   13485 non-null  uint8  
 12  clarity_VS2   13485 non-null  uint8  
 13  clarity_VVS1  13485 non-null  uint8  
 14  clarity_VVS2  13485 non-null  uint8  
 15  color_E       13485 non-null  uint8  
 16  color_F       13485 non-null  uint8  
 17  color_G       13485 non-null  uint8  
 18  color_H       13485 non-nu

#### 1.3. Clarity

In [17]:
# Number of distinct clarity grades.
test_data["clarity"].nunique()

8

In [18]:
# Distinct clarity grades present in the test set.
test_data["clarity"].unique()

array(['SI1', 'VS1', 'VS2', 'SI2', 'VVS1', 'IF', 'VVS2', 'I1'],
      dtype=object)

In [19]:
# One-hot encode `clarity`. drop_first=True omits the first category level, so
# that level is the implicit baseline (all clarity_* indicators are zero).
# This rebinds `test_data` without the original `clarity` column, so re-running
# this cell on the same kernel fails because the column no longer exists.
# NOTE(review): the dummy columns produced depend on the categories present in
# this file — confirm they match the encoding applied to the training data.
test_data = pd.get_dummies(test_data, columns=["clarity"], drop_first=True)

In [20]:
# Preview the first five rows after one-hot encoding `clarity`.
test_data.head()

Unnamed: 0,id,carat,cut,color,depth,table,x,y,z,cut_num,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,2.01,Ideal,H,61.9,57.0,8.14,8.05,5.01,5,0,1,0,0,0,0,0
1,1,0.49,Good,D,57.5,60.0,5.18,5.25,3.0,2,0,0,0,1,0,0,0
2,2,1.03,Premium,F,58.6,62.0,6.65,6.6,3.88,4,0,1,0,0,0,0,0
3,3,0.9,Very Good,E,63.0,56.0,6.11,6.15,3.86,3,0,1,0,0,0,0,0
4,4,0.59,Ideal,D,62.5,55.0,5.35,5.4,3.36,5,0,1,0,0,0,0,0


In [28]:
# Rebuild the export frame from the current `test_data` instead of relying on
# the earlier `test_data_1` snapshot: when the notebook is run top to bottom
# that snapshot is taken before the clarity encoding, so it would be written
# with the raw `clarity` strings and without the clarity_* indicator columns.
test_data_1 = test_data.drop(columns="cut")

# Guard the export: every categorical source column must be encoded or dropped.
unencoded = sorted({"cut", "color", "clarity"} & set(test_data_1.columns))
assert not unencoded, f"categorical columns still unencoded: {unencoded}"

# NOTE(review): the row index is written as an unnamed first column; pass
# index=False if downstream readers do not expect it.
test_data_1.to_csv("../data/test_data.csv")