### Initializing the Notebook

In [30]:
import pandas as pd
import os

# Reuse a `silent` flag that already exists in the global namespace (presumably set by
# a parent notebook/script before running this one — TODO confirm); default to verbose.
silent = globals()["silent"] if "silent" in globals() else False


### Load the Data

In [31]:
from config import DATA_PATH

# Load the raw diamonds data. os.path.join builds the path correctly whether or not
# DATA_PATH ends with a separator (plain "+" concatenation would yield "dir//diamonds.csv",
# or the filesystem-root path "/diamonds.csv" for an empty DATA_PATH).
df = pd.read_csv(os.path.join(DATA_PATH, "diamonds.csv"))
if not silent:
    # Preview the first rows unless the notebook is running in silent mode.
    display(df.head())

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Diamond Dataset Description

### Overview

This dataset contains information about diamonds, including their physical properties and prices. The dataset consists of **53,940 rows** and **10 variables** (features). Here is a detailed description of each feature:

### Variables:

- **price**  
  Price of the diamond in US dollars (ranging from \$326 to \$18,823).

- **carat**  
  Weight of the diamond (ranging from 0.2 to 5.01 carats).

- **cut**  
  Quality of the diamond cut. Categories include:
  - Fair
  - Good
  - Very Good
  - Premium
  - Ideal

- **color**  
  Diamond color grade, ranging from J (worst) to D (best).

- **clarity**  
  A measure of how clear the diamond is: lower grades indicate more inclusions, higher grades indicate clearer diamonds. Categories, ordered from worst to best:
  - I1 (worst)
  - SI2
  - SI1
  - VS2
  - VS1
  - VVS2
  - VVS1
  - IF (best)

- **x**  
  Length of the diamond in millimeters (ranging from 0 to 10.74 mm).

- **y**  
  Width of the diamond in millimeters (ranging from 0 to 58.9 mm).

- **z**  
  Depth of the diamond in millimeters (ranging from 0 to 31.8 mm).

- **depth**  
  Total depth percentage, calculated as $\text{depth} = 100 \times \frac{2z}{x + y}$ (i.e. $z$ divided by the mean of $x$ and $y$, expressed as a percentage). Values range from 43% to 79%.

- **table**  
  The width of the top of the diamond relative to its widest point, ranging from 43% to 95%.


In [None]:
# Clean the freshly loaded frame: drop columns with no values, drop the row-id column
# that came with the CSV, and cast the text columns to pandas' "string" dtype.
if not silent:
    print(f"Before doing anything to the dataset:\n{df.dtypes.to_frame(name='Data Type')}")
    display(df.head())

# show and delete columns where all values are missing
colsAllMissing = df.columns[df.isnull().all()]
if not silent:
    print(f"The Data contains the following collumns which have no values: {colsAllMissing.values}")
df = df.drop(columns=colsAllMissing)

# Delete the unnecessary id column. It is dropped by name rather than by position so
# that re-running this cell is a no-op instead of silently removing `carat` (and then
# `cut`, ...) on each subsequent execution. errors="ignore" covers the already-dropped case.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

if not silent:
    print(f"After deleting ID Row:\n{df.dtypes.to_frame(name='Data Type')}")
    display(df.head())

# casting values: the three descriptive text columns become nullable string columns
df = df.astype({"cut": "string", "color": "string", "clarity": "string"})

# print final structure (respects `silent`, like every other print in this cell)
if not silent:
    print(f"Final structure of the dataset:\n{df.dtypes.to_frame(name='Data Type')}")




Before doing anything to the dataset:
           Data Type
Unnamed: 0     int64
carat        float64
cut           object
color         object
clarity       object
depth        float64
table        float64
price          int64
x            float64
y            float64
z            float64


Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


The Data contains the following collumns which have no values: []
After deleting ID Row:
        Data Type
carat     float64
cut        object
color      object
clarity    object
depth     float64
table     float64
price       int64
x         float64
y         float64
z         float64


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


Final structure of the dataset:
              Data Type
carat           float64
cut              object
color    string[python]
clarity  string[python]
depth           float64
table           float64
price             int64
x               float64
y               float64
z               float64
