In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filepath in (os.path.join(dirname, name) for name in filenames):
        print(filepath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Loading and exploring data

In [None]:
# Read the diamonds CSV from the Kaggle input directory and preview its first rows.
diamonds_csv_path = '/kaggle/input/diamonds/diamonds.csv'
df = pd.read_csv(filepath_or_buffer=diamonds_csv_path)
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

**Since the dataset author mentioned that some duplicate records were detected, let's get rid of the duplicated rows.**

In [None]:
# Count the rows that repeat an earlier row on these five columns
# (keep='first' means the first occurrence of each group is not flagged).
duplicate_mask = df.duplicated(
    subset=['carat', 'cut', 'color', 'clarity', 'price'],
    keep='first',
)
duplicate_mask.sum()

In [None]:
# Keep only the first row of every (carat, cut, color, clarity, price) combination,
# then preview the de-duplicated frame.
df = df.drop_duplicates(
    subset=['carat', 'cut', 'color', 'clarity', 'price'],
    keep='first',
)
df.head()

In [None]:
# Summary statistics for every column; include='all' also covers the non-numeric columns.
df.describe(include='all')

In [None]:
# Distinct labels present in the 'cut' column (used to write the mapping in the next code cell).
df['cut'].unique()

**I prefer to label-encode the cut myself, as it is ordinal data (meaning the ranking matters). The cut grading is further explained on this website: https://www.loosediamondsreviews.com/diamondcut.html**

In [None]:
# Ordinal ranks for the cut labels, listed from lowest grade (0) to highest (4).
# The keys keep the b'...' wrapper, presumably to match the raw labels in the
# 'cut' column -- compare with the output of df['cut'].unique().
cut_grades = dict(zip(
    ["b'Fair'", "b'Good'", "b'Very Good'", "b'Premium'", "b'Ideal'"],
    range(5),
))
cut_grades

In [None]:
# Distinct labels present in the 'clarity' column (used to write the mapping in the next code cell).
df['clarity'].unique()

**I will also encode clarity myself, in the same way as the cut grades. Clarity grading is further explained on this website: https://www.americangemsociety.org/buying-diamonds-with-confidence/4cs-of-diamonds/understanding-diamond-clarity-the-4cs-of-diamonds/**

In [None]:
# Ordinal ranks for the clarity labels, listed from lowest grade (0) to highest (7).
# The keys keep the b'...' wrapper, presumably to match the raw labels in the
# 'clarity' column -- compare with the output of df['clarity'].unique().
clarity_grades = {
    label: rank
    for rank, label in enumerate(
        ["b'I1'", "b'SI2'", "b'SI1'", "b'VS2'", "b'VS1'", "b'VVS2'", "b'VVS1'", "b'IF'"]
    )
}
clarity_grades

In [None]:
# Distinct labels present in the 'color' column.
df['color'].unique()

In [None]:
# Replace the textual cut / clarity labels with their ordinal ranks.
cut_encoded = df['cut'].map(cut_grades)
clarity_encoded = df['clarity'].map(clarity_grades)

# Series.map yields NaN for any label that is missing from the lookup dict
# (e.g. if the CSV stores 'Ideal' rather than "b'Ideal'"). Fail here with the
# offending labels instead of letting NaNs flow silently into the models.
for source_col, encoded in (('cut', cut_encoded), ('clarity', clarity_encoded)):
    unmapped = df.loc[encoded.isna(), source_col].unique()
    if len(unmapped):
        raise ValueError(f'Unmapped {source_col} labels: {list(unmapped)}')

df['cut_encoded'] = cut_encoded
df['clarity_encoded'] = clarity_encoded
df.head()

**As for color, I treat it as a nominal feature (no ranking involved), so I'll one-hot encode it.**

In [None]:
# One indicator column per color label, prefixed with 'color';
# drop_first=True omits the first level so the remaining columns are not redundant.
color_encoded = pd.get_dummies(
    data=df['color'],
    prefix='color',
    drop_first=True,
)
color_encoded.head()

In [None]:
# Append the color indicator columns to the right of the existing columns
# (column-wise concat, aligned on the row index).
df = pd.concat(objs=[df, color_encoded], axis=1)
df.head()

In [None]:
# The raw text columns have been replaced by their encoded versions, so remove them.
df = df.drop(columns=['cut', 'color', 'clarity'])
df.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Pairwise relationship plots across the columns of df.
sns.pairplot(df)

In [None]:
# Annotated correlation heatmap of df, drawn on a 30x20 inch figure.
fig, ax = plt.subplots(figsize=(30, 20))
sns.heatmap(df.corr(), annot=True, ax=ax)

**From the two plots above, it can be seen that carat, x, y, and z are each significantly correlated with price.**

# 2. Preparing data for machine learning

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Separate the target (price) from the feature matrix (every other column).
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=2023,
)
# Sanity-check the four resulting shapes.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Scaler that rescales each feature to a fixed range ([0, 1] with the default settings).
scaler = MinMaxScaler()

In [None]:
# Learn the per-feature min/max from the training split only, then apply that
# same scaling to both splits so no test-set statistics enter the scaler.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model exploration

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, SGDRegressor, BayesianRidge
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from xgboost.sklearn import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Candidate regressors, keyed by the display name used in the comparison table.
# All hyperparameters are library defaults except:
#   * random_state=2023 (the same seed as the train/test split) on the estimators
#     whose default fit is randomised -- feature permutation in gradient boosting,
#     bootstrap sampling in the random forest, sample shuffling in SGD -- so the
#     comparison is identical on every Restart & Run All;
#   * verbose=0 on CatBoost, which otherwise logs every boosting iteration for
#     each cross-validation fit and buries the notebook output.
models = {
    'GB Regressor': GradientBoostingRegressor(random_state=2023),
    'RF Regressor': RandomForestRegressor(random_state=2023),
    'Linear Regression': LinearRegression(),
    'ElasticNet': ElasticNet(),
    'SGD Regressor': SGDRegressor(random_state=2023),
    'Bayesian Ridge': BayesianRidge(),
    'SVR': SVR(),
    'CatBoost': CatBoostRegressor(verbose=0),
    'Kernel Ridge': KernelRidge(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor()
}
models

In [None]:
# Cross-validate every candidate on the scaled training data and record its mean MSE.
# The scorer returns MSE negated (higher is better), hence the sign flip.
model_names, model_mean_mses = [], []
for model_name, model in models.items():
    scores = cross_val_score(
        estimator=model,
        X=X_train_scaled,
        y=y_train,
        scoring='neg_mean_squared_error',
    )
    model_mean_mses.append(-scores.mean())
    model_names.append(model_name)
    print(f'Model mean MSE calculation completed for {model_name}')

In [None]:
# Collect the cross-validation results into a two-column comparison table.
model_scores = pd.DataFrame({
    'model': model_names,
    'mean mse': model_mean_mses,
})
model_scores

**From the model comparison, CatBoostRegressor has the lowest mean cross-validated MSE. Therefore, we will proceed with CatBoostRegressor.**

In [None]:
# Fresh CatBoost regressor with default hyperparameters, to be used as the final model.
final_model = CatBoostRegressor()

In [None]:
# Train the final model on the scaled training features.
final_model.fit(X_train_scaled, y_train)

# 4. Model validation using test data

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

In [None]:
# Predict prices for the scaled, held-out test features.
y_predicted = final_model.predict(X_test_scaled)

In [None]:
# Error metrics on the held-out test set, plus the Spearman rank correlation
# (with its p-value) between the actual and the predicted prices.
correlation_stats = spearmanr(y_test, y_predicted)
mae = mean_absolute_error(y_true=y_test, y_pred=y_predicted)
mse = mean_squared_error(y_true=y_test, y_pred=y_predicted)

In [None]:
# Report MSE and MAE to four decimals, along with the Spearman coefficient and its p-value.
print(f'MSE = {mse:.4f}, MAE = {mae:.4f}, Correlation = {correlation_stats.correlation}, Correlation P-value = {correlation_stats.pvalue}')