# Diamonds Price Prediction Project

### Importing relevant libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

### Relevant information

#### Files

- data.csv: training set
- test.csv: test set
- sample_submission.csv: sample submission

#### Features

- id: only for test & sample submission files, id for prediction sample identification
- price: price in USD
- carat: weight of the diamond
- cut: quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color: diamond colour, from J (worst) to D (best)
- clarity: a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))
- x: length in mm
- y: width in mm
- z: depth in mm
- depth: total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)
- table: width of top of diamond relative to widest point (43--95)

### Importing the training csv

In [None]:
# Load the training CSV into a DataFrame (the path is relative to the
# notebook's working directory) and preview its first rows.
training_df = pd.read_csv("../input/diamonds-datamad0120/diamonds_train.csv")
training_df.head()

In [None]:
# Check the shape of the dataframe: (number of rows, number of columns)
training_df.shape

In [None]:
# Count the missing values in each column (one total per column)
training_df.isnull().sum()

In [None]:
# Promote the 'id' column to the DataFrame index (it is removed from the
# regular columns as a result).
training_df = training_df.set_index('id')

In [None]:
# Preview the first rows of the training DataFrame
training_df.head()

In [None]:
# Print the distinct values found in every column, to see which columns hold
# categorical labels that need encoding.
for column_name in training_df:
    distinct_values = training_df[column_name].unique()
    print(f"**** {column_name} **** --> {distinct_values}")

### CONCLUSIONES

Después de ver los valores únicos de cada una de las columnas, se procederá de la siguiente manera:

- Columna *'cut'*: Cada valor tiene una importancia, por lo que se va a proceder a reemplazar los valores por números.

- Columna *'color'*: Los colores van de J (peor) a D (mejor), por lo que también tienen un orden de importancia; se van a reemplazar los valores por números, igual que en *'cut'* y *'clarity'*.

- Columna *'clarity'*: Cada valor tiene una importancia, por lo que se va a proceder a reemplazar los valores por números.

#### Columna *'cut'*

In [None]:
# List of values from the 'cut' column, in the order given in the feature
# description (Fair, Good, Very Good, Premium, Ideal). The position of each
# value in this list is used as its numeric code in the encoding loop.
cuts_column=['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']

In [None]:
# Ordinal-encode 'cut': each label is replaced by its position in
# cuts_column, so a higher number means a later entry in that list.
cut_codes = {label: rank for rank, label in enumerate(cuts_column)}
for label, rank in cut_codes.items():
    print(f"The value '{label}' now is {rank}")
training_df['cut'] = training_df['cut'].replace(cut_codes)

In [None]:
# Show the distinct values now present in the 'cut' column
training_df['cut'].unique()

#### Columna *'clarity'*

In [None]:
# List of values from the 'clarity' column, ordered from worst to best:
# clarity is a measurement of how clear the diamond is
# (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best)).
# The position of each value in this list is used as its numeric code.
clarity_column = ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

In [None]:
# Ordinal-encode 'clarity': each label is replaced by its position in
# clarity_column (0 for 'I1', the worst, up to 7 for 'IF', the best).
clarity_codes = {label: rank for rank, label in enumerate(clarity_column)}
for label, rank in clarity_codes.items():
    print(f"The value '{label}' now is {rank}")
training_df['clarity'] = training_df['clarity'].replace(clarity_codes)

In [None]:
# Show the distinct values now present in the 'clarity' column
training_df['clarity'].unique()

#### Columna *'color'*

In [None]:
# List of values from the 'color' column, ordered from J (worst) to D (best).
# The position of each value in this list is used as its numeric code.
color_column = ['J','I','H','G','F','E','D']

In [None]:
# Ordinal-encode 'color': each label is replaced by its position in
# color_column (0 for 'J', the worst, up to 6 for 'D', the best).
color_codes = {label: rank for rank, label in enumerate(color_column)}
for label, rank in color_codes.items():
    print(f"The value '{label}' now is {rank}")
training_df['color'] = training_df['color'].replace(color_codes)

In [None]:
# Show the distinct values now present in the 'color' column
training_df['color'].unique()

In [None]:
# Preview the first three rows of the training DataFrame
training_df.head(3)

## ¿Cuáles son las columnas con más relación?

In [None]:
from string import ascii_letters
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pairwise correlation matrix between the columns of the training DataFrame
corr = training_df.corr()

In [None]:
# Boolean mask that is True on and above the diagonal of the correlation
# matrix, so the heatmap can hide the redundant upper triangle.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))

In [None]:
# Draw the correlation matrix as an annotated heatmap on an 11x9 figure.
# Cells where `mask` is True are not drawn; the colour scale is centred on 0
# and capped at 0.3.
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=.3,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    ax=ax,
);

## Normalizing and Standardizing the data

In [None]:
# Original (unscaled) data: the target is the 'price' column and the feature
# matrix is every remaining column.
y = training_df['price']
X = training_df.drop('price', axis=1)

In [None]:
# Standardized & normalized "X": StandardScaler is applied first, followed by
# Normalizer, chained in a single scikit-learn pipeline.
# NOTE(review): the pipeline is fitted on the full dataset before any
# train/test split, so held-out rows influence the scaling statistics.
pipeline = [StandardScaler(), Normalizer()]
tr = make_pipeline(*pipeline)
# fit_transform returns a bare array; rebuild the DataFrame with the original
# column names AND the original index so that Xpr stays row-aligned with `y`
# in any index-based operation (e.g. pd.concat).
Xpr = pd.DataFrame(tr.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Hold out 20% of the scaled data for evaluation. A fixed random_state makes
# the split reproducible across runs, matching the seeded split used for the
# model-training section of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    Xpr, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Join the scaled features and the target into one frame for the correlation
# plot. The target is attached positionally (both objects keep the row order
# of training_df) rather than with pd.concat, which aligns on index labels
# and would misalign or introduce NaNs if the two indexes differ.
result = Xpr.copy()
result[y.name] = y.to_numpy()

In [None]:
# Correlation heatmap of the scaled features together with the target.
corr = result.corr()
# True on and above the diagonal, so the redundant upper triangle is hidden.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot=True);

In [None]:
# Restrict the scaled features to a hand-picked subset
# (presumably the columns the heatmap showed as most related to price -- TODO confirm).
corr_features = ['carat', 'table', 'x', 'y', 'z']
y_corr = y
X_corr = Xpr[corr_features]

In [None]:
# Display the scaled feature DataFrame
Xpr

In [None]:
# Hold out 20% of the selected-feature data for evaluation. A fixed
# random_state makes the split reproducible across runs, matching the seeded
# split used for the model-training section of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_corr, y_corr, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Using the selector

In order to check if the relevant columns are the same:

In [None]:
# Shallow random forest (100 trees, depth 3, at least 3 samples per leaf,
# fixed seed) used as the estimator for the feature selector.
rf_reg = RandomForestRegressor(
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=3,
    random_state=111,
)
# Recursive feature elimination with 5-fold cross-validation, removing one
# feature per step.
selector = RFECV(estimator=rf_reg, step=1, cv=5)

In [None]:
# Fit the cross-validated recursive feature elimination on the scaled
# features and the price target
selector.fit(Xpr, y)

In [None]:
# Number of features the fitted selector decided to keep
selector.n_features_

In [None]:
# Names of the columns kept by the selector: support_ is used as a boolean
# mask over the column names of Xpr
pd.Series(Xpr.columns)[selector.support_.tolist()]

####  Model training

In [None]:
# List the column names of the training DataFrame
list(training_df.columns)

In [None]:
# Features and target used to train the model: six of the scaled columns
# (the x, y and z dimensions are left out) and the price from training_df.
model_features = ['carat', 'cut', 'color', 'clarity', 'depth', 'table']
y_training = training_df['price']
X_training = Xpr[model_features]

In [None]:
# Display the feature DataFrame used for model training
X_training

In [None]:
# Reproducible 80/20 train/test split (seed 42) of the model features/target
split_parts = train_test_split(
    X_training,
    y_training,
    test_size=0.2,
    random_state=42,
)
X_train1, X_test1, y_train1, y_test1 = split_parts

In [None]:
# Train the random forest regressor on the training split
rf_reg.fit(X_train1, y_train1)

In [None]:
# Importance assigned by the fitted forest to each feature, in the column
# order of X_train1
rf_reg.feature_importances_

In [None]:
# Print the model's score (R^2 for scikit-learn regressors) on the training
# split first and on the test split second.
for features, target in ((X_train1, y_train1), (X_test1, y_test1)):
    print(rf_reg.score(features, target))

In [None]:
# Evaluate the fitted model on the held-out test split.
# Predictions are computed once and reused for every metric instead of
# calling rf_reg.predict() five times; MSE is also reused to derive RMSE.
y_pred1 = rf_reg.predict(X_test1)
mse = mean_squared_error(y_test1, y_pred1)

print('METRIC SUMMARY')
print('MSE', mse)
print('RMSE', np.sqrt(mse))
print('MSLE', mean_squared_log_error(y_test1, y_pred1))
print('MAE', mean_absolute_error(y_test1, y_pred1))
print('R2', r2_score(y_test1, y_pred1))