In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics


from sklearn.linear_model import LinearRegression
from sklearn. linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor


 
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every file in it.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the training dataset

In [76]:
train_df = pd.read_csv('../input/shai-club/train.csv')
train_df.head()

#### 1. Looking for the dataset shape (rows * columns)

In [77]:
train_df.shape

#### 2. Looking for more info about the columns data type, and null values

In [78]:
train_df.info()

#### 3. Null values

In [79]:
train_df.isnull().sum()

#### 4. Duplicated values

In [80]:
train_df.duplicated().sum()

#### 5. Statistical description

In [81]:
train_df.describe()

### Feature Engineering

#### 1. Combining the diamond dimensions (X,Y, & Z) in one feature (size)

In [82]:
# Collapse the three dimension columns into one feature: the product x * y * z.
train_df['size'] = train_df['x'].mul(train_df['y']).mul(train_df['z'])
train_df['size'].head()

#### 2. Dropping the x, y, & z columns

In [83]:
# x, y and z are now summarised by 'size', so rebuild the frame without them.
train_df = train_df.drop(columns=['x', 'y', 'z'])
train_df.head()

In [84]:
train_df.describe()

#### Plotting the size distribution

In [85]:
# Box plot of the engineered size feature on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
ax.boxplot(train_df['size']);


#### The size column contains an outlier value (>3500) and zero values

#### Looking for zero values

In [86]:
train_df[train_df['size']==0]

#### Looking for the outlier size value

In [87]:
train_df.query('size > 3500')

### Exploring the dataset

In [88]:
train_df['cut'].value_counts()

In [89]:
train_df.groupby('cut')['price'].mean().sort_values()

In [90]:
train_df['clarity'].value_counts()

In [91]:
train_df.groupby('clarity')['price'].mean().sort_values()

In [92]:
train_df['color'].value_counts()

In [93]:
train_df.groupby('color')['price'].mean().sort_values()

#### Distribution of numerical variables

In [94]:
# Numeric columns (target included) whose distributions are inspected here.
num_attribs = ['carat','depth','table','size','price']
train_df.hist(column=num_attribs, bins=50, figsize=(12, 8));
plt.show()

#### Relation between Numerical features and Diamond price

In [95]:
# One scatter panel per numeric feature, each plotted against price.
plt.figure(figsize=[16, 5])
base_color = sns.color_palette()[0]

for panel, feature in enumerate(['carat', 'depth', 'table', 'size'], start=1):
    plt.subplot(1, 4, panel)
    plt.scatter(data=train_df, x=feature, y='price')
    plt.title(feature)


#### Diamond price against the carat and size

In [96]:
# Carat against size, with each point coloured by its price.
fig, ax = plt.subplots()
points = ax.scatter(data=train_df, x='carat', y='size', c='price')
ax.set_xlabel('carat')
ax.set_ylabel('size')
ax.set_title('Diamond price against the carat and size')
fig.colorbar(points, ax=ax);

In [97]:
#Size outliers
train_df[train_df['size']>3500]

In [98]:
# size and carat mismatch
train_df[train_df['size']>800]

#### Plotting the price distribution

In [99]:
sns.histplot(data=train_df, x='price',bins=50);

#### Correlation Matrix

In [100]:
# Pairwise correlations between the numeric columns listed in num_attribs.
fig, ax = plt.subplots(figsize=(12, 8))
corr_matrix = train_df[num_attribs].corr()
sns.heatmap(corr_matrix, annot=True, cmap='viridis', center=0, ax=ax);

#### Relation between qualitative variables and the diamond price

In [101]:
# One box plot of price per categorical feature, all drawn in the same colour.
plt.figure(figsize=[16, 5])
base_color = sns.color_palette()[0]

for panel, category in enumerate(['cut', 'color', 'clarity'], start=1):
    plt.subplot(1, 3, panel)
    sns.boxplot(data=train_df, x=category, y='price', color=base_color)
    if category == 'cut':
        # Only the 'cut' panel tilts its tick labels.
        plt.xticks(rotation=15)


### Data Cleaning (Size column outlier & zero values)

In [102]:
# Size outlier: keep every row whose size is not above 3800.
# NOTE(review): the exploration cells filter on size > 3500 - confirm that both
# cut-offs isolate the same row(s).
idx = ~(train_df['size'] > 3800)

# .copy() makes the filtered frame independent of the original, so the .loc
# assignments below do not raise pandas' SettingWithCopyWarning.
train_df = train_df[idx].copy()

# Size zero values: overwrite them with the column mean
# (the mean is taken while the zeros are still in the column).
h = train_df['size'].mean()
train_df.loc[train_df['size'] == 0, 'size'] = h

# Force carat to 5.1 on the rows with index labels 10541 and 40663.
# NOTE(review): presumably these are the "size and carat mismatch" rows
# (size > 800) inspected earlier - confirm the labels against the data.
lis = [10541, 40663]
train_df.loc[lis, 'carat'] = 5.1

In [103]:
train_df['size'].plot(kind='box')

### Train Test Split

##### Defining the independent and dependent variables

In [104]:
# Select the predictors by name instead of by position, so the selection does not
# depend on the column order left behind by the feature-engineering cells.
x_train = train_df[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'size']]

In [105]:
x_train.head()

In [106]:
x_train.shape

#### Target value

In [107]:
# Target vector: the price column as a NumPy array, selected by name rather than
# by its position relative to the end of the frame.
y_train = train_df['price'].values


#### Data preprocessing 

In [108]:

# Column groups: categorical features are one-hot encoded, numeric ones standardised.
cat_attribs = ['color','cut','clarity']
num_attribs = ['depth','table','carat','size']

categorical_step = ("cat", OneHotEncoder(), cat_attribs)
numeric_step = ("num", StandardScaler(), num_attribs)

full_pipeline = ColumnTransformer([categorical_step, numeric_step])

In [109]:
# Fit the one-hot encoder and the scaler on the training features and encode them in one step.
# NOTE(review): this runs before train_test_split further down, so the fitted
# scaler statistics also include the rows that later become the hold-out set.
x_train = full_pipeline.fit_transform(x_train)

In [110]:
# ColumnTransformer returns either a SciPy sparse matrix or a dense array depending
# on how sparse the stacked output is. Densify only when there is something to
# densify, so the cell also survives being re-run.
if hasattr(x_train, "toarray"):
    x_train = x_train.toarray()

In [111]:
# Splitting the dataset into the Training set and Test set
# (20% held out, fixed seed so the split is repeatable).
# Note: y_train is rebound here from the full target vector to the training part
# only, so this cell cannot be re-run on its own without re-creating y_train first.
X_train, X_test, y_train, y_test = train_test_split(x_train, y_train, test_size = 0.2, random_state = 25)

In [112]:
print(X_train.shape, X_test.shape)

## Looking for the best regressor model

In [113]:
# Building pipelines of standard scaler and model for various regressors.
# The randomised models (decision tree, random forest, XGBoost) get a fixed
# random_state so the comparison gives the same numbers on every run.

pipeline_lr = Pipeline([("scalar1", StandardScaler()),
                        ("lr", LinearRegression())])

pipeline_lasso = Pipeline([("scalar2", StandardScaler()),
                           ("lasso", Lasso())])

pipeline_dt = Pipeline([("scalar3", StandardScaler()),
                        ("dt", DecisionTreeRegressor(random_state=0))])

pipeline_rf = Pipeline([("scalar4", StandardScaler()),
                        ("rf", RandomForestRegressor(random_state=0))])

pipeline_kn = Pipeline([("scalar5", StandardScaler()),
                        ("kn", KNeighborsRegressor())])

pipeline_xgb = Pipeline([("scalar6", StandardScaler()),
                         ("xgb", XGBRegressor(random_state=0))])

# List of all the pipelines
pipelines = [pipeline_lr, pipeline_lasso, pipeline_dt, pipeline_rf, pipeline_kn, pipeline_xgb]

# Dictionary mapping each position in `pipelines` to a readable model name
pipeline_dict = {0: "LinearRegression", 1: "Lasso", 2: "DecisionTree", 3: "RandomForest", 4: "KNeighbors", 5: "XGBRegressor"}

# Fit the pipelines on the training split.
# (cross_val_score fits its own clones, so these fitted objects are only needed
# if a pipeline is used for prediction directly.)
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [114]:
# 12-fold cross-validated RMSE for every candidate pipeline, printed as "<model>: <mean RMSE>".
cv_results_rms = []
for position, candidate in enumerate(pipelines):
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        scoring="neg_root_mean_squared_error",
        cv=12,
    )
    cv_results_rms.append(fold_scores)
    # The scorer returns negated RMSE, so flip the sign for display.
    print("%s: %f " % (pipeline_dict[position], -1 * fold_scores.mean()))

#### The random forest regressor gave the best cross-validated RMSE (about 566)

#### Cross Validation

In [115]:
def display_scores(scores):
    """Print the scores, then their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` (e.g. a NumPy array).
    Nothing is returned.
    """
    print("Scores:", scores)
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, statistic)())

In [116]:
# 10-fold cross-validation of a default random forest on the training split.
forest_cv_model = RandomForestRegressor()
forest_score = cross_val_score(
    forest_cv_model,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=10,
)

# The scorer returns negated MSE; negate and take the root to get RMSE per fold.
forest_rmse_score = np.sqrt(-forest_score)
display_scores(forest_rmse_score)

#### Fine tuning the hyperparameters

In [117]:
# Hyperparameter search for the random forest.
# It is switched off by default; set RUN_GRID_SEARCH to True to run it.
# The grid holds the single combination that the final model below is trained with.
RUN_GRID_SEARCH = False

param_grid = [
    {
        'random_state': [0],
        'max_features': [15],
        'n_estimators': [217],
    }
]

if RUN_GRID_SEARCH:
    # Imported here because nothing else in the notebook needs these names.
    from datetime import datetime
    from sklearn.model_selection import GridSearchCV

    print(datetime.now())  # show start time
    # Grid search over param_grid with 5-fold cross-validation
    grid_search = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid=param_grid,
        cv=5,
        return_train_score=True,
    )
    # Fit on the training split: X_train is the matrix that matches y_train in length.
    grid_search.fit(X_train, y_train)
    print(datetime.now())  # show end time

#### Training the Random forest model with the best values of the hyperparameters

In [118]:

# Random forest with the tuned hyperparameters, fitted on the training split.
forest_reg = RandomForestRegressor(
    n_estimators=217,
    max_features=15,
    random_state=0,
)
forest_reg.fit(X_train, y_train)

In [119]:
y_pred = forest_reg.predict(X_test)

In [120]:
y_pred

In [121]:
# calculate the RMSE of the forest's predictions on the held-out split
holdout_mse = metrics.mean_squared_error(y_test, y_pred)
print(np.sqrt(holdout_mse))

### Testing Dataset

In [122]:
test_df = pd.read_csv('../input/shai-club/test.csv')
test_df.head()

In [123]:
test_df.shape

In [124]:
test_df.info()

In [125]:
test_df.isnull().sum()

In [126]:
test_df.duplicated().sum()

#### Creating size column

In [127]:
# Same engineered feature as on the training data: size is the product x * y * z.
test_df['size'] = test_df['x'].mul(test_df['y']).mul(test_df['z'])
test_df['size'].head()

#### Dropping x, y, & z columns

In [128]:
# x, y and z are now summarised by 'size', so rebuild the test frame without them.
test_df = test_df.drop(columns=['x', 'y', 'z'])
test_df.head()

In [129]:
test_df.describe()

In [130]:
# Carat against size for the test set.
# No colour mapping and no colour bar here: the points are not coloured by any
# column, so a colour bar would not describe anything in the plot.
plt.scatter(data=test_df, x='carat', y='size')
plt.xlabel('carat')
plt.ylabel('size')
plt.title('Diamond size against carat (test set)');

In [131]:
test_df['size'].sort_values(ascending=False)

In [132]:
# Replacing the zero size with the mean of the test-set size column
g = test_df['size'].mean()
zero_size_rows = test_df['size'].eq(0)
test_df.loc[zero_size_rows, 'size'] = g

#### Creating the test feature set

In [133]:
# Feature frame for the submission: everything except the identifier column,
# dropped by name so the result does not depend on where 'Id' sits in the frame.
test_sub = test_df.drop(columns=['Id'])

#### Data preprocessing

In [134]:
# Encode the test features with the encoder and scaler already fitted on the
# training data. Re-fitting here would standardise with test-set statistics and
# could produce a different one-hot layout from the one the model was trained on.
test_sub = full_pipeline.transform(test_sub)

In [135]:
# The transformer returns either a SciPy sparse matrix or a dense array; densify
# only when needed so the cell works in both cases and can be re-run safely.
if hasattr(test_sub, "toarray"):
    test_sub = test_sub.toarray()

### Training the Random Forest model

In [137]:
# Re-create and fit the tuned forest used for the submission predictions.
# It uses the same hyperparameters, seed and training split as the model scored
# on the hold-out data earlier. RandomForestRegressor is already imported in the
# first cell, so no second import is needed here.
forest_reg = RandomForestRegressor(n_estimators=217, random_state=0, max_features=15)

forest_reg.fit(X_train, y_train)

In [138]:
y_pred_sub = forest_reg.predict(test_sub)

In [139]:
y_pred_sub

In [140]:
prediction=pd.DataFrame({'Id':test_df['Id'],'price':y_pred_sub})

In [141]:
prediction.shape

In [142]:
prediction.head()

In [143]:
prediction.to_csv('submission.csv',index=False)