# 3. Training

### Import libraries

In [32]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib as plt
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import learning_curve, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn import metrics

Training 🏋️: Different trainings with Machine Learning on the dataset:
   1. Train-test-split method 
   2. LinearRegression 
   3. DecisionTreeRegressor 
   4. KNeighborsRegressor 
   5. GradientBoostingRegressor 
   6. RandomForestRegressor

In [4]:
train_data = pd.read_csv("../data/train_data.csv")
train_data

Unnamed: 0.1,Unnamed: 0,id,carat,depth,table,x,y,z,price,cut_score,...,color_H,color_I,color_J,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
0,0,0,1.01,62.7,58.0,6.36,6.31,3.97,8.754,4,...,0,0,0,0,0,0,1,0,0,0
1,1,1,0.60,64.3,57.0,5.31,5.38,3.43,7.492,2,...,0,0,0,0,0,0,0,1,0,0
2,2,2,1.60,59.4,59.0,7.55,7.60,4.50,9.010,4,...,1,0,0,0,0,1,0,0,0,0
3,3,3,0.30,63.5,58.0,4.27,4.24,2.70,6.227,3,...,0,0,1,0,0,0,1,0,0,0
4,4,4,1.00,62.8,57.0,6.37,6.44,4.02,8.314,3,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40450,40450,40450,0.51,62.5,55.0,5.08,5.13,3.19,7.412,5,...,0,0,0,0,0,0,0,1,0,0
40451,40451,40451,0.80,62.1,54.0,5.96,5.99,3.71,8.016,5,...,1,0,0,0,0,0,0,1,0,0
40452,40452,40452,1.53,58.7,59.0,7.51,7.45,4.39,9.070,4,...,0,0,0,0,0,1,0,0,0,0
40453,40453,40453,1.51,61.6,54.0,7.38,7.42,4.57,9.199,5,...,0,1,0,0,0,0,0,1,0,0


### 1. Train-Test-Split

In [5]:
X = train_data.drop(['price'], axis=1)
y= train_data["price"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=666)

### 2. LinearRegression

In [7]:
lr = LinearRegression()

In [8]:
lr.fit(X_train, y_train)

LinearRegression()

In [9]:
linear_pred = lr.predict(X_test)
linear_pred

array([7.24368766, 9.32494027, 7.99783761, ..., 7.94937553, 8.53650493,
       6.46303012])

In [10]:
mean_squared_error(y_test, linear_pred)

0.02626714823533572

### 3. DecisionTreeRegressor

#### 3.a. 1 BRANCH

In [25]:
model = DecisionTreeRegressor(max_depth=1)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)
mean_squared_error(y_test, tree_pred)

0.2855012597115086

#### 3.b. 3 BRANCHES

In [20]:
model = DecisionTreeRegressor(max_depth=3)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)
mean_squared_error(y_test, tree_pred)

0.0739329422015127

#### 3.c. 10 BRANCHES

In [21]:
model = DecisionTreeRegressor(max_depth=10)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)
mean_squared_error(y_test, tree_pred)

0.02664027358766726

#### 3.d. 20 BRANCHES (LOWER MSE)

In [22]:
model = DecisionTreeRegressor(max_depth=20)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)
mean_squared_error(y_test, tree_pred)

0.0189832140588556

#### 3.e. 30 BRANCHES (*OVERFITTING*)

In [23]:
model = DecisionTreeRegressor(max_depth=30)
model.fit(X_train, y_train)
tree_pred = model.predict(X_test)
mean_squared_error(y_test, tree_pred)

0.020675859774471343

### 4. KNeighborsRegressor

In [26]:
n_neighbors = np.arange(1, 100)

In [39]:
model = KNeighborsRegressor()
parameter_space = {'n_neighbors': n_neighbors}

grid_search = GridSearchCV(model, param_grid=parameter_space, cv=7)

grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
grid_search.best_score_

In [None]:
best_knr = grid_search.best_estimator_
best_knr.score(X_test, y_test)

In [None]:
knr_pred = best_knr.predict(X_test)

In [None]:
mean_squared_error(y_test, knr_pred)

### 5. GradientBoostingRegressor

In [33]:
model = GradientBoostingRegressor(n_estimators = 100)

params = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
          'max_depth': [4, 6],
          'min_samples_leaf': [3, 10, 17],
          'max_features': [3, 1, 5]}
grid_search = GridSearchCV(model, param_grid=params, cv=2, n_jobs=3, verbose=1)
grid_search.fit(X_train, y_train)

Fitting 2 folds for each of 72 candidates, totalling 144 fits


GridSearchCV(cv=2, estimator=GradientBoostingRegressor(), n_jobs=3,
             param_grid={'learning_rate': [0.1, 0.05, 0.02, 0.01],
                         'max_depth': [4, 6], 'max_features': [3, 1, 5],
                         'min_samples_leaf': [3, 10, 17]},
             verbose=1)

In [34]:
grid_search.best_score_

0.9850044922617748

In [35]:
best_gb = grid_search.best_estimator_
best_gb.score(X_test, y_test)

0.986761256327414

In [36]:
grid_search.best_params_

{'learning_rate': 0.1,
 'max_depth': 6,
 'max_features': 5,
 'min_samples_leaf': 3}

In [37]:
boosting_pred = best_gb.predict(X_test)

In [38]:
mean_squared_error(y_test, boosting_pred)

0.013650691782998945

### 6. RandomForestRegressor

In [11]:
rf = RandomForestRegressor()

In [12]:
rf.fit(X_train, y_train)

RandomForestRegressor()

In [13]:
forest_pred = rf.predict(X_test)
forest_pred

array([7.11683, 9.0588 , 7.943  , ..., 7.93127, 8.5935 , 6.38697])

In [14]:
mean_squared_error(y_test, forest_pred)

0.010407299052070213