# ML train and test 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

%matplotlib inline 
import seaborn as sns 

#scipy and scikit-learn
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

print("done")

done


In [2]:
# Load the "diamonds" example dataset through seaborn's load_dataset helper.
df = sns.load_dataset('diamonds')

In [3]:
# Peek at the first rows to see the columns and typical values.
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [4]:
# (number of rows, number of columns)
df.shape

(53940, 10)

In [5]:
# Which features do we want to use to train our model?
# Which column (value) do we want to predict?

In [6]:
# List the column names available as candidate features / target.
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [7]:
# Columns used as model inputs; the cut / color / clarity columns are left out.
features = [
    "carat",
    "depth",
    "table",
    "x",
    "y",
    "z",
]

# Feature matrix: every row, restricted to the selected columns.
X = df.loc[:, features]

X.head(n=3)

Unnamed: 0,carat,depth,table,x,y,z
0,0.23,61.5,55.0,3.95,3.98,2.43
1,0.21,59.8,61.0,3.89,3.84,2.31
2,0.23,56.9,65.0,4.05,4.07,2.31


In [8]:
# Target: the column we want the model to predict (the "price" column).

y = df["price"]


In [9]:
# You can train your model on the whole dataset!
# However, when that model encounters new data, it will likely fail.

# Option 1:
# Split the dataset into 2 parts: a testing part and a training part.
# Use train_test_split from sklearn.

In [10]:
# Split the full DataFrame: 20% of the rows go to test_set, the rest to train_set.
# random_state=2 fixes the shuffle so the same split is produced on every run.
train_set, test_set = train_test_split(df, test_size=0.2, random_state=2)

In [11]:
# (rows, columns) of the training portion
train_set.shape

(43152, 10)

In [12]:
# (rows, columns) of the testing portion
test_set.shape

(10788, 10)

In [13]:
# Keep only the selected feature columns in each half of the split.
X_train_full, X_test_full = train_set[features], test_set[features]
X_train_full.head(n=2)

Unnamed: 0,carat,depth,table,x,y,z
34837,0.3,60.8,58.0,4.38,4.34,2.65
19858,1.06,61.2,57.0,6.52,6.56,4.0


In [14]:
# Matching targets ("price") for the training and testing halves.
y_train_full, y_test_full = train_set["price"], test_set["price"]

In [15]:
# Baseline: a decision tree with default hyper-parameters.
# random_state is pinned (same seed as the split above) so that re-running the
# notebook rebuilds the same tree; scikit-learn permutes the candidate features
# at each split, so an unseeded tree may differ between runs when splits tie.
model = DecisionTreeRegressor(random_state=2)
model.fit(X_train_full, y_train_full)
# Predicted prices for the held-out test rows.
predictions = model.predict(X_test_full)

In [17]:
# Print the true test prices followed by the model's predictions for a quick visual comparison.
print(y_test_full, predictions)

30960      746
50149     2215
28326      666
22444    10472
2449      3189
         ...  
19054     7836
45609     1691
49278     2090
49260     2088
12793     5335
Name: price, Length: 10788, dtype: int64 [ 890.         1981.          627.         ... 1753.         3052.66666667
 5432.        ]


In [18]:
# enhance the model: choose splits by mean absolute error instead of the default criterion
# NOTE: the old 'mae' spelling was deprecated in scikit-learn 1.0 and removed in 1.2;
# 'absolute_error' is the same criterion under its current name.
model = DecisionTreeRegressor(criterion='absolute_error', random_state=2)
model.fit(X_train_full, y_train_full)
# Overwrites the baseline predictions with those of the re-fitted tree.
predictions = model.predict(X_test_full)

In [19]:
# Print the true test prices followed by the predictions of the most recently fitted model.
print(y_test_full, predictions)

30960      746
50149     2215
28326      666
22444    10472
2449      3189
         ...  
19054     7836
45609     1691
49278     2090
49260     2088
12793     5335
Name: price, Length: 10788, dtype: int64 [ 894.5 2559.   579.  ... 1340.  3219.  5629. ]


In [20]:
# Hold-out split made directly on the feature matrix X and target y
# (80% training / 20% validation, shuffle fixed by random_state=0).
# This is independent of the earlier train_set/test_set split, which used random_state=2.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                      random_state=0)

In [26]:
# Using a random forest regressor to predict diamond price
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a fixed random forest and return its mean absolute error.

    A RandomForestRegressor with 100 trees and random_state=0 is fitted on
    (X_train, y_train); its predictions for X_valid are then compared with
    y_valid.

    Parameters
    ----------
    X_train, y_train : features and targets used for fitting.
    X_valid, y_valid : features and targets used for scoring.

    Returns
    -------
    The value returned by mean_absolute_error(y_valid, predictions).
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [29]:
# Define the models: five random forests that differ in size, split criterion
# and tree-growth limits; all share random_state=0 so the runs are repeatable.
model_1 = RandomForestRegressor(n_estimators=50, random_state=0)
model_2 = RandomForestRegressor(n_estimators=100, random_state=0)
# 'absolute_error' is the current name of the criterion formerly spelled 'mae'
# (deprecated in scikit-learn 1.0, removed in 1.2).
model_3 = RandomForestRegressor(n_estimators=100, criterion='absolute_error', random_state=0)
model_4 = RandomForestRegressor(n_estimators=200, min_samples_split=20, random_state=0)
model_5 = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=0)

# Order matters: the scoring loop reports models by their 1-based position here.
models = [model_1, model_2, model_3, model_4, model_5]

In [30]:
from sklearn.metrics import mean_absolute_error

# Function for comparing different models
def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit `model` on the training data and return its MAE on the validation data.

    Note: the default arguments are the X_train / X_valid / y_train / y_valid
    objects that exist at the moment this `def` is executed. If the split is
    re-created later, re-run this cell (or pass the data explicitly) so the
    defaults do not point at the old objects.

    Parameters
    ----------
    model : object with `fit(X, y)` and `predict(X)` methods; it is fitted in place.
    X_t, y_t : features and targets used for fitting.
    X_v, y_v : features and targets used for scoring.

    Returns
    -------
    The value returned by mean_absolute_error(y_v, predictions).
    """
    model.fit(X_t, y_t)
    return mean_absolute_error(y_v, model.predict(X_v))

# Fit and score each candidate in list order; positions are reported 1-based.
# NOTE: the %d conversion prints only the integer part of the MAE.
for position, candidate in enumerate(models, start=1):
    print("Model %d MAE: %d" % (position, score_model(candidate)))

Model 1 MAE: 780
Model 2 MAE: 777
Model 3 MAE: 773


KeyboardInterrupt: 

# models results
Model 1 MAE: 780

Model 2 MAE: 777

Model 3 MAE: 773