# <center>Class 15: Regression Trees </center>

#### Prerequisites

In order to be able to display tree objects you need to get `Graphviz` installed on your computer. 
- Installation instructions for Windows can be found [here](https://forum.graphviz.org/t/new-simplified-installation-procedure-on-windows/224). Make sure you choose the `Add Graphviz to the system PATH for current user` option.
- More on `Graphviz` [here](https://graphviz.org/about/)

Once you have installed `Graphviz`, you need to restart your computer.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse,rmse
from patsy import dmatrices

from sklearn.metrics import r2_score,mean_squared_error
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.tree import DecisionTreeRegressor
from collections import Counter
from IPython.display import Image, display

In [None]:
def seq(start, stop, by, round_n=3):
    """Return an evenly spaced sequence as a list of rounded numbers.

    The values are those of ``np.arange(start, stop, by)`` (``stop`` itself
    is excluded), each rounded to ``round_n`` decimal places.
    """
    rounded = []
    for value in np.arange(start, stop, by):
        rounded.append(round(value, round_n))
    return rounded

## Data

In [None]:
# Assemble the data file location from its parts; os.path.join inserts the
# separator that is right for the current operating system.
path_parts = (os.pardir, 'data', 'used-cars_2cities_prep.csv')
path = os.path.join(*path_parts)
path

In [None]:
# DATA IMPORT - FROM FILE
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.info()

### EDA & feature engineering

**Filtering**

In [None]:
# Manage missing: give absent values in these columns the explicit label
# "Missing" instead of leaving them as NaN.
for col in ("fuel", "drive", "cylinders", "transmission", "type"):
    df[col] = df[col].fillna("Missing")

In [None]:
df['condition'].value_counts()

In [None]:
df["condition"] = df["condition"].fillna("good")

In [None]:
df['condition'].value_counts()

In [None]:
# Sample restrictions, applied in this order:
#   1. keep non-hybrid models, then drop the now-constant Hybrid column
#   2. keep gas-fuelled vehicles      (alternative: df[df.fuel == 'gas'])
#   3. drop vehicles in new or fair condition
#                                     (alternative: df[~df.condition.isin(['new','fair'])])
df = (
    df.query("Hybrid==0")
    .drop(["Hybrid"], axis=1)
    .query("fuel=='gas'")
    .query("condition not in ['new','fair']")
)

In [None]:
# Further sample restrictions, applied in this order:
#   1. keep prices that appear in the sequence 500, 501, ..., 25000
#   2. keep odometer readings of at most 100
#   3. drop cars priced below 1000 that are either in 'like new' condition
#      or less than 8 years old
#   4. drop cars with manual transmission
#   5. drop trucks and pickups
#   6. drop the pricestr column
df = (
    df.query("price in @seq(500,25001, 1)")
    .query("odometer<=100")
    .query("~(price < 1000 & (condition == 'like new'|age < 8))")
    .query("~(transmission=='manual')")
    .query("type not in ['truck','pickup']")
    .drop(["pricestr"], axis=1)
)

In [None]:
# to be on the safe side: keep only rows with a known price, then renumber
# the rows 0..n-1 so that row labels and row positions coincide
df = df[df["price"].notna()].reset_index(drop=True)

In [None]:
df.shape

**Additional variables**

In [None]:
# condition: one 0/1 indicator per condition category
df["cond_excellent"] = np.where(df["condition"] == "excellent", 1, 0)
df["cond_good"] = np.where(df["condition"] == "good", 1, 0)
df["cond_likenew"] = np.where(df["condition"] == "like new", 1, 0)
# cylinders: 1 for six-cylinder engines, 0 otherwise
df["cylind6"] = np.where(df["cylinders"] == "6 cylinders", 1, 0)
# chicago: 1 for listings from the chicago area, 0 otherwise
df["chicago"] = np.where(df["area"] == "chicago", 1, 0)
# age: quadratic, cubic
df["agesq"] = df["age"] ** 2
df["agecu"] = df["age"] ** 3
# odometer quadratic
# (this used to be `** 3`, which contradicted the variable name and its use
# as the squared term in the regression formulas)
df["odometersq"] = df["odometer"] ** 2

In [None]:
df.price.describe().map('{:,.0f}'.format)

In [None]:
smp_size=np.floor(0.7*df.shape[0])
smp_size

In [None]:
# Initialize random seed to make it reproducible
random.seed(20250217)
train_ids = random.sample(range(0, df.shape[0]), int(smp_size))
train_ids[:10]

In [None]:
# Flag the sampled rows as training observations (1 = train, 0 = test).
# train_ids are used as row labels; they coincide with row positions because
# the index was reset to 0..n-1 after filtering.
# A single .loc call writes into df itself. The chained form
# df["train"][train_ids] = 1 assigns through an intermediate Series, which
# pandas warns about and which leaves df unchanged under Copy-on-Write.
df["train"] = 0
df.loc[train_ids, "train"] = 1

In [None]:
df["train"].value_counts()

In [None]:
df_train = df.query('train==1')
df_test = df.query('train==0')

### CART using 'age' as a single predictor variable

#### A single split

In [None]:
df_train.price.describe().map('{:,.0f}'.format)

In [None]:
df_test.price.describe().map('{:,.0f}'.format)

In [None]:
cart1 = DecisionTreeRegressor(random_state = 20250217, max_depth = 1)

# Note X should be a matrix instead of series, that's why we need double []
X = df_train[['age']]
y = df_train['price']
cart1.fit(X,y)

In [None]:
pred_cart1 = cart1.predict(df_test[["age"]])

rmse_cart1 = np.sqrt(mean_squared_error(df_test["price"], pred_cart1))

In [None]:
from sklearn import tree

In [None]:
tree.plot_tree(cart1, filled = True, rounded = True, feature_names=["age"], fontsize = 10);

**Export as png**

In [None]:
!dot -Tpng tree.dot -o tree.png

In [None]:
pred_cart1t = cart1.predict(df_train[['age']])

In [None]:
cart1.tree_.threshold

In [None]:
cart1_cuts = cart1.tree_.threshold[cart1.tree_.threshold != -2]
cart1_cuts

In [None]:
# Prepare the data for the plot: training rows plus the CART1 in-sample
# predictions, ordered by age.
# NOTE: df_plot is only another name for df_train (no copy is made), so the
# new column and the in-place sort below change df_train as well.
df_plot = df_train
df_plot['predicted_price'] = pred_cart1t
df_plot.sort_values(by = 'age', inplace = True)

In [None]:
# Scatter of actual prices against age with the two predicted levels of the
# single-split tree drawn as horizontal segments.
plt.figure(figsize = (8,5))
plt.scatter(x = df_plot.age, y = df_plot.price, marker = '.')
# The higher predicted price is drawn from age 0 up to the split point and the
# lower one from the split point to the maximum age.
# NOTE(review): this presumes the younger group has the higher prediction - confirm.
plt.hlines(df_plot.predicted_price.max(), 0, cart1_cuts, color = 'k')
plt.hlines(df_plot.predicted_price.min(), cart1_cuts, df_plot.age.max(), color = 'k', label = 'predicted')
plt.legend()
plt.ylabel('Price (USD)')
plt.xlabel('Age')
plt.title('Predicted and actual car prices')
plt.show();

### Splits at two levels (setting "max_depth" to 2)

In [None]:
cart2 = DecisionTreeRegressor(random_state=20250217, max_depth=2)
# Note X should be a matrix instead of series, that's why we need double []
X = df_train[["age"]]
y = df_train["price"]
cart2.fit(X, y)

In [None]:
plt.figure(figsize = (14,8))
tree.plot_tree(cart2, filled = True, rounded = True, feature_names=["age"], fontsize = 12, node_ids = True);

In [None]:
# Cut points
cuts = cart2.tree_.threshold[cart2.tree_.threshold != -2]
cuts

In [None]:
for x in cuts:
    print(int(np.floor(x)), int(np.floor(x) + 1))

In [None]:
# Groups: every cut point contributes the last whole age below it and the
# first whole age above it; 0 opens the first range and "or more" closes
# the last one.
ages_below_cut = [int(np.floor(cut)) for cut in cuts]
ages_above_cut = [age + 1 for age in ages_below_cut]
groups = sorted([0] + ages_below_cut + ages_above_cut) + ["or more"]
groups

In [None]:
# get the leaf for each observations for the training sample
leaves_index = cart2.apply(X)
leaves_index # => these are all terminal leaves only!

In [None]:
# use Counter to find the number of elements on each leaf
cnt = Counter(sorted(leaves_index))
cnt

In [None]:
# and now you can index each input to get the number of elements
elems = [cnt[x] for x in leaves_index]
print(elems[:10], '\t', elems[-10:])

In [None]:
counts=list(cnt.values())
counts

In [None]:
av_price = (
    pd.DataFrame({"index": leaves_index, "fit": cart2.predict(X)})
    .drop_duplicates()
    .sort_values("index")["fit"]
    .tolist()
)
av_price

In [None]:
# Preview every consecutive pair of group boundaries as an "Age a-b" label,
# together with its position in the list.
# The loop variables are deliberately not named x / y: a for-loop at cell
# level rebinds global names, and `y` holds the training target that the
# model-fitting cells pass to .fit().
age_labels = [
    "Age " + str(groups[i]) + "-" + str(groups[i + 1])
    for i in range(len(groups) - 1)
]
for position, label in enumerate(age_labels):
    print(position, "   ", label)

In [None]:
# One row per terminal leaf: its age range, the number of training cars in it
# and their average price. Labels are built for every consecutive pair of
# boundaries; only every second one (positions 0, 2, 4, ...) is kept.
boundary_pairs = zip(groups[:-1], groups[1:])
range_labels = ["Age " + str(low) + "-" + str(high) for low, high in boundary_pairs]

pd.DataFrame(
    {
        "Category": range_labels[::2],
        "Count": counts,
        "Average_price": av_price,
    }
)

In [None]:
pred_cart2 = cart2.predict(df_test[["age"]])

rmse_cart2 = np.sqrt(mean_squared_error(df_test["price"], pred_cart2))

In [None]:
print(f'CART2 RMSE: {rmse_cart2:,.1f}')

In [None]:
pred_cart2

In [None]:
# Prepare the data for the plot: training rows plus the CART2 in-sample
# predictions, ordered by age.
# NOTE: df_plot is only another name for df_train (no copy is made), so the
# new column and the in-place sort below change df_train as well.
df_plot = df_train
df_plot['predicted_price'] = cart2.predict(df_train[["age"]])
df_plot.sort_values(by = 'age', inplace = True)

In [None]:
df_plot[['age', 'predicted_price']]

In [None]:
# Actual prices against age, with each distinct predicted price drawn as a
# horizontal segment over the age range it covers (padded by half a year on
# both sides).
plt.figure(figsize=(8, 5))
plt.scatter(x=df_plot.age, y=df_plot.price, marker='.')

# Youngest and oldest age for every predicted level, in order of appearance
age_span = df_plot.groupby("predicted_price", sort=False)["age"].agg(["min", "max"])
for level, span in age_span.iterrows():
    plt.hlines(level, span["min"] - 0.5, span["max"] + 0.5, color='k')

plt.ylabel('Price (USD)')
plt.xlabel('Age')
plt.title('Predicted and actual car prices')
plt.show();

#### Splitting using the *complexity parameter*

**Note**: 
- `min_impurity_decrease` in sklearn is considered to be the same as complexity parameter; the actual values are different but the purpose is the same
- `squared_error` as *criterion* (measuring the quality of the split) stands for `mean squared error`. There is no metric targeting $R^2$.

In [None]:
cart3 = DecisionTreeRegressor(
    random_state=20250217, criterion="squared_error", min_impurity_decrease=50000
)
# Note X should be a matrix instead of series, that's why we need double []
X = df_train[["age"]]
y = df_train["price"]
cart3.fit(X, y)

In [None]:
pred_cart3 = cart3.predict(df_test[["age"]])

rmse_cart3 = np.sqrt(mean_squared_error(df_test["price"], pred_cart3))

In [None]:
print(f'CART3 RMSE: {rmse_cart3:,.1f}')

In [None]:
sorted([x for x in cart3.tree_.threshold if x > -2])

In [None]:
plt.figure(figsize = (40,20))
tree.plot_tree(cart3, filled = True, rounded = True, feature_names=["age"], fontsize = 12, node_ids = True);

In [None]:
pred_cart3t = cart3.predict(df_train[["age"]])

In [None]:
# Prepare the data for the plot: training rows plus the CART3 in-sample
# predictions, ordered by age.
# NOTE: df_plot is only another name for df_train (no copy is made), so the
# new column and the in-place sort below change df_train as well.
df_plot = df_train
df_plot['predicted_price'] = pred_cart3t
df_plot.sort_values(by = 'age', inplace = True)

In [None]:
# Actual prices against age, with one horizontal segment per distinct
# predicted price; each segment spans the ages that receive that prediction,
# extended by half a year on both sides.
plt.figure(figsize=(8, 5))
plt.scatter(x=df_plot.age, y=df_plot.price, marker='.')

for level in df_plot.predicted_price.unique():
    ages_at_level = df_plot.loc[df_plot.predicted_price == level, "age"]
    left_edge = ages_at_level.min() - 0.5
    right_edge = ages_at_level.max() + 0.5
    plt.hlines(level, left_edge, right_edge, color='k')

plt.ylabel('Price (USD)')
plt.xlabel('Age')
plt.title('Predicted and actual car prices')
plt.show();

#### Age only linear regression
---

In [None]:
linreg1=smf.ols("price~age",data=df_train).fit()
print(linreg1.summary())

In [None]:
linreg1.resid.plot(kind = 'hist', bins = 20, rwidth = 0.9);

In [None]:
pred_linreg1 = linreg1.predict(df_test)
rmse_ols1 = np.sqrt(mean_squared_error(df_test["price"], pred_linreg1))
rmse_ols1

In [None]:
## Scatterplot with predicted values
pred_linreg1t=linreg1.predict(df_train)

In [None]:
# Prepare the data for the plot: training rows plus the fitted values of the
# age-only linear regression, ordered by age.
# NOTE: df_plot is only another name for df_train (no copy is made), so the
# new column and the in-place sort below change df_train as well.
df_plot = df_train
df_plot['predicted_price'] = pred_linreg1t
df_plot.sort_values(by = 'age', inplace = True)

In [None]:
plt.figure(figsize = (8,5))
plt.scatter(x = df_plot.age, y = df_plot.price, marker = '.')
plt.plot(df_plot.age, df_plot.predicted_price, color = 'k')
plt.ylabel('Price (USD)')
plt.xlabel('Age')
plt.title('Predicted and actual car prices')
plt.show();

#### Loess using `statsmodels`

In [None]:
lowess = sm.nonparametric.lowess

In [None]:
# Smooth price on age over the training sample with the default settings
lowess1 = lowess(df_train.price, df_train.age)

In [None]:
# Fit LOWESS on the training sample only. By default the result is a
# two-column array of [age, fitted price] rows sorted by age.
lowess_fit_train = lowess(df_train.price, df_train.age)

# Out-of-sample prediction: read the training curve at the test ages.
# Previously the smoother was re-fitted on the test data itself and its
# age-sorted output was scored against the unsorted test prices, so the RMSE
# compared mismatched rows and was not a hold-out measure.
# np.unique keeps one point per distinct age so the interpolation grid is
# strictly increasing.
lowess_age_grid, first_rows = np.unique(lowess_fit_train[:, 0], return_index=True)
pred_lowess1 = np.interp(
    df_test.age, lowess_age_grid, lowess_fit_train[first_rows, 1]
).tolist()
rmse_lowess1 = np.sqrt(mean_squared_error(df_test["price"], pred_lowess1))

# In-sample fitted values, returned in the row order of df_train so they can
# be attached to it position by position regardless of how it is sorted.
pred_lowess1t = lowess(df_train.price, df_train.age, return_sorted=False).tolist()

In [None]:
# Separate frame for the LOWESS plot: the training rows with the fitted values
# attached position by position, ordered by age.
df_lowess = df_train.assign(pred_lowess1t=pred_lowess1t).sort_values(by='age')

In [None]:
plt.figure(figsize = (8,5))
plt.scatter(x = df_lowess.age, y = df_lowess.price, marker = '.')
plt.plot(df_lowess.age, df_lowess.pred_lowess1t, color = 'k')
plt.ylabel('Price (USD)')
plt.xlabel('Age')
plt.title('Predicted and actual car prices - lowess')
plt.show();

### CART using multiple predictor variables

#### Linear regression baselines

In [None]:
# Linear regression with multiple variables
model2 = "price ~ age + odometer + LE + XLE + SE + cond_excellent + cond_good + cylind6 + dealer + chicago"
linreg2 = smf.ols(model2, df_train).fit()
print(linreg2.summary())

In [None]:
pred_linreg2 = linreg2.predict(df_test)
rmse_linreg2 = np.sqrt(mean_squared_error(df_test["price"], pred_linreg2))
rmse_linreg2

In [None]:
# add squared for age, odometer
model3 = "price ~ age + agesq+ odometer+odometersq +LE + XLE + SE + cond_excellent + cond_good + cylind6 + dealer+chicago"
linreg3 = smf.ols(model3, df_train).fit()
print(linreg3.summary())

In [None]:
pred_linreg3 = linreg3.predict(df_test)
rmse_linreg3 = np.sqrt(mean_squared_error(df_test["price"], pred_linreg3))
rmse_linreg3

#### Trees

In [None]:
y, X = dmatrices(model2, df_train)

**Splitting at four levels, for illustrative purposes (by setting `max_depth` to 3)**

In [None]:
cart4 = DecisionTreeRegressor(
    random_state=20250217, criterion="squared_error",max_depth=3
)
cart4.fit(X, y)

In [None]:
y_test, X_test = dmatrices(model2, df_test)

pred_cart4 = cart4.predict(X_test)
rmse_cart4 = np.sqrt(mean_squared_error(y_test, pred_cart4))
rmse_cart4

In [None]:
# Column labels for the design matrix built by dmatrices(model2, ...), in
# column order. patsy puts the constant term first, so the first label is
# "Intercept" - not "price", which is the target and not a column of X.
feature_names_model2 = [
    "Intercept",
    "age",
    "odometer",
    "LE",
    "XLE",
    "SE",
    "cond_excellent",
    "cond_good",
    "cylind6",
    "dealer",
    "chicago",
]

In [None]:
plt.figure(figsize = (35,15))
tree.plot_tree(cart4, filled = True, rounded = True, feature_names= feature_names_model2, fontsize = 16);

**Alternative: using `min_impurity_decrease`**

In [None]:
# Alternative stopping rule: only split when the impurity decrease is large
# enough and the node has at least 20 observations.
# This rebinds cart4, replacing the depth-limited tree fitted earlier.
cart4_settings = {
    "random_state": 20250217,
    "criterion": "squared_error",
    "min_impurity_decrease": 145000,
    "min_samples_split": 20,
}
cart4 = DecisionTreeRegressor(**cart4_settings)
cart4.fit(X, y)

# Test-sample design matrices for the same formula
y_test, X_test = dmatrices(model2, df_test)

# Hold-out RMSE
pred_cart4 = cart4.predict(X_test)
rmse_cart4 = np.sqrt(mean_squared_error(y_test, pred_cart4))
rmse_cart4

In [None]:
# Same idea with a smaller min_impurity_decrease, i.e. a larger tree.
# The seed is now the one used by every other model in this notebook
# (it read 20270217 before, which looks like a typo).
cart5 = DecisionTreeRegressor(
    random_state=20250217,
    criterion="squared_error",
    min_impurity_decrease=20000,
)
cart5.fit(X, y)

In [None]:
pred_cart5 = cart5.predict(X_test)
rmse_cart5 = np.sqrt(mean_squared_error(y_test, pred_cart5))
rmse_cart5

In [None]:
plt.figure(figsize = (35,15))
tree.plot_tree(cart5, filled = True, rounded = True, feature_names= feature_names_model2, fontsize = 12);

**Build very large tree and prune it**

Set the `ccp_alpha` [parameter](https://scikit-learn.org/stable/modules/tree.html#minimal-cost-complexity-pruning).

In [None]:
cart6 = DecisionTreeRegressor(
    random_state=20250217, min_samples_split=4, criterion="squared_error",ccp_alpha=30000
)
cart6.fit(X, y)

In [None]:
pred_cart6 = cart6.predict(X_test)
rmse_cart6 = np.sqrt(mean_squared_error(y_test, pred_cart6))
rmse_cart6

In [None]:
plt.figure(figsize = (35,15))
tree.plot_tree(cart6, filled = True, rounded = True, feature_names= feature_names_model2, fontsize = 12);

#### Plot variable importance for model 6

In [None]:
cart6.feature_importances_

In [None]:
cart6.feature_importances_.sum()

In [None]:
df_cart6_var_imp = (
    pd.DataFrame(
        {'variable': feature_names_model2, 
         'importance': cart6.feature_importances_}
    ).sort_values(
        by=["importance"], ascending=False
    ).reset_index(drop = True)
)

In [None]:
df_cart6_var_imp

Using Pandas `plot` method.

In [None]:
df_ = df_cart6_var_imp.sort_values(by = 'importance', ascending = True)

In [None]:
# Express the importances as percentages. The values are taken from the
# original table (matched on the row index) rather than from df_ itself, so
# re-running this cell does not multiply by 100 a second time.
df_["importance"] = df_cart6_var_imp["importance"] * 100

In [None]:
import matplotlib.ticker as mtick

# Horizontal bar chart of the CART6 variable importances (in percent).
# The Axes is created explicitly and handed to pandas: a bare plt.figure()
# followed by DataFrame.plot() leaves an empty extra figure behind, because
# pandas opens its own figure when no `ax` is passed.
fig, ax = plt.subplots()
df_.plot(
    kind = 'barh', x = 'variable', y = 'importance',
    legend = False, grid = True,
    xlabel = 'importance', ylabel = 'variable',
    title = 'Variable importance for model CART6',
    ax = ax)
# Tick every 10 percentage points and show the values with a % sign
ax.set_xticks(list(range(0, 100, 10)))
ax.xaxis.set_major_formatter(mtick.PercentFormatter())
plt.show();

In [None]:
# Hold-out RMSE of all models side by side.
# Two entries were corrected to match the code that produced the numbers:
#   - CART M3 is fitted on age alone, so it uses 1 variable (was listed as 7)
#   - CART M4 is fitted with min_impurity_decrease=145000 (label said 140000)
pd.DataFrame(
    {
        "Model": ["CART M" + str(i) for i in range(1, 7)]
        + ["OLS M" + str(i) for i in range(1, 4)],
        "Number of variables": [1, 1, 1, 7, 7, 7, 1, 7, 7],
        "Model details": [
            "2 levels",
            "3 levels",
            "min_impurity_decrease=50000",
            "min_impurity_decrease=145000 & min_samples_split=20",
            "min_impurity_decrease=20000",
            "ccp_alpha=30000",
            "linear",
            "linear",
            "w/ polynomial terms",
        ],
        "RMSE": [
            rmse_cart1,
            rmse_cart2,
            rmse_cart3,
            rmse_cart4,
            rmse_cart5,
            rmse_cart6,
            rmse_ols1,
            rmse_linreg2,
            rmse_linreg3,
        ],
    }
).set_index("Model")

### Build a Simple OLS Based on CART Feature Importances

In [None]:
# add squared for age, odometer
model_final = "price ~ age + agesq+ odometer+odometersq"
linreg_final = smf.ols(model_final, df_train).fit()
print(linreg_final.summary())

In [None]:
pred_linreg_final = linreg_final.predict(df_test)
rmse_linreg_final = np.sqrt(mean_squared_error(df_test["price"], pred_linreg_final))

In [None]:
print(f'{rmse_linreg_final:,.1f}')

In [None]:
linreg_final.params.map('{:,.1f}'.format)