# <center>Class 13: Multiple Linear Regression </center>

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import numpy as np
import sys
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from stargazer import stargazer
from statsmodels.tools.eval_measures import mse,rmse

## Data

In [None]:
path = os.path.join(os.pardir, 'data', 'used-cars_2cities_prep.csv') # this will produce a path with the right syntax for your operating system
path

In [None]:
# DATA IMPORT - FROM FILE
# `path` already holds the complete file location, so it is handed to pandas directly
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.info()

### EDA

In [None]:
df.area.value_counts()

In [None]:
# SAMPLE DESIGN

# Manage missing: give absent values in these columns an explicit "Missing" label
for column in ("fuel", "condition", "drive", "cylinders", "transmission", "type"):
    df[column] = df[column].fillna("Missing")

In [None]:
# keep the rows where Hybrid equals 0, then remove the (now constant) Hybrid column
df = df.loc[df["Hybrid"] == 0].drop(columns="Hybrid")

In [None]:
df.shape

In [None]:
# check frequency by fuel type: number of rows per fuel value, as a one-column table
freq = df.groupby("fuel").size().to_frame("frequency")

In [None]:
freq

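Some options for using the `agg()` method for various aggregations:

```python
df.groupby('fuel').agg(
    count = ('type', 'size'), 
    average = ('type', 'mean'), 
    sum = ('type', 'sum'), 
    minimum = ('type', 'min'), 
    maximum = ('type', 'max'), 
    stdev = ('type', 'std'), 
    median = ('type', 'median')
)
```

Note: the snippet illustrates the syntax of named aggregation. Numeric aggregations such as `mean`, `std` or `median` need a numeric column (for example `price`); `type` holds text in this dataset, so replace it with a numeric column before running the snippet.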

In [None]:
# share of each group in percent (3 decimals) and the running total of those shares
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq

In [None]:
# keep gas-fuelled vehicles
df = df[df.fuel == "gas"]

In [None]:
# check frequency by vehicle condition
# rows per condition, their share in percent (3 decimals) and the running total
freq = df.groupby("condition").size().to_frame("frequency")
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq

In [None]:
# drop vehicles in fair and new condition
df = df.loc[lambda d: ~d["condition"].isin(["new", "fair"])]

# drop unrealistic values for price and odometer reading
# (between() is inclusive on both ends: 500 <= price <= 25000)
df = df.loc[lambda d: d["price"].between(500, 25000) & (d["odometer"] <= 100)]

# drop if price is smaller than 1000 and (condition is like new or age is less than 8)
df = df.loc[
    lambda d: ~(
        (d["price"] < 1000) & ((d["condition"] == "like new") | (d["age"] < 8))
    )
]

In [None]:
# check frequency by transmission
# rows per transmission value, their share in percent (3 decimals) and the running total
freq = df.groupby("transmission").size().to_frame("frequency")
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq

In [None]:
df = df[~(df.transmission == "manual")]

In [None]:
# check frequency by type
# rows per vehicle type, their share in percent (3 decimals) and the running total
freq = df.groupby("type").size().to_frame("frequency")
freq["percent"] = (freq["frequency"] / freq["frequency"].sum() * 100).round(3)
freq["cumulative_percent"] = freq["percent"].cumsum()
freq


In [None]:
# drop pricestr
df = df.drop(["pricestr"], axis=1)

### Feature Engineering

In [None]:
# condition; what is the baseline?
# 0/1 indicator columns, one per listed condition value
df["cond_excellent"] = df["condition"].eq("excellent").astype(int)
df["cond_good"] = df["condition"].eq("good").astype(int)
df["cond_likenew"] = df["condition"].eq("like new").astype(int)

In [None]:
# cylinders: 1 when the text value is exactly "6 cylinders", 0 otherwise
df["cylind6"] = df["cylinders"].eq("6 cylinders").astype(int)

In [None]:
df.cylinders.value_counts()

In [None]:
df.cylind6.value_counts()

In [None]:
# age: quadratic, cubic terms for the regressions
df["agesq"] = df["age"].pow(2)
df["agecu"] = df["age"].pow(3)

In [None]:
# odometer quadratic term for the regressions
df["odometersq"] = df["odometer"].pow(2)

### Frequency tables

In [None]:
# area: number of cars and average price per area
# ("mean" is passed as a string; handing the np.mean callable to agg() is deprecated in pandas >= 2.1)
df.groupby("area").agg(frequency=("price", "size"), mean=("price", "mean")).style.format({'mean':'{:,.1f}'})

Another way to calculate multiple aggregations:

In [None]:
# area
df.groupby("area").agg({'price': ['count', 'mean']}).style.format({('price','mean'):'{:,.1f}'}) # what does ('price','mean') stand for?

In [None]:
# focus only on Chicago
df = df[df.area == "chicago"]

In [None]:
# condition: number of cars and average price per condition
# ("mean" is passed as a string; handing the np.mean callable to agg() is deprecated in pandas >= 2.1)
df.groupby("condition").agg(frequency=("price", "size"), mean=("price", "mean")).style.format({'mean':'{:,.1f}'})

In [None]:
# drive: number of cars and average price per drive value
# ("mean" is passed as a string; handing the np.mean callable to agg() is deprecated in pandas >= 2.1)
df.groupby("drive").agg(frequency=("price", "size"), mean=("price", "mean")).style.format({'mean':'{:,.1f}'})

In [None]:
# dealer: number of cars and average price per dealer value
# ("mean" is passed as a string; handing the np.mean callable to agg() is deprecated in pandas >= 2.1)
df.groupby("dealer").agg(frequency=("price", "size"), mean=("price", "mean")).style.format({'mean':'{:,.1f}'})

In [None]:
# df summary
df[[
    "age",
    "odometer",
    "LE",
    "XLE",
    "SE",
    "cond_likenew",
    "cond_excellent",
    "cond_good",
    "cylind6",
    ]].describe().T

### Charts

We use several ways to plot charts in this notebook. Python's primary plotting library is [`matplotlib`](https://matplotlib.org/), which is straightforward to start with but can quickly become overwhelming once you get into its intricacies. A good intro can be found [here](https://fritz.ai/introduction-to-matplotlib-data-visualization-in-python/). 

There are multiple other plotting tools and libraries, most of which are some sort of wrapper around `matplotlib`. `seaborn` is a library for [analytical and statistical graphics](https://seaborn.pydata.org/tutorial/introduction.html), but sometimes it is sufficient to use `Pandas` `plot()` method for quick and simple charts.

In [None]:
# For certain charts, we need to sort values by age

df = df.sort_values(by="age")

In [None]:
# using Pandas plot()
# tedious to plot relative frequencies

# Bin edges every 1,000 USD. range() excludes its stop value, so the stop is
# pushed one bin past the maximum price; otherwise the most expensive cars lie
# beyond the last edge and are left out of the histogram. int() keeps range()
# usable even if the price column is stored as float.
price_edges = range(0, int(df.price.max()) + 1000, 1000)

df.plot(
    kind = 'hist', figsize = (10,6),
    y = 'price', bins = price_edges,
    xticks = price_edges[::2],  # a tick on every second edge, i.e. every 2,000 USD
    rwidth = 0.9, legend = False, 
    xlabel = 'price in USD', title = 'Absolute frequency by prices')
plt.show();

In [None]:
# relative frequencies with matplotlib
from matplotlib.ticker import PercentFormatter

# Bin edges every 1,000 USD. range() excludes its stop value, so the stop is
# pushed one bin past the maximum price; otherwise the most expensive cars lie
# beyond the last edge, are dropped, and the density is normalised over the
# remaining cars only. int() keeps range() usable for a float price column.
price_edges = range(0, int(df.price.max()) + 1000, 1000)

fig = plt.figure(figsize = (10,6))
ax = fig.add_subplot(111)
ax.hist(df.price, price_edges, density = True, rwidth = 0.9, color = 'steelblue')
ax.set_xticks(price_edges[::2])  # a tick on every second edge, i.e. every 2,000 USD
ax.set_xlabel('price in USD')
# density * bin width (1000) = share of cars in the bin, hence a density of 0.001 is shown as 100%
ax.yaxis.set_major_formatter(PercentFormatter(xmax=0.001, decimals = 0))
ax.set_title('Relative frequency of car prices')
plt.show()

In [None]:
# using Pandas plot()
df.plot(
    kind = 'hist', figsize = (10,6),
    y = 'lnprice', 
    bins = 18,
    rwidth = 0.9, legend = False, 
    xlabel = 'log price in USD', title = 'Frequency by log prices')
plt.show();

### Regression analysis - lo(w)ess

We start with *loess*, plotted with `seaborn`. 

For `seaborn` it’s recommended to use a Jupyter/IPython interface in [matplotlib mode](https://ipython.readthedocs.io/en/stable/interactive/plotting.html) using the `%matplotlib inline` magic command. 

In [None]:
%matplotlib inline

In [None]:
sns.regplot(
    data = df,
    x = 'age', y = 'price', 
    marker= '.',
    fit_reg= True, lowess= True,
    line_kws = {'color': 'k'}
);

### Linear regressions

Tools: one of the best-known tools data scientists use for predictive analysis is `scikit-learn`. Here, however, we use the `statsmodels` library, which allows users to explore data, estimate statistical models, and perform statistical tests. `Scikit-learn` is great for building all kinds of predictive machine learning models, including linear regression, but spends little effort on providing insights into the models themselves. That's why we turn to `statsmodels` instead. 

#### Model 0: lowess on age

Note: the result of a lo(w)ess regression depends on the tools used. The values calculated below will be different compared to those seen on the `seaborn` regplot output.

In [None]:
lowess = sm.nonparametric.lowess
y_hat_lowess = lowess(df.price, df.age)

In [None]:
y_hat_lowess[0:10]

In [None]:
# keep only the second column of the lowess output (the fitted values) as a plain list
y_hat_lowess = list(y_hat_lowess[:, 1])
y_hat_lowess[0:10]


#### Model 1: Linear regression on age

We are building models by adding more and more explanatory variables. 

In [None]:
reg1 = smf.ols("price ~ age + agesq", data = df).fit(cov_type="HC0")

In [None]:
print(reg1.get_robustcov_results(cov_type='HC1').summary())

In [None]:
reg1.bic

In [None]:
reg1.params.map('{:,.0f}'.format)

Note:   
BIC = $n*ln(SSE/n)+k*ln(n)$

#### Models 2–5: Expanding the base model with new explanatory variables

In [None]:
# Nested specifications of increasing size, all fitted on df with the
# heteroskedasticity-robust "HC0" covariance type.

# reg2: quadratic in age plus a linear odometer term
reg2 = smf.ols("price ~ age + agesq + odometer", data = df).fit(cov_type="HC0")
# reg3: adds the squared odometer, the LE indicator, two condition dummies and dealer
reg3 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE + cond_excellent + cond_good + dealer",
    data = df,
).fit(cov_type="HC0")
# reg4: additionally includes XLE, SE, cond_likenew and cylind6
reg4 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    data = df,
).fit(cov_type="HC0")
# reg5: same variables as reg4, each also interacted with age
# (in the formula syntax `a * b` stands for a + b + a:b)
reg5 = smf.ols(
    "price ~ age + agesq + odometer + odometersq + LE * age + XLE * age + SE * age + cond_likenew * age + cond_excellent * age + cond_good * age + cylind6 * age + odometer * age + dealer * age",
    data = df,
).fit(cov_type="HC0")

In [None]:
models = [reg1, reg2, reg3, reg4, reg5]
robustcov_results = []

# Recompute every model's covariance as HC1, keep the summary and print it
# under a "reg<N>" heading that is preceded by an empty line.
for i, model in enumerate(models):
    result = model.get_robustcov_results(cov_type='HC1').summary()
    robustcov_results += [result]
    print("", f'Regression: reg{i+1}', result, sep="\n")

In [None]:
stargazer.Stargazer([reg1])

In [None]:
plt.plot(df.age, reg1.predict(), color = 'steelblue', linestyle = '-')
plt.plot(df.age, y_hat_lowess, color = 'k', linestyle = "--")
plt.legend(labels = ['regression 1', "statsmodel's lowess"], labelcolor = ['steelblue', 'black'])
plt.title("Regression: model 1 vs statsmodel's lowess");

In [None]:
bic = [round(x.bic, 2) for x in [reg1,reg2,reg3,reg4,reg5]]
sg = stargazer.Stargazer([reg1,reg2,reg3,reg4,reg5])
sg.add_line('BIC', bic, location=stargazer.LineLocation.FOOTER_BOTTOM)
sg

To learn how to customize the `Stargazer` output, see [here](https://github.com/StatsReporting/stargazer/blob/master/examples.ipynb). 

#### Model 2: Linear Regression with cross validation

In [None]:
from sklearn.model_selection import KFold

# df was sorted by age for the charts, so unshuffled folds would be contiguous
# age ranges (each test fold holding ages its training folds barely cover).
# Shuffle the rows into the folds; the fixed seed keeps the split reproducible.
k = KFold(n_splits=4, shuffle=True, random_state=42)

In [None]:
type(k.split(df))

`Generator functions` are a special kind of function that returns a [lazy iterator](https://en.wikipedia.org/wiki/Lazy_evaluation). Lazy iterators are objects that you can loop over like a list. However, unlike lists, they do not store their contents in memory.

The `split()` method generates indices to split data into training and test set. It returns the training and the test set indices for that split.

In [None]:
# Show the positional row indices of the training and test part of every split;
# enumerate() numbers the splits from 0 without a hand-maintained counter.
for n, (train_index, test_index) in enumerate(k.split(df)):
    print(f'Split {n}: \n')
    print(train_index, '\n', '\n', test_index, '\n', '-'*70, '\n')

In [None]:
### Cross validate OLS with combining sklearn k-fold cross validation and statsmodels ols formula


def cv_reg(formula, df, kfold, robustse=None):
    """Fit an OLS formula on the training part of every fold of a k-fold split.

    Parameters
    ----------
    formula : str
        Model formula handed to ``smf.ols``. The text before ``~`` must be a
        plain column name of ``df``; it is used to look up the observed target.
    df : pandas.DataFrame
        Data to split and fit on.
    kfold : object
        Splitter whose ``split(df)`` yields ``(train_index, test_index)``
        pairs of positional row indices (e.g. a scikit-learn ``KFold``).
    robustse : str, optional
        Passed to ``fit`` as ``cov_type``; when ``None`` the default
        covariance is used.

    Returns
    -------
    dict
        One entry per fold in each list:
        ``"regressions"`` fitted results, ``"test_predict"`` predictions for
        the held-out rows, ``"r2"`` R-squared of the fit, ``"rmse"`` RMSE on
        the *training* rows, ``"rmse_test"`` RMSE on the held-out rows.
    """
    # Target column name; strip() makes "price ~ age" work as well as "price~age"
    target = formula.split("~")[0].strip()

    regression_list = []
    predicts_on_test = []
    rsquared = []
    rmse_list = []
    rmse_test_list = []

    # Calculating OLS for each fold.
    # Use the splitter that was passed in, not a global of the notebook.
    for train_index, test_index in kfold.split(df):
        df_train, df_test = df.iloc[train_index, :], df.iloc[test_index, :]

        ols = smf.ols(formula, data=df_train)
        model = ols.fit() if robustse is None else ols.fit(cov_type=robustse)
        test_prediction = model.predict(df_test)

        regression_list.append(model)
        predicts_on_test.append(test_prediction)
        rsquared.append(model.rsquared)
        # "rmse" keeps its original meaning: in-sample error on the training fold
        rmse_list.append(rmse(df_train[target], model.predict()))
        # out-of-sample error on the held-out fold
        rmse_test_list.append(rmse(df_test[target], test_prediction))

    return {
        "regressions": regression_list,
        "test_predict": predicts_on_test,
        "r2": rsquared,
        "rmse": rmse_list,
        "rmse_test": rmse_test_list,
    }


def summarize_cv(cvlist, stat="rmse"):
    """Tabulate one per-fold statistic for several cross-validated models.

    Parameters
    ----------
    cvlist : list of dict
        One dict per model; ``cvlist[i][stat]`` is a sequence holding one
        value per fold.
    stat : str, default "rmse"
        Key of the statistic to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model1 .. ModelN`` (in ``cvlist`` order); rows
        ``Fold1 .. FoldK`` followed by an ``Average`` row with the column means.

    Raises
    ------
    ValueError
        If ``cvlist`` is empty.
    """
    if not cvlist:
        raise ValueError("cvlist must contain at least one cross-validation result")

    # Iterate over the argument itself, not over a global of the notebook
    result = pd.DataFrame(
        {f"Model{i}": cv[stat] for i, cv in enumerate(cvlist, start=1)}
    )
    # Label the folds from the rows actually present for the requested statistic
    result["Resample"] = [f"Fold{i}" for i in range(1, len(result) + 1)]
    result = result.set_index("Resample")

    average = result.mean().to_frame("Average").T
    return pd.concat([result, average])

In [None]:
cv1 = cv_reg("price~age+agesq", df, k, "HC0")
cv2 = cv_reg("price~age+agesq+odometer", df, k, "HC0")
cv3 = cv_reg(
    "price~age+agesq+ odometer + odometersq + LE + cond_excellent + cond_good + dealer",
    df,
    k,
    "HC0",
)
cv4 = cv_reg(
    "price~age+agesq+ odometer + odometersq + LE + XLE + SE + cond_likenew + cond_excellent + cond_good + cylind6 + dealer",
    df,
    k,
    "HC0",
)
cv5 = cv_reg(
    "price~age+agesq + odometer + odometersq + LE*age + XLE*age + SE*age + cond_likenew*age + cond_excellent*age + cond_good*age + cylind6*age + odometer*age + dealer*age",
    df,
    k,
    "HC0",
)

In [None]:
cv1

In [None]:
cv_list = [cv1, cv2, cv3, cv4, cv5]

In [None]:
summarize_cv(cv_list).style.format('{:,.1f}')

### Prediction

In [None]:
df = df[
    [
        "age",
        "agesq",
        "odometer",
        "odometersq",
        "SE",
        "LE",
        "XLE",
        "cond_likenew",
        "cond_excellent",
        "cond_good",
        "dealer",
        "price",
        "cylind6"
    ]
]

In [None]:
df.dtypes

In [None]:
# A single hypothetical car as a one-row frame: the Series becomes one column,
# which the transpose turns into one row whose columns are the variable names.
# price is left missing (NaN).
new = pd.Series(
    {
        "age": 10,
        "agesq": 10 ** 2,
        "odometer": 12,
        "odometersq": 12 ** 2,
        "SE": 0,
        "LE": 1,
        "XLE": 0,
        "cond_likenew": 0,
        "cond_excellent": 1,
        "cond_good": 0,
        "dealer": 0,
        "price": np.nan,
        "cylind6": 0,
    }
).to_frame().T
new

In [None]:
reg1.resid.describe().map('{:,.0f}'.format)

In [None]:
p1=reg1.get_prediction(new).summary_frame()

In [None]:
p1.map('{:,.0f}'.format)

In [None]:
reg3.resid.describe().map('{:,.0f}'.format)

In [None]:
p3=reg3.get_prediction(new).summary_frame()

In [None]:
p3.map('{:,.0f}'.format)

In [None]:
#get model3 rmse
rmse(reg3.fittedvalues,df.price)

In [None]:
# Point prediction and prediction-interval bounds of models 1 and 3 side by side,
# taken from the first row of each summary frame and shown without decimals
pd.DataFrame(
    {
        "Model1": p1[["mean", "obs_ci_lower", "obs_ci_upper"]].iloc[0].tolist(),
        "Model3": p3[["mean", "obs_ci_lower", "obs_ci_upper"]].iloc[0].tolist(),
    },
    index=pd.Index(["Predicted", "PI_low(95%)", "PI_high(95%)"], name=" "),
).map('{:,.0f}'.format)

In [None]:
# summary of predictions and PI 80% version
p1=reg1.get_prediction(new).summary_frame(alpha=0.2)
p3=reg3.get_prediction(new).summary_frame(alpha=0.2)

# Point prediction and prediction-interval bounds of models 1 and 3 side by side,
# taken from the first row of each summary frame and shown without decimals
pd.DataFrame(
    {
        "Model1": p1[["mean", "obs_ci_lower", "obs_ci_upper"]].iloc[0].tolist(),
        "Model3": p3[["mean", "obs_ci_lower", "obs_ci_upper"]].iloc[0].tolist(),
    },
    index=pd.Index(["Predicted", "PI_low(80%)", "PI_high(80%)"], name=" "),
).map('{:,.0f}'.format)