# <center>Class 8: Complicated Patterns </center>

In [None]:
import os
import sys
import warnings
from typing import List
import copy

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline

## Data - Hotels

In [None]:
path = os.path.join(os.pardir, 'data', 'hotels-vienna.csv') # this will produce a path with the right syntax for your operating system
path

In [None]:
df_hotels = pd.read_csv(path)

In [None]:
df_hotels

In [None]:
# Keep only 3-4 star hotels that are actually in Vienna and cost at most 600.
# .copy() makes the result an independent DataFrame: later cells add columns to it and
# sort it, and without the copy pandas treats those writes as writes to a slice
# (SettingWithCopyWarning, which the global warnings filter above would hide).
df_hotels = df_hotels[
    (df_hotels.accommodation_type == 'Hotel')
    & (df_hotels.city_actual == 'Vienna')
    & (df_hotels.stars >= 3)
    & (df_hotels.stars <= 4)
    & (df_hotels.price <= 600)
].copy()


In [None]:
df_hotels.shape

Adding log variables.

In [None]:
# Log price, plus a log distance that stays finite for hotels right at the center:
# distances below 0.05 are raised to 0.05 before the log is taken.
df_hotels["lnprice"] = np.log(df_hotels.price)
df_hotels["distance2"] = df_hotels.distance.clip(lower = 0.05)
df_hotels["lndistance"] = np.log(df_hotels.distance2)

In [None]:
df_hotels.price.describe()

In [None]:
df_hotels.lnprice.describe()

In [None]:
# Sorting is only needed for plotting; linear regression does not require sorted data.
# Rebinding (instead of inplace=True) keeps the cell idempotent and avoids mutating a filtered frame.
df_hotels = df_hotels.sort_values(by = 'distance', ascending = True)

<br> 

As a reminder: prices and distances are not normally distributed. **Why is the normal distribution important?**

In [None]:
# Histogram of room prices in 25-dollar bins from 50 to 400; ticks sit on the bin edges.
df_hotels.price.plot.hist(
    bins = range(50, 425, 25),
    xticks = range(50, 425, 25),
    rwidth = 0.9,
    figsize = (8,5),
    xlabel = 'USD',
    title = 'Distribution of room prices',
);

In [None]:
# Histogram of log prices in 11 equal-width bins; ticks every 0.25 from 3.75 to 6.25.
df_hotels.lnprice.plot.hist(
    bins = 11,
    xticks = [tick/100 for tick in range(375, 650, 25)],
    rwidth = 0.9,
    figsize = (8,5),
    xlabel = 'log of USD prices',
    title = 'Distribution of log room prices',
);

In [None]:
# Histogram of distances in half-mile bins from 0 to 6.5; ticks sit on the bin edges.
df_hotels.distance.plot.hist(
    bins = [edge/10 for edge in range(0, 70, 5)],
    xticks = [edge/10 for edge in range(0, 70, 5)],
    rwidth = 0.9,
    figsize = (8,5),
    xlabel = 'miles',
    title = 'Distances from the city center',
);

In [None]:
# Histogram of log distances in 20 equal-width bins; ticks every 0.5 from -3 to 2.
df_hotels.lndistance.plot.hist(
    bins = 20,
    xticks = [tick/10 for tick in range(-30, 21, 5)],
    rwidth = 0.9,
    figsize = (8,5),
    xlabel = 'log miles',
    title = 'Log distances from the city center',
);

In [None]:
#or with seaborn
# log_scale=True bins the data on a logarithmic x axis, so the axis still shows miles
# (on a log scale), not log miles. distance2 (floored at 0.05) is used, as for lndistance,
# because the log of a zero distance is not defined.
sns.histplot(df_hotels.distance2, bins = 20, log_scale = True)
plt.xlabel('miles (log scale)')
plt.ylabel('Frequency')
plt.title('Log distances from the city center')
plt.show()


## Regression: Comparing Linear and Non-Linear Patterns

**price vs distance**

In [None]:
regression_1 = smf.ols('price ~ distance', data = df_hotels).fit(cov_type = 'HC0')

We are quantifying a linear relationship between price and distance. The regression line in the plot below is what `seaborn` estimates, not the fitted values from our regression. The two lines, however, should match, as there is only one solution to the error minimization problem.

In [None]:
# Scatter of price against distance with seaborn's own linear fit (no confidence band).
sns.regplot(
    x = 'distance', y = 'price', data = df_hotels,
    marker = '.', fit_reg = True, ci = None,
    scatter_kws = dict(color = 'royalblue'),
    line_kws = dict(color = 'k'),
)
plt.title('Vienna hotel prices vs distances from city center')
plt.xlabel('distance in miles')
plt.ylabel('price in USD');

This is what our model actually looks like.

In [None]:
print(regression_1.summary())

In [None]:
regression_1.pvalues

In [None]:
# Show the p-value with 10 decimals; the default display would round it to zero.
print('The p-value of distance variable: {:.10f}'.format(regression_1.pvalues.distance))

In [None]:
regression_1.tvalues.distance

In [None]:
regression_1.fvalue

In [None]:
regression_1.tvalues.distance**2

<br> 

**price vs log distance**

In [None]:
regression_2 = smf.ols('price ~ lndistance', data = df_hotels).fit(cov_type = 'HC0')

In [None]:
# Scatter of price against log distance with seaborn's own linear fit (no confidence band).
sns.regplot(
    x = 'lndistance', y = 'price', data = df_hotels,
    marker = '.', fit_reg = True, ci = None,
    scatter_kws = dict(color = 'dimgrey'),
    line_kws = dict(color = 'k'),
)
plt.title('Vienna hotel prices vs log distances from city center')
plt.xlabel('log distance in miles')
plt.ylabel('price in USD');

In [None]:
print(regression_2.summary())

<br> 

**log price vs distance**

In [None]:
regression_3 = smf.ols('lnprice ~ distance', data = df_hotels).fit(cov_type = 'HC0')

In [None]:
# Scatter of log price against distance with seaborn's own linear fit (no confidence band).
sns.regplot(
    x = 'distance', y = 'lnprice', data = df_hotels,
    marker = '.', fit_reg = True, ci = None,
    scatter_kws = dict(color = 'dimgrey'),
    line_kws = dict(color = 'k'),
)
plt.title('Vienna hotel log prices vs distances from city center')
plt.xlabel('distance in miles')
plt.ylabel('log price in USD');

In [None]:
print(regression_3.summary())

<br> 

**log price vs log distance**

In [None]:
regression_4 = smf.ols('lnprice ~ lndistance', data = df_hotels).fit(cov_type = 'HC0')

In [None]:
# Scatter of log price against log distance with seaborn's own linear fit (no confidence band).
sns.regplot(
    data = df_hotels, x = 'lndistance', y = 'lnprice'
    , marker = '.', fit_reg = True, ci = None
    , scatter_kws = {'color': 'dimgrey'}
    , line_kws = {'color': 'k'}
)
plt.xlabel('log distance in miles')
plt.ylabel('log price in USD') # the currency code was truncated to 'US'
plt.title('Vienna hotel log prices vs log distances from city center');

In [None]:
print(regression_4.summary2())

<br>

- Now interpret the $\beta_0$ and $\beta_1$ parameters of the four regressions. 
- Which model has the best fit?
- Which model shall we use if we also consider the issues with interpretation?

**Visualizing the fitted values from the log-log regression in the original (non-transformed) variable space.**

In [None]:
# NOTE: despite the name, these are the fitted values of the log-log model,
# i.e. fitted lnprice (log units), not dollar prices.
price_hat = regression_4.fittedvalues

In [None]:
price_hat

<br> 

Fitted values from the regression are calculated for the *log prices*. We, however, are NOT interested in the log prices but in the original dollar values. For this we need to convert the fitted log values to the original units (dollars in this case). This conversion is less straightforward than you would think: we need to adjust $e^{\widehat{\ln y_i}}$ by a function of the standard deviation of the residual, $\hat{\sigma}$, of the regression model with $\ln y$ on its left-hand side.

<center>$\hat{y}_i = e^{\widehat{\ln y_i}} \cdot e^{\hat{\sigma}^2/2}$ </center>
<br>

This comes from the fact that the mean of the lognormal distribution is $e^{\mu+{\sigma^2/2}} = e^{\mu}*e^{\sigma^2/2}$


See more in Békés-Kézdi @3.9: mean of a lognormal distribution & @14.3 prediction from a log model

1. Get the residuals
2. Calculate their variance
3. Use it in the formula

In [None]:
regression_4.resid

In [None]:
# Variance of the log-log model's residuals. np.var uses ddof=0 by default,
# i.e. it divides by the number of observations.
residual_variance = np.var(regression_4.resid)
residual_variance

In [None]:
# Convert fitted log prices to dollars: exp(fitted ln price) * exp(residual variance / 2).
# The input is taken from regression_4.fittedvalues rather than from price_hat itself,
# so re-running this cell gives the same result instead of exponentiating twice.
# .tolist() keeps price_hat a plain list of floats, as before.
price_hat = (np.exp(regression_4.fittedvalues) * np.exp(residual_variance / 2)).tolist()

In [None]:
price_hat[0:10]

In [None]:
# Actual prices (dots) and the log-log model's fitted prices (line), both in dollars.
fig = plt.figure(figsize = (6,4))
ax = fig.add_axes([0,0,1,1]) # axes fill the whole figure
ax.scatter(
    x = df_hotels.distance, y = df_hotels.price,
    s = 5, color = 'cornflowerblue', label = 'actual price',
)
ax.plot(df_hotels.distance, price_hat, label = 'fitted price', color = 'k')
plt.title('Actual and fitted hotel price values from the log-log regression')
plt.legend(labelcolor = ['cornflowerblue', 'black'])
plt.xlabel('distance in miles')
plt.ylabel('price in USD');

**Question**: is this a _linear_ regression?

### Data - Life Expectancy & Income

In [None]:
path = os.path.join(os.pardir, 'data', 'worldbank-lifeexpectancy.csv') 
path

In [None]:
df_wb = pd.read_csv(path)

In [None]:
df_wb.head()

In [None]:
df_wb.info()

In [None]:
df_wb.year.unique()

In [None]:
# Keep the 2017 cross-section only. .copy() makes it an independent DataFrame, so the
# sorting and the new columns added in later cells do not act on a slice of the raw data
# (which would raise SettingWithCopyWarning if warnings were not silenced).
df_wb = df_wb[df_wb.year == 2017].copy()
df_wb

In [None]:
# Sorted by GDP per capita only so that fitted lines can be drawn left to right.
# Rebinding (instead of inplace=True) avoids mutating a filtered frame in place.
df_wb = df_wb.sort_values(by = 'gdppc')

In [None]:
# Renumber the rows 0..n-1 in the current order and discard the old index.
df_wb = df_wb.reset_index(drop = True)

In [None]:
# Total GDP = GDP per capita x population, then the natural logs of both GDP measures.
df_wb["gdptot"] = df_wb.gdppc * df_wb.population
df_wb["lngdppc"] = np.log(df_wb.gdppc)
df_wb["lngdptot"] = np.log(df_wb.gdptot)

### EDA

In [None]:
df_wb[["lifeexp", "gdppc", "gdptot", "lngdppc", "lngdptot"]].describe().T.round(3)

In [None]:
# Distribution of life expectancy in 31 bins, with an annotation inside the plot area.
sns.histplot(df_wb.lifeexp, bins = 31)
plt.title('Distribution of life expectancy in 2017')
plt.xlabel('life expectancy in years')
plt.text(55, 15, 'definitely not lognormal');

In [None]:
# GDP per capita in bins of width 5 from 0 to 120; shrink < 1 leaves gaps between the bars.
sns.histplot(
    df_wb.gdppc,
    bins = range(0,125,5),
    shrink = 0.90,
)
plt.title('Distribution of GDP per capita in 2017')
plt.xlabel('thousand USD')
plt.text(40, 40, 'What distribution is this?');

In [None]:
# Same variable on a logarithmic x axis, in 24 bins.
sns.histplot(
    df_wb.gdppc,
    bins = 24,
    log_scale = True,
    shrink = 0.90,
)
plt.title('Distribution of GDP per capita in 2017')
plt.xlabel('thousand USD')
plt.text(0.75, 12, 'using it as lognormal is OK');

In [None]:
# Raw scatter of life expectancy against GDP per capita.
sns.scatterplot(x = 'gdppc', y = 'lifeexp', data = df_wb)
plt.title('GDP per capita vs life expectancy')
plt.ylabel('years')
plt.xlabel('thousand USD');

Adding a regression line.

In [None]:
# Same scatter with seaborn's linear fit drawn in black (no confidence band).
sns.regplot(
    x = 'gdppc', y = 'lifeexp', data = df_wb,
    fit_reg = True, ci = None,
    line_kws = dict(color = 'k'),
)
plt.title('GDP per capita vs life expectancy - linear model')
plt.ylabel('years')
plt.xlabel('thousand USD');

In [None]:
# Same scatter with a lowess (locally weighted) smoother instead of a straight line.
sns.regplot(
    x = 'gdppc', y = 'lifeexp', data = df_wb,
    lowess = True, fit_reg = True, ci = None,
    line_kws = dict(color = 'k'),
)
plt.title('GDP per capita vs life expectancy - lowess')
plt.ylabel('years')
plt.xlabel('thousand USD');

Regplot with log in GDP per capita

Original data, transformed x-axis.

In [None]:
# logx=True makes seaborn regress y on log(x); the axis is then switched to a log scale
# so that the fitted curve appears as a straight line.
g = sns.regplot(
    x = 'gdppc', y = 'lifeexp', data = df_wb,
    logx = True, fit_reg = True, ci = None,
    line_kws = dict(color = 'k'),
)
plt.semilogx() # logarithmic x axis
plt.title('Log GDP per capita vs life expectancy')
plt.ylabel('years')
plt.xlabel('thousand USD, log scale');

Reformat x-axis values to meaningful units.

In [None]:
g.get_xticks()

In [None]:
['{:,.0f}'.format(x) for x in g.get_xticks()]

In [None]:
# Level-log fit on a log-scaled x axis, with the tick labels rewritten as plain numbers.
g = sns.regplot(
    x = 'gdppc', y = 'lifeexp', data = df_wb,
    logx = True, fit_reg = True, ci = None,
    line_kws = dict(color = 'k'),
)
plt.semilogx() # logarithmic x axis
xlabels = [format(tick, ',.0f') for tick in g.get_xticks()] # tick positions with thousands separators and no decimals
g.set_xticklabels(xlabels) # replace the default tick labels
plt.title('Log GDP per capita vs life expectancy')
plt.ylabel('years')
plt.xlabel('thousand USD, log scale');

The fitted values from the level-log regression look like this on the original data.

In [None]:
# Seaborn fits the regression on log(x), but the axis is left untransformed,
# so the fitted line shows up as a curve in the original units.
g = sns.regplot(
    x = 'gdppc', y = 'lifeexp', data = df_wb,
    logx = True, fit_reg = True, ci = None,
    line_kws = dict(color = 'k'),
)
xlabels = [format(tick, ',.0f') for tick in g.get_xticks()]
g.set_xticklabels(xlabels)
plt.title('Log GDP per capita vs life expectancy \noriginal scale')
plt.ylabel('years')
plt.xlabel('thousand USD');

### Regressions: Logs, Splines & Polynomials

#### Logs

**level-level**

In [None]:
regression_life_1 = smf.ols(formula = 'lifeexp ~ gdppc', data = df_wb).fit(cov_type = 'HC0')

In [None]:
print(regression_life_1.summary())

**level-log**

In [None]:
regression_life_2 = smf.ols(formula = 'lifeexp ~ lngdppc', data = df_wb).fit(cov_type = 'HC0')

In [None]:
print(regression_life_2.summary())

#### Spline

For a spline we need helper functions.

In [None]:
# Knot of the spline. GDP per capita is plotted in thousand USD in this notebook, so 50 stands for USD 50K.
cutoff = 50
# The spline is fitted on log GDP per capita, so the knot is needed in logs as well.
cutoff_ln = np.log(cutoff)
cutoff_ln

In [None]:
def knot_ceil(vector: np.array, knot: float) -> np.array:
    """
    Cap the elements of an array at a ceiling value.

    Returns a new array in which every element greater than `knot` is replaced by
    `knot`; the input is left untouched. The result is computed with `np.minimum`,
    so an integer array combined with a fractional knot is promoted to float
    instead of having the knot truncated to an integer. NaN values stay NaN.

    Parameters:
    vector (np.array): The input numpy array.
    knot (float): The ceiling value to apply.

    Returns:
    np.array: A new numpy array with elements capped at the ceiling value.

    Example:
    >>> import numpy as np
    >>> vector = np.array([1, 2, 3, 4, 5])
    >>> knot = 3
    >>> knot_ceil(vector, knot)
    array([1, 2, 3, 3, 3])
    """

    # Element-wise minimum allocates a fresh array, so no explicit copy is needed.
    return np.minimum(vector, knot)


def lspline(series: pd.Series, knots: List[float]) -> np.array:
    """
    Generate a linear spline basis matrix for a given pandas Series and knots.

    Each column holds the part of the value that falls into one segment: the first
    column is the value capped at the first knot, every following column is what is
    left over capped at the width of the next segment, and the last column is the
    remainder above the last knot. The columns of each row add up to the original value.

    Parameters:
    series (pd.Series): The input values (any 1-D array-like is accepted).
    knots (List[float]): Knot value(s) where the spline should change slope, in
        increasing order. A single number, a list, a tuple or an array all work.
        With no knots the result is the input as a single column.

    Returns:
    np.array: A design matrix with len(knots) + 1 columns, one per segment.

    Example:
    >>> import pandas as pd
    >>> series = pd.Series([1, 2, 3, 4, 5])
    >>> knots = [2, 4]
    >>> lspline(series, knots)
    array([[1, 0, 0],
           [2, 0, 0],
           [2, 1, 0],
           [2, 2, 0],
           [2, 2, 1]])
    """

    # Accept a scalar as well as any sequence of knots (list, tuple, array).
    knot_list = np.atleast_1d(knots).tolist()
    remainder = np.asarray(series)

    columns = []
    previous_knot = 0
    for knot in knot_list:
        # Cap what is left at the width of the current segment ...
        column = np.minimum(remainder, knot - previous_knot)
        columns.append(column)
        # ... and carry the excess over to the next segment.
        remainder = remainder - column
        previous_knot = knot

    # Whatever exceeds the last knot forms the final column.
    columns.append(remainder)
    return np.column_stack(columns)

User-defined functions can directly be applied to statsmodels' formula definition! 

In [None]:
# Piecewise linear spline in log GDP per capita with a single knot at cutoff_ln.
# The keyword must be spelled cov_type: a misspelt name is not rejected by fit(),
# and the model would then report non-robust standard errors instead of HC0.
regression_spline = smf.ols(formula = 'lifeexp ~ lspline(lngdppc, cutoff_ln)', data = df_wb).fit(cov_type = 'HC0')

In [None]:
print(regression_spline.summary())

<br>

**Questions**: 

- What do the t-value and the p-value of $\beta_2$ suggest? How can we interpret $\beta_2$?
- Why did we pick USD 50K as cutoff? How can we find the best cutoff? 

In [None]:
# fittedvalues is a Series indexed like the estimation sample, so the assignment is aligned
# by row label and still works if rows were dropped for missing data (predict() returns a bare array).
df_wb['spline_prediction'] = regression_spline.fittedvalues

In [None]:
# Data and the spline's fitted values on a log-scaled x axis, with plain-number tick labels.
g = sns.scatterplot(data = df_wb, x = 'gdppc', y = 'lifeexp')
plt.plot(df_wb.gdppc, df_wb.spline_prediction, color = 'k')
plt.semilogx() # logarithmic x axis
xlabels = [format(tick, ',.0f') for tick in g.get_xticks()]
g.set_xticklabels(xlabels)
plt.title('Log GDP per capita vs life expectancy \npiecewise linear spline regression')
plt.ylabel('years')
plt.xlabel('thousand USD, log scale');

In [None]:
# Same fitted values, but the x axis is deliberately left on the original (linear) scale.
g = sns.scatterplot(data = df_wb, x = 'gdppc', y = 'lifeexp')
plt.plot(df_wb.gdppc, df_wb.spline_prediction, color = 'k')
xlabels = [format(tick, ',.0f') for tick in g.get_xticks()]
g.set_xticklabels(xlabels)
plt.title('GDP per capita vs life expectancy \npiecewise linear spline regression on log GDP')
plt.xlabel('thousand USD, original scale')
plt.ylabel('years');

In [None]:
df_wb[(df_wb.gdppc > 20) & (df_wb.lifeexp < 60)]

**Question**: what could explain this outlier value?

#### Polynomial

In [None]:
# Squared log GDP per capita, the second-order term of the quadratic model.
df_wb['lngdppc_sq'] = df_wb['lngdppc'] ** 2

In [None]:
# Quadratic in log GDP per capita. The keyword must be spelled cov_type: a misspelt name
# is not rejected by fit(), and the model would then report non-robust standard errors.
regression_quadratic = smf.ols(formula = 'lifeexp ~ lngdppc + lngdppc_sq', data = df_wb).fit(cov_type = 'HC0')

In [None]:
print(regression_quadratic.summary())

In [None]:
df_wb['quadratic_prediction'] = regression_quadratic.fittedvalues

In [None]:
# Data and the quadratic model's fitted values on a log-scaled x axis.
g = sns.scatterplot(data = df_wb, x = 'gdppc', y = 'lifeexp')
plt.plot(df_wb.gdppc, df_wb.quadratic_prediction, color = 'k')
plt.semilogx() # logarithmic x axis
xlabels = [format(tick, ',.0f') for tick in g.get_xticks()]
g.set_xticklabels(xlabels)
plt.title('Log GDP per capita vs life expectancy \nquadratic regression')
plt.ylabel('years')
plt.xlabel('thousand USD, log scale');

#### Weighted regression

We are using population weights to estimate the regression parameters.

In [None]:
# Weighted least squares with each country weighted by its population, as the text of this
# section states (the weights previously passed were total GDP, gdptot).
regression_weighted = smf.wls(formula = 'lifeexp ~ lngdppc', data= df_wb, weights = df_wb.population).fit(cov_type = 'HC0')

In [None]:
print(regression_weighted.summary())

In [None]:
# Marker area is scaled by population, the weighting this section is about,
# so that large countries stand out in the scatter.
g = sns.scatterplot(x = 'gdppc', y= 'lifeexp', data = df_wb, size = 'population', sizes = (10,500))
plt.plot(df_wb.gdppc, regression_weighted.fittedvalues, color = 'k')
plt.semilogx() # logarithmic x axis
xlabels = ['{:,.0f}'.format(x) for x in g.get_xticks()]
g.set_xticklabels(xlabels)
plt.title('Log GDP per capita vs life expectancy \nweighted regression')
plt.legend(labelspacing = 1) # extra vertical room so the large legend markers do not overlap
plt.xlabel('thousand USD, log scale');
plt.ylabel('years');