# <center>Class 9: Generalizing Results of a Regression </center>

In [None]:
import os
import sys
import warnings
from typing import List
import copy
import datetime

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.nonparametric.kernel_regression as loess
from stargazer.stargazer import Stargazer

import matplotlib.pyplot as plt
import seaborn as sns
from utils import lspline 

warnings.filterwarnings('ignore')

In [None]:
# !pip install stargazer

In [None]:
%matplotlib inline

## Data - Earnings

We are building multiple models on a subset of the data to explain why people earn as much as they do in a certain profession. After specifying some possible explanatory relationships we select the best performing one based on $R^2$. We visualize the predicted values and the uncertainty around them: the *confidence* and the *prediction intervals*. 

We then measure the performance of our best model on another subset of the earnings data.

In [None]:
# Location of the earnings CSV, built relative to the parent directory.
path = os.path.join(os.pardir, 'data', 'morg-2014-emp.csv') # this will produce a path with the right syntax for your operating system
path

In [None]:
# Read the CSV; index_col = 0 uses the first column as the row index.
df_earnings =  pd.read_csv(path, index_col = 0)

In [None]:
# Display the loaded frame.
df_earnings

In [None]:
# Column names, dtypes and non-null counts.
df_earnings.info()

**Note**: important variables

- age: age (numeric)
- sex: gender (binary)
- earnwke: weekly earnings (numeric)
- uhours: usual work hours (numeric)
- occ2012: occupational code (categorical)
- grade92: highest education grade completed (categorical)

Occupational classification (census 2010): https://www.bls.gov/cps/cenocc2010.htm   
Labels: http://data.nber.org/morg/docs/cpsx.pdf

### Filtering

**Occupation**: '_market research analyst_' (census code: 0735) & '_Computer and Mathematical Occupations_' (1000-1240)

Market research analysts will be our _sample 1_ and computer & mathematical occupations will be _sample 2_.

In [None]:
df_earnings['sample'] = 0

In [None]:
df_earnings.loc[df_earnings.occ2012 == 735, 'sample'] = 1
df_earnings.loc[((df_earnings.occ2012 >= 1005) & (df_earnings.occ2012 <= 1240)),'sample'] = 2

In [None]:
df_earnings['sample'].value_counts()

In [None]:
df_earnings = df_earnings[df_earnings['sample'] > 0]

In [None]:
df_earnings['sample'].value_counts()

### Feature Engineering

In [None]:
df_earnings['female'] = df_earnings.sex == 2

In [None]:
df_earnings['female']

In [None]:
df_earnings['w'] = df_earnings.earnwke / df_earnings.uhours # hourly wage
df_earnings['lnw'] = np.log(df_earnings.w)
df_earnings['agesq'] = np.power(df_earnings.age, 2)

Create two datasets:
- df_1 for market researchers
- df_2 for math and CS occupations

In [None]:
# One frame per sample: 1 = market research analysts, 2 = computer & mathematical occupations.
df_1 = df_earnings.loc[df_earnings['sample'].eq(1)]
df_2 = df_earnings.loc[df_earnings['sample'].eq(2)]

In [None]:
# Summary statistics for earnings, hours, hourly wage and age: one variable per row, formatted to one decimal.
df_1[['earnwke', 'uhours', 'w', 'age']].describe().T.map('{:,.1f}'.format)

In [None]:
# Counts of the female flag in each of the two samples.
df_1.female.value_counts()

In [None]:
df_2.female.value_counts()

In [None]:
df_1.sort_values(by = 'age', ascending= True, inplace = True) # for plotting purpuses only
df_1.reset_index(drop = True, inplace = True)

In [None]:
df_2.sort_values(by = 'age', ascending= True, inplace = True) # for plotting purpuses only
df_2.reset_index(drop = True, inplace = True)

### EDA

We are exploring market research analysts. What are the key factors in determining wages in this profession?

In [None]:
# Histogram of hourly wages with bin edges every 5 USD from 0 to 90.
sns.histplot(df_1.w, shrink=0.9, bins = range(0,95,5))
plt.xticks(range(0,95,5))
plt.xlabel('USD')
plt.title('Hourly wages');

In [None]:
# Box plots of hourly wage, one box per value of the female flag.
sns.boxplot(data = df_1, x = 'female', y = 'w')
plt.xlabel(None)
plt.title('Wage distribution, males vs females');

In [None]:
# Scatter of log wage against age with a lowess smoother drawn in black.
sns.regplot(
    data = df_1, x = 'age', y = 'lnw', 
    lowess = True, line_kws = {'color' : 'k'})
plt.title('Age vs wage with lowess');

How can we see whether composition effects are in play resulting in slightly lower wages for women? 

## Regressions

Because heteroscedastic residuals may be a problem in a cross-sectional dataset, we - as usual - define the appropriate (heteroscedasticity-robust) covariance matrix. 

#### Baseline: wage vs sex

In [None]:
# Baseline: OLS of log hourly wage on the female flag, fitted on sample 1.
# cov_type = 'HC0' selects the covariance matrix used for the standard errors.
reg_1 = smf.ols(formula = 'lnw ~ female', data = df_1).fit(cov_type = 'HC0')

In [None]:
print(reg_1.summary())

In [None]:
# R-squared of the baseline model, to four decimals.
print(f'R2: {reg_1.rsquared:.4f}')

**Presenting regression results**: the `stargazer` package produces neatly formatted, customizable regression summaries.

In [None]:
# Render the baseline model as a Stargazer table (displayed as the cell output).
Stargazer([reg_1])

#### Effect of age on wage

**linear age**

In [None]:
# Log wage on age entered linearly, fitted on sample 1 with the HC0 covariance type.
reg_2 = smf.ols(formula = 'lnw ~ age', data = df_1).fit(cov_type = 'HC0')

In [None]:
print(reg_2.summary())

**quadratic term in age**

In [None]:
# Log wage on age and age squared (quadratic in age), fitted on sample 1 with the HC0 covariance type.
reg_3 = smf.ols(formula = 'lnw ~ age + agesq', data = df_1).fit(cov_type = 'HC0')

In [None]:
print(reg_3.summary())

**Questions**: 

- How do we interpret _agesq_?
- What does the second footnote mean?

**linear spline in age**

In [None]:
# Log wage on a linear spline in age; 30 and 40 are passed to lspline as the breakpoints.
reg_4 = smf.ols(formula = 'lnw ~ lspline(age, [30,40])', data = df_1).fit(cov_type = 'HC0')

In [None]:
print(reg_4.summary())

#### Comparing regression models

In [None]:
# Side-by-side table of the four models fitted on sample 1, with readable covariate labels.
stargazer = Stargazer([reg_1, reg_2, reg_3, reg_4])
# NOTE(review): the spline keys are spelled `[30, 40]` (with a space) while the formula was
# written as `[30,40]`; presumably the formula engine normalizes the term name. Confirm the
# renames take effect in the rendered table.
stargazer.rename_covariates(
    {
        'Intercept': 'Constant',
        'agesq': 'age squared',
        'female[T.True]': 'female',
        'lspline(age, [30, 40])[0]': 'age spline < 30',
        'lspline(age, [30, 40])[1]': 'age spline 30–40',
        'lspline(age, [30, 40])[2]': 'age spline > 40',
    }
)
stargazer

#### Comparing regression lines

**add lowess**

In [None]:
reg_5 = sm.nonparametric.lowess
y_hat_lowess = reg_5(df_1.lnw, df_1.age)
y_hat_lowess[0:10]

In [None]:
y_hat_lowess = [x[1] for x in y_hat_lowess]

In [None]:
y_hat_age = reg_2.predict()
y_hat_agesq = reg_3.predict()
y_hat_spline = reg_4.predict()

In [None]:
# Overlay the fitted curves of the lowess, quadratic and spline models against age.
fig, ax = plt.subplots()
curve_specs = [
    (y_hat_lowess, '-.', 'lowess'),
    (y_hat_agesq, ':', 'polinomial'),
    (y_hat_spline, '-', 'linear spline'),
]
for fitted, dash, curve_name in curve_specs:
    ax.plot(df_1['age'], fitted, color = 'k', linestyle = dash, label = curve_name)
ax.legend()
ax.set_xlabel('years')
ax.set_ylabel('log wage')
ax.set_title('Regression model predictions/fitted values');

### Confidence and prediction intervals

Our best-performing model is the regression with linear spline (reg_4). We are using this model for getting the _confidence_ and the _prediction_ intervals.  

In [None]:
# In-sample predictions of the spline model as a frame; the plots below read its
# 'mean', 'mean_ci_lower'/'mean_ci_upper' and 'obs_ci_lower'/'obs_ci_upper' columns.
df_summary_frame = reg_4.get_prediction().summary_frame()

In [None]:
df_summary_frame

**Confidence interval**

Remember: the `confidence interval` is the measure of uncertainty regarding the _conditional expected value_ of log wage. The word _conditional_ refers to the fact that we are looking for the expected value of log wage for a _given age_ (expected log wage conditional on age). This is NOT the area where our datapoints should lie, as the value of an individual observation depends on the _general pattern_ AND _noise_. This is the **uncertainty of the general pattern**. This uncertainty stems from the fact that we only have a sample of all possible members of the underlying population, and this sample is, by definition, noisy. This noisiness of the sample leads to uncertainty regarding the general pattern. 

In [None]:
# Observed log wages, the spline fit, and the band between the mean_ci bounds.
ages = df_1['age']
ci_low = df_summary_frame['mean_ci_lower']
ci_high = df_summary_frame['mean_ci_upper']

fig, ax = plt.subplots()
ax.scatter(ages, df_1['lnw'], s = 3)
ax.plot(ages, df_summary_frame['mean'], color = 'k', linestyle = '-')
ax.fill_between(ages, ci_low, ci_high, color = 'darkblue', alpha = 0.5)
ax.set_title('Spline regression fitted values with confidence intervals')
ax.set_xlabel('age')
ax.set_ylabel('log wage');

**Prediction interval**

The `prediction interval` is the uncertainty around the *individual observation*. Since $data = pattern + noise$, the value of individual observations will have additional uncertainty on top of the uncertain pattern. This is why the prediction interval is way wider than the confidence interval. 

In summary:

<center>
    \begin{equation}
    \text{prediction  interval = uncertain pattern + additional uncertainty from noisiness}
    \end{equation}
</center>

In [None]:
# Observed log wages, the spline fit, and the band between the obs_ci bounds.
ages = df_1['age']
pi_low = df_summary_frame['obs_ci_lower']
pi_high = df_summary_frame['obs_ci_upper']

fig, ax = plt.subplots()
ax.scatter(ages, df_1['lnw'], s = 3)
ax.plot(ages, df_summary_frame['mean'], color = 'k', linestyle = '-')
ax.fill_between(ages, pi_low, pi_high, color = 'darkblue', alpha = 0.5)
ax.set_title('Spline regression fitted values with prediction intervals')
ax.set_xlabel('age')
ax.set_ylabel('log wage');

**Confidence interval for the polynomial model**

In [None]:
df_summary_frame_poli = reg_3.get_prediction().summary_frame()

In [None]:
fig, ax = plt.subplots()
ax.scatter(df_1.age, df_1.lnw, s = 3)
ax.plot(df_1.age, df_summary_frame_poli['mean'], color = 'k', linestyle = '-')
ax.fill_between(df_1.age, df_summary_frame_poli.mean_ci_lower, df_summary_frame_poli.mean_ci_upper, color = 'darkblue', alpha = 0.5)
plt.title('Polinomial regression fitted values with confidence intervals')
plt.xlabel('age')
plt.ylabel('log wage');

### External validity

How does the spline model perform on the computer science dataset?

In [None]:
# Same spline specification as reg_4, re-estimated on sample 2 (df_2).
reg_4_2 = smf.ols(formula = 'lnw ~ lspline(age, [30,40])', data = df_2).fit(cov_type = 'HC0')

In [None]:
print(reg_4_2.summary())

In [None]:
# R-squared of the spline specification in each sample; each model is fitted on its own sample.
print(f'Spline model R2 on the market research dataset: {reg_4.rsquared:.3f}')
print(f'Spline model R2 on the computer science dataset: {reg_4_2.rsquared:.3f}')

<br>

And how about the polynomial model?

In [None]:
# Same quadratic specification as reg_3, re-estimated on sample 2 (df_2).
reg_3_2 = smf.ols(formula = 'lnw ~ age + agesq', data = df_2).fit(cov_type = 'HC0')

In [None]:
print(reg_3_2.summary())

In [None]:
# R-squared of the quadratic specification in each sample; each model is fitted on its own sample.
print(f'Polinomial model R2 on the market research dataset: {reg_3.rsquared:.3f}')
print(f'Polinomial model R2 on the computer science dataset: {reg_3_2.rsquared:.3f}')

#### Putting it all together: regressions in the two datasets

In [None]:
# R-squared table: one row per dataset, one column per specification, formatted to three decimals.
df_r2 = pd.DataFrame(
    {
        'polinomial': [reg_3.rsquared, reg_3_2.rsquared],
        'spline': [reg_4.rsquared, reg_4_2.rsquared],
    },
    index = pd.Index(['market research', 'computer science'], name = 'R2'),
).map('{:.3f}'.format)
df_r2

## Data - Hotels

We are checking the *external validity* of our model of Vienna hotel prices in November 2017 in two dimensions:

- same city, different points in time, and
- different cities, same time.

In [None]:
# Locations of the two hotel CSV files, built relative to the parent directory.
path_prices = os.path.join(os.pardir, 'data', 'hotels_europe_price.csv')
path_prices

In [None]:
path_features = os.path.join(os.pardir, 'data', 'hotels_europe_features.csv')
path_features

In [None]:
# Read the price file; index_col = 0 uses the first column as the row index.
df_hotels_europe_price = pd.read_csv(path_prices, index_col = 0)

In [None]:
df_hotels_europe_price

In [None]:
# Read the features file the same way.
df_hotels_features = pd.read_csv(path_features, index_col = 0)

In [None]:
df_hotels_features

In [None]:
# Left join on hotel_id: every price row is kept and the matching feature columns are attached.
df_hotels = pd.merge(
    df_hotels_europe_price, df_hotels_features, 
    how = 'left', left_on = 'hotel_id', right_on = 'hotel_id')

Note: since both dataframes have a 'hotel_id' column, `on = 'hotel_id'` would also work.

```python
df_hotels = pd.merge(df_hotels_europe_price, df_hotels_features, how = 'left', on = 'hotel_id')
```
<br>

**Question**: What is the SQL-equivalent of dataframe merge? Call the two tables HOTELS_PRICES and HOTELS_FEATURES.

```SQL
SELECT * 
FROM HOTELS_PRICES
LEFT JOIN HOTELS_FEATURES
ON HOTELS_PRICES.hotel_id = HOTELS_FEATURES.hotel_id
```
<br>


### Feature engineering

In [None]:
df_hotels

In [None]:
df_hotels.info()

In [None]:
df_hotels = df_hotels[df_hotels.city_actual.isin(['Vienna', 'Amsterdam', 'Barcelona'])]

In [None]:
df_hotels.sort_values(by = 'distance', inplace = True) # plotting

In [None]:
df_hotels.shape

In [None]:
df_hotels = df_hotels[df_hotels.accommodation_type.isin(['Hotel', 'Apartment'])]

In [None]:
df_hotels = df_hotels[df_hotels.nnights !=4]

In [None]:
df_hotels = df_hotels[df_hotels.price < 1000]

In [None]:
df_hotels = df_hotels.drop_duplicates()

In [None]:
df_hotels.loc[(df_hotels['month'] == 11) & (df_hotels['weekend'] == 0), 'date'] = '2017-NOV-weekday'
df_hotels.loc[(df_hotels['month'] == 11) & (df_hotels['weekend'] == 1), 'date'] = '2017-NOV-weekend'
df_hotels.loc[(df_hotels['month'] == 12) & (df_hotels['holiday'] == 1), 'date'] = '2017-DEC-holiday'
df_hotels.loc[(df_hotels['month'] == 6) & (df_hotels['weekend'] == 1), 'date'] = '2018-JUNE-weekend'

In [None]:
df_hotels = df_hotels[df_hotels.date.notna()]

In [None]:
df_hotels.city.value_counts()

In [None]:
pd.crosstab(index = df_hotels.accommodation_type, columns = df_hotels.city)

In [None]:
pd.crosstab(index = df_hotels.date, columns = df_hotels.city)

In [None]:
df_hotels['lnprice'] = np.log(df_hotels.price)

In [None]:
# Keep only the columns used in the rest of the analysis.
df_hotels = df_hotels[
    [
        'hotel_id',
        'date',
        'city',
        'accommodation_type',
        'stars',
        'rating',
        'distance',
        'price',
        'lnprice',
    ]
]

In [None]:
df_hotels.head()

### External validity of regression models: same city, various dates

Note: we are using linear splines.

#### EDA: Vienna only

In [None]:
# Vienna hotels with a star rating between 3 and 4 (both inclusive), all dates.
is_vienna_hotel = (
    df_hotels['stars'].between(3, 4)
    & df_hotels['accommodation_type'].eq('Hotel')
    & df_hotels['city'].eq('Vienna')
)
df_vienna = df_hotels.loc[is_vienna_hotel]

In [None]:
# Number of observations per date label.
df_vienna.date.value_counts()

In [None]:
# Summary statistics of distance, price and log price, one variable per row, rounded to 2 decimals.
df_vienna[['distance', 'price', 'lnprice']].describe().T.round(2)

In [None]:
# The same statistics, separately for each date label.
df_vienna.groupby('date')['distance'].describe().round(2)

In [None]:
df_vienna.groupby('date')['price'].describe().round(2)

In [None]:
df_vienna.groupby('date')['lnprice'].describe().round(2)

#### Regressions

Note: we are using linear splines.

In [None]:
# Date labels to compare; the models are fitted and the table columns are named in this order.
dates = [
    '2017-NOV-weekday',
    '2017-NOV-weekend',
    '2017-DEC-holiday',
    '2018-JUNE-weekend',
]

In [None]:
ls_models = []

for date in dates:
    ls_models.append(
        smf.ols(
            formula = 'lnprice ~ lspline(distance, 2)', 
            data = df_vienna[df_vienna.date == date]
        ).fit(cov_type = 'HC0')
    )

In [None]:
ls_models

In [None]:
stargazer = Stargazer(ls_models)

In [None]:
stargazer

Customizing Stargazer output.

In [None]:
# Put the two spline terms first and the intercept last.
stargazer.covariate_order(
    ['lspline(distance, 2)[0]', 'lspline(distance, 2)[1]', 'Intercept']
)
# Replace the raw term names with readable labels.
stargazer.rename_covariates(
    {
        'Intercept': 'Constant',
        'lspline(distance, 2)[0]': 'Distance spline <2',
        'lspline(distance, 2)[1]': 'Distance spline 2–7',
    }
)
# Head each column with its date label (each label spans one column) and hide the model numbers.
stargazer.custom_columns(dates, [1, 1, 1, 1])
stargazer.show_model_numbers(False)
stargazer

### External validity of regression models: same date, various cities

Still linear splines.

In [None]:
# Hotels with a star rating between 3 and 4 (both inclusive) on the November weekday, all cities.
is_nov_weekday_hotel = (
    df_hotels['stars'].between(3, 4)
    & df_hotels['date'].eq('2017-NOV-weekday')
    & df_hotels['accommodation_type'].eq('Hotel')
)
df_cities = df_hotels.loc[is_nov_weekday_hotel]

#### EDA: Weekday in November only

In [None]:
# Number of observations by city and star rating.
pd.crosstab(index=df_cities['city'], columns = df_cities['stars'])

In [None]:
# Distance statistics by star rating.
df_cities.groupby('stars')['distance'].describe().round(3)

In [None]:
# Distance and price statistics by city.
df_cities.groupby('city')['distance'].describe().round(3)

In [None]:
df_cities.groupby('city')['price'].describe().round(3)

#### Regressions

In [None]:
ls_cities = ['Vienna', 'Amsterdam', 'Barcelona']

In [None]:
ls_models = []

for city in ls_cities:
    ls_models.append(
        smf.ols(
            formula = 'lnprice ~ lspline(distance, 2)', 
            data = df_cities[df_cities.city == city]
        ).fit(cov_type = 'HC0')
    )

In [None]:
# Regression table with one column per city, readable covariate labels and no model numbers.
stargazer = Stargazer(ls_models)
stargazer.rename_covariates(
    {
        'Intercept': 'Constant',
        'lspline(distance, 2)[0]': 'Distance spline <2',
        'lspline(distance, 2)[1]': 'Distance spline 2–7',
    }
)
# Each city name heads exactly one column.
stargazer.custom_columns(ls_cities, [1, 1, 1])
stargazer.show_model_numbers(False)
stargazer

### External validity: Hotels vs apartments in Vienna on a weekday in November

#### EDA: Hotels vs apartments

In [None]:
# Vienna on the November weekday, star rating between 3 and 4 (both inclusive), both accommodation types.
is_vienna_nov_weekday = (
    df_hotels['stars'].between(3, 4)
    & df_hotels['date'].eq('2017-NOV-weekday')
    & df_hotels['city'].eq('Vienna')
)
df_hotels_apartments = df_hotels.loc[is_vienna_nov_weekday]

In [None]:
# Number of observations by accommodation type and star rating, with row and column totals.
pd.crosstab(
    index = df_hotels_apartments.accommodation_type, 
    columns = df_hotels_apartments.stars, 
    margins = True)

In [None]:
# Price statistics by star rating.
df_hotels_apartments.groupby('stars')['price'].describe().round(1)

In [None]:
# Price and distance statistics by accommodation type.
df_hotels_apartments.groupby('accommodation_type')['price'].describe().round(1)

In [None]:
df_hotels_apartments.groupby('accommodation_type')['distance'].describe().round(2)

In [None]:
# This is a bit more difficult to read but provides meaningful insight into the differences in pricing.
# Price statistics for every (stars, accommodation type) combination.
df_hotels_apartments.groupby(['stars','accommodation_type'])['price'].describe().round(1)

**A bird's-eye view of the differences between the two subsets**

In [None]:
# Scatter of log price against distance with one linear fit per accommodation type.
# hue_order pins the hue levels to Hotel, Apartment so that the legend entries line up with
# the label colours listed in plt.legend below, whichever type happens to come first in the data.
sns.lmplot(
    data = df_hotels_apartments, 
    x = 'distance', y = 'lnprice', hue = 'accommodation_type', 
    hue_order = ['Hotel', 'Apartment'],
    height = 5, aspect = 5/4,
    ci = None,
    scatter_kws = {'s': 6}, line_kws = {'linewidth': 2},
    palette = {'Hotel': 'k', 'Apartment': 'indianred'}, legend = False
)
# The built-in legend is switched off above; draw a frameless one with coloured labels instead.
plt.legend(labelcolor = ['k', 'indianred'], frameon = False)
plt.title('Hotel and apartment prices in Vienna, Nov, 2017');

#### Regressions

In [None]:
ls_acc_types = ['Hotel', 'Apartment']

In [None]:
ls_models = []

for acc_type in ls_acc_types:
    ls_models.append(
        smf.ols(
            formula = 'lnprice ~ lspline(distance, 2)', 
            data = df_hotels_apartments[df_hotels_apartments.accommodation_type == acc_type]
        ).fit(cov_type = 'HC0')
    )

In [None]:
# Regression table with one column per accommodation type, readable covariate labels and no model numbers.
stargazer = Stargazer(ls_models)
stargazer.rename_covariates(
    {
        'Intercept': 'Constant',
        'lspline(distance, 2)[0]': 'Distance spline <2',
        'lspline(distance, 2)[1]': 'Distance spline 2–7',
    }
)
stargazer.custom_columns(ls_acc_types, [1, 1])
stargazer.show_model_numbers(False)
stargazer

In [None]:
ls_models

In [None]:
# In-sample fitted values; ls_models follows the order of ls_acc_types, so index 0 is the hotel model.
ls_hotels_predicted = ls_models[0].predict()

In [None]:
# Index 1 is the apartment model.
ls_apartments_predicted = ls_models[1].predict()

In [None]:
fig, ax = plt.subplots()

# One entry per accommodation type:
# (type, fitted values, colour, line style, legend label, scatter marker size)
series_specs = [
    ('Hotel', ls_hotels_predicted, 'k', '-.', 'hotels', 2),
    ('Apartment', ls_apartments_predicted, 'indianred', '-', 'apartments', 3),
]

for type_name, fitted, colour, dash, legend_label, dot_size in series_specs:
    subset = df_hotels_apartments[df_hotels_apartments.accommodation_type == type_name]
    # fitted prices first, then the original prices, as line and points in the same colour
    ax.plot(subset.distance, fitted, color = colour, linestyle = dash, label = legend_label)
    ax.scatter(subset.distance, subset.lnprice, color = colour, s = dot_size)

ax.legend(labelcolor = ['k', 'indianred'])
ax.set_xlabel('distance in miles')
ax.set_ylabel('log price')
ax.set_title('Regression model fitted values vs original data');

**Question**: What does it mean that, in the case of hotels, the second spline component is insignificant? How does it show up on this plot?