# Exploring Data using GLMs

In [1]:
import pandas as pd

import plotly.express as px
PLOTLY_THEME = 'plotly_white'

import statsmodels.api as sm

### Helper functions

In [10]:
def prepare_glm_data(data:pd.DataFrame) -> pd.DataFrame:
    _data = data.copy()
    bool_cols = _data.select_dtypes(include='bool').columns
    _data[bool_cols] = _data[bool_cols].astype(int)
    _data = sm.add_constant(_data)
    return _data

def evaluate_model(model:sm.GLM, target_values):
    print(f'Fitted GLM explains {model.pseudo_rsquared():.2%} of total variance.')
    print(f'Deviance: {model.deviance:,.2f}')
    fig = px.scatter(model.fittedvalues - target_values, labels={'index':'index', 'value':'residual'}, title=f'Residuals Plot', opacity=.5, template=PLOTLY_THEME)
    fig.update_layout(showlegend=False)
    fig.show()

### Data Load

In [3]:
DATA_PATH = r'C:/Users/Nick/Documents/Data Science/Personal projects/car-price-prediction/data/clean/'
FILE_NAME = 'data_clean_20240509.csv'

data = pd.read_csv(DATA_PATH+FILE_NAME, sep=';')
# print(f'{data.shape[0]} rows, {data.shape[1]} attributes')

### High Level Visualizations

In [5]:
fig = px.histogram(data, 'Price', histnorm='probability density', opacity=0.8, template=PLOTLY_THEME)
fig.show()

In [175]:
for column in data.columns[1:]:
    if data[column].dtype in ['int64', 'float64']:
        fig = px.scatter(data, column, 'Price', opacity=0.5, template=PLOTLY_THEME, title=f'{column} vs Price')
        fig.show()
    else:
        fig = px.box(data, column, 'Price', template=PLOTLY_THEME, title=f'{column} vs Price')
        fig.show()

### Prepare data

In [4]:
X = data.drop('Price', axis=1)
y = data['Price']

X_dummies = pd.get_dummies(X)

### Fit the model

In [5]:
sm.families.family.Gamma.links

[statsmodels.genmod.families.links.Log,
 statsmodels.genmod.families.links.Identity,
 statsmodels.genmod.families.links.InversePower]

In [6]:
glm = sm.GLM(y, prepare_glm_data(X_dummies), family=sm.families.Gamma()).fit()



In [11]:
evaluate_model(glm, y)

Fitted GLM explains 31.28% of total variance.
Deviance: 8,371.07


### Remove independent variables with p-value greater than 5%

In [12]:
threshold = .05
cols_to_drop = glm.pvalues[glm.pvalues >= threshold].index.values
X_dummies_dropped = X_dummies.drop(cols_to_drop, axis=1)

### Re-fit the model

In [13]:
glm_dropped = sm.GLM(y, prepare_glm_data(X_dummies_dropped), family=sm.families.Gamma(sm.families.links.InversePower())).fit()


The InversePower link function does not respect the domain of the Gamma family.



### Compare

In [14]:
evaluate_model(glm, y)
evaluate_model(glm_dropped, y)

Fitted GLM explains 31.28% of total variance.
Deviance: 8,371.07


Fitted GLM explains 32.85% of total variance.
Deviance: 8,286.24
