In [1]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, LassoCV
import pandas as pd
import numpy as np
import regression_utils as mlu



## Simplifying the model
From the EDA phase we concluded that both `region` and `children` are hardly relevant predictors. Therefore we migh just drop it from our data

In [2]:
csvFilePath = './datasets_13720_18513_insurance.csv'
with open (csvFilePath, 'rb') as file:
    data = pd.read_csv(file, encoding = 'UTF-8',
                                    thousands = ',',
                                    decimal = '.',
                                    dtype = {
                                            'sex':'category',
                                            'smoker':'category',
                                            'region':'category',
                                            'children':'category',
                                            }
                                )

cols_to_drop = ['region', 'children']
data = data.drop(cols_to_drop, axis=1)

In [3]:
data.head()

Unnamed: 0,age,sex,bmi,smoker,charges
0,19,female,27.9,yes,16884.924
1,18,male,33.77,no,1725.5523
2,28,male,33.0,no,4449.462
3,33,male,22.705,no,21984.47061
4,32,male,28.88,no,3866.8552


## identify labels (aka. y, response_variable)


In [4]:
y_name = 'charges'
y = np.array(data[y_name])
data.drop(y_name, axis='columns', inplace=True)

## codify categorical features


In [5]:
dummies_names = mlu.gen_dummy_col_names(data)
dummies_inner_combinations = mlu.gen_dummy_cols_inner_combinations(dummies_names)
#dummies_names and dummies_inner_combinations will be used later on

X_cat = mlu.slice_categorical_features(data, y_name)
data.drop(X_cat.columns, axis='columns', inplace=True)
X_cat = pd.get_dummies(X_cat)
X_cat

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes
0,1,0,0,1
1,0,1,1,0
2,0,1,1,0
3,0,1,1,0
4,0,1,1,0
...,...,...,...,...
1333,0,1,1,0
1334,1,0,1,0
1335,1,0,1,0
1336,1,0,1,0


## standarize numerical features

In [6]:
X_num = mlu.get_numerical_features(data, y_name)
X_num_names = X_num['names']
data.drop(X_num_names, axis='columns', inplace=True)
X_num = X_num['array']
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

In [7]:
X_num, X_num_names

(array([[-1.43876426, -0.45332   ],
        [-1.50996545,  0.5096211 ],
        [-0.79795355,  0.38330685],
        ...,
        [-1.50996545,  1.0148781 ],
        [-1.29636188, -0.79781341],
        [ 1.55168573, -0.26138796]]),
 Index(['age', 'bmi'], dtype='object'))

In [8]:
data

0
1
2
3
4
...
1333
1334
1335
1336
1337


**Hicé hasta aquí**, queda pendiente revisar el resto del código para que quede más limpio

In [10]:
X = pd.merge(X_num, X_cat, left_index=True, right_index=True)

TypeError: Can only merge Series or DataFrame objects, a <class 'numpy.ndarray'> was passed

## Interactions 
In the EDA phase, several interactions seemed to be relevant, we will for starters see how the model behave if we include up to 2nd grade interactions

In [None]:
interaction_degree = 4
interaction = PolynomialFeatures(degree=interaction_degree, include_bias=False, interaction_only=False)
X_interaction = interaction.fit_transform(X)
X_interaction_names = interaction.get_feature_names(X_names)

In [None]:
features = ru.clean_interaction_features(X_interaction, X_interaction_names, categorical_names, dummies_inner_combinations, interaction_degree)

In [None]:
X_interaction = features['set']
X_interaction_names = features['names']

## Split into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(X_interaction, y, test_size=0.3)

tt_sets = {'train_features':train_features, 
           'test_features':test_features,
           'train_labels':train_labels,
           'test_labels':test_labels}

for t_set in tt_sets.items():
    print (f'{t_set[0]} shape: {t_set[1].shape}')

In [None]:
regularization = LassoCV()
regularization.fit(train_features, train_labels)
print(f'Best alpha using built-in LassoCV: {regularization.alpha_}')
print(f'Best score using built-in LassoCV: {regularization.score(train_features, train_labels)}')
coeficients = pd.Series(regularization.coef_, index=X_interaction_names)
print(f'Lasso picked {sum(coeficients != 0)} terms and eliminated the remaining {sum(coeficients==0)} terms')

In [None]:
coefficients = coeficients[coeficients!=0]
relevant_features = coefficients.index
relevant_features

## Dropping irrelevant interactions



In [None]:
train_features = ru.drop_features(train_features,
                                  X_interaction_names,
                                  relevant_features,
                                  selection_objective='keep')

test_features = ru.drop_features(train_features,
                                  X_interaction_names,
                                  relevant_features,
                                  selection_objective='keep')

train_features = pd.DataFrame(train_features['set'], columns=train_features['names'])
test_features = pd.DataFrame(test_features['set'], columns=test_features['names'])


In [None]:
train_features = remove_irrevelant_features(train_features, X_interaction_names, coefficients.index)
train_features_names = train_features['names']
train_features = train_features['set']

X_interaction_names = train_features_names

test_features = remove_irrevelant_features(test_features, X_interaction_names, coefficients.index)
test_features_names = test_features['names']
test_features = test_features['set']

### one function for both sets, would delete 3 lines of this block!!!

## train the model

In [None]:
model = LinearRegression(fit_intercept = True)
model.fit(train_features, train_labels)
train_predictions = model.predict(train_features)

In [None]:
beta_values = pd.DataFrame(model.coef_,
                          X_interaction_names,
                          columns=['coefficient'])

beta_values

## assesing performance - cross validation

In [None]:
folds = 30
mse_crossfold = []
mse_train_values = []
mse_test_values = []
for f in range (0, folds):
    from sklearn.model_selection import train_test_split
    train_features, test_features, train_labels, test_labels = train_test_split(X_interaction, y, test_size=0.3)
    train_predictions = model.predict(train_features)
    train_errors = train_predictions - train_labels
    mse_train = (train_errors**2).mean()
    mse_train_values.append(mse_train)
    mse_train = {'type':'train','mse':mse_train}
    mse_crossfold.append(mse_train)
    test_predictions = model.predict(test_features)
    test_errors = test_predictions - test_labels
    mse_test = (test_errors**2).mean()
    mse_test_values.append(mse_test)
    mse_test = {'type':'test','mse':mse_test}
    mse_crossfold.append(mse_test)
    
mse_crossfold = pd.DataFrame(mse_crossfold)

pfig0 = px.box(mse_crossfold, x='type', y='mse',
                title='Crossfold validation, Mean Squared Error (MSE)',
                color_discrete_sequence = px.colors.qualitative.D3
            )
pfig0.show()

print(f'Train MSE mean fold values: {np.array(mse_train_values).mean()}')
print(f'Test MSE mean fold values: {np.array(mse_test_values).mean()}')

Both train and test MSE fold values show little skewness and quite similar medians. As expected, the test MSE fold values show a larger dispersion do to the fact that these sets have less values and therefore larger magnitude square errorrs are harder to compensate when calculating the mean.

From the above statement we could say that the crossfold validation was successful, but if we really want to be obnoxious about it, since we would be dealing with the mean of means and we have a large enough (n=100) set of observations, we could appeal to the central limit theorem (the mean of means follow a normal distribution) and perform a two sided hypothesis testing to show that the means of the MSE crossfold values aren't statistically different.

In [None]:

displot = plt.figure(figsize=(12.8,8.16))
title = f'distribution of MSE: crossfold validation'
fig = sns.displot(data=mse_crossfold, x='mse', hue='type', kind='kde', palette='muted')
fig.set(title=title)
file_name = title + ' ' + datetime.now().isoformat()[:19]
fig.savefig(file_name, bbox_inches='tight')
plt.figure()



### Aspin-Welch Unequal-Variance T-Test
[reference](https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Two-Sample_T-Test_from_Means_and_SDs.pdf)

- null: mean of the MSE of train and test are the same
- alternate : mean of the MSE of train and test are different

In [None]:
aspin_welch_result = stats.ttest_ind(mse_train_values, mse_test_values, axis=0, equal_var=False, nan_policy='omit')

In [None]:
aspin_welch_result

With a significance level of alpha=0.05, since this is a 2 sided test, the observed pvalue should be either lower than 0.025 or higher than 0.975 to reject the null hypothesis, with an observed pvalue of 0.5935 there is no staistical evidence to reject it. In other words, there is no evidence to make us think that the means of MSE fold values of the trainning and testing sets are different.

# Inferential statistics assumptions
1. **Linearity**: It is assumed that the relationship between each predictor variable and the criterion variable is linear. 
    If this assumption is not met, then the predictions may systematically overestimate the actual values for one range of values on a predictor variable and underestimate them for another (bias).
    
    While working with high-dimensional data, it may not be practical to plot every dimension vs the prediction. An alternative is to use a prediction error plot, as it lets visualize how well the model does compared to the truth.


2. **Residuals are normaly distributed**. The residuals (aka. erros) are the difference between predictions and the real values of the labels found in the data set.



3. **Homoscedasticity**: Variances of the residuals are the same for all predicted values.

Even though moderate violations of Assumptions 1 to 3 do not present a serious threat for the significance of predictor variables, even small transgressions to them could compromise the validity on certain predictions.

In [None]:
X_interaction = pd.DataFrame(X_interaction, columns=X_interaction_names)
y = pd.DataFrame(data[y_name])
data = pd.merge(y, X_interaction, left_index=True, right_index=True)
data


In [None]:
def standarize_arr(_array):
    _arr_mean = _array.mean()
    _arr_stdev = _array.std()
    _normalized_arr = (_array - _arr_mean)/_arr_stdev
    return _normalized_arr

def create_error_analysis_df(_data, _y_name, _train_labels, _train_predictions, _test_labels, _test_predictions):
    _error_analysis_df_train = pd.DataFrame()
    _error_analysis_df_train[_y_name] = _train_labels
    _error_analysis_df_train['prediction'] = _train_predictions
    _error_analysis_df_train['split'] = 'train'
    
    _error_analysis_df_test = pd.DataFrame()
    _error_analysis_df_test[_y_name] = _test_labels
    _error_analysis_df_test['prediction'] = _test_predictions
    _error_analysis_df_test['split'] = 'test'
    
    _error_analysis_df = pd.concat([_error_analysis_df_train, _error_analysis_df_test], ignore_index=True)
    _error_analysis_df['residual'] = _error_analysis_df['prediction'] -  _error_analysis_df[y_name]
    _error_analysis_df['standarized_residual'] = standarize_arr(_error_analysis_df['residual'])
    
    _error_analysis_df['residual_theoretical_normal_P'] = stats.norm.cdf(_error_analysis_df['standarized_residual'])
    _error_analysis_df['residual_observed_P'] = _error_analysis_df['residual'].rank(pct = True) 
          
    return(_error_analysis_df)


In [None]:
error_data = create_error_analysis_df(data, y_name, train_labels, train_predictions, test_labels, 
test_predictions)

In [None]:
InteractiveShell.ast_node_interactivity = 'last'

In [None]:
fig1 = px.scatter(error_data,
                 x = y_name,
                 y = 'prediction',
                 marginal_x = 'histogram',
                 marginal_y = 'histogram',
                 color = 'split',
                 title = 'Linearity: Prediction error plot',
                 color_discrete_sequence = px.colors.qualitative.D3
               )

fig1.update_traces(histnorm='probability', selector={'type':'histogram'})

fig1.add_shape(type = 'line',
              line = {'dash' : 'dash'},
              x0 = y.min(), y0=y.min(),
              x1 = y.max(), y1=y.max()
              )

fig1.update_layout(xaxis = {'scaleanchor':'y', 'scaleratio':1, 'ticks':'outside'},
                   yaxis = {'ticks':'outside'},
                   autosize = False,
                   width = 500,
                   height = 500,
                   dragmode = False
                  )
fig1.show()

In [None]:
fig2 = px.scatter(error_data,
                 x = error_data['residual_theoretical_normal_P'],
                 y = error_data['residual_observed_P']     ,
                 color_discrete_sequence = px.colors.qualitative.D3          
               )

fig2.add_shape(type = 'line',
              line = {'dash' : 'dash'},
              x0 = 0, y0 = 0,
              x1 = 1, y1 = 1
              )

fig2.update_layout(xaxis = {'scaleanchor':'y', 'scaleratio':1, 'ticks':'outside'},
                   yaxis = {'ticks':'outside'},
                   autosize = False,
                   width = 500,
                   height = 500,
                   dragmode = False
                  )
fig2.show()

In [None]:
fig3 = px.scatter(error_data,
                 x = 'prediction',
                 y = 'standarized_residual',
                 color = 'split',
                 title = 'Residuals Homoscedasticity',
                 color_discrete_sequence = px.colors.qualitative.D3
               )

fig3.show()

In [None]:
lis