#### Poisson regression

In [None]:
# Imports
import warnings
warnings.filterwarnings("ignore")

import numpy
import pandas
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_squared_error, mean_poisson_deviance

import import_ipynb
import data_analysis

In [None]:
def results_summary_reg(ytrain_true, yval_true, ytest_true, ytrain_pred, yval_pred, ytest_pred):
    """Build a table of regression metrics for the three data splits.

    Parameters
    ----------
    ytrain_true, yval_true, ytest_true
        Observed targets of the training, validation and testing sets.
    ytrain_pred, yval_pred, ytest_pred
        Model predictions matching the corresponding observed targets.

    Returns
    -------
    pandas.DataFrame
        Rows 'Training set', 'Validation set', 'Testing set'; columns
        'RMSE', 'MSE', 'MPD' (mean Poisson deviance). Every cell is the
        metric value rounded to 4 decimals.
    """
    # column label -> metric callable (insertion order fixes the column order)
    metrics = {
        'RMSE': root_mean_squared_error,
        'MSE': mean_squared_error,
        'MPD': mean_poisson_deviance,
    }
    # row label -> (observed, predicted) pair (insertion order fixes the row order)
    splits = {
        'Training set': (ytrain_true, ytrain_pred),
        'Validation set': (yval_true, yval_pred),
        'Testing set': (ytest_true, ytest_pred),
    }

    table = pandas.DataFrame(index=list(splits), columns=list(metrics))

    # fill the table row by row, metric by metric
    for split_label, (observed, predicted) in splits.items():
        for metric_label, metric_fn in metrics.items():
            table.loc[split_label, metric_label] = round(metric_fn(observed, predicted), 4)

    return table

In [None]:
def C_validation_reg(X_train, X_val, y_train, y_val):
    """Pick the penalty strength with the lowest validation Poisson deviance.

    A fixed grid of nine values (1e-4 ... 1e4), each divided by the number
    of training rows, is tried. Every candidate is passed as ``alpha`` to a
    ``PoissonRegressor`` fitted on the training data and scored on the
    validation data with ``mean_poisson_deviance``.

    Parameters
    ----------
    X_train, X_val
        Feature matrices; ``X_train`` must expose a two-element ``shape``.
    y_train, y_val
        Targets matching ``X_train`` and ``X_val``.

    Returns
    -------
    The grid value (already divided by the number of training rows) whose
    model reached the smallest validation deviance; the first one on ties.
    """
    n_samples, _ = X_train.shape
    base_grid = numpy.array([0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000])
    candidates = base_grid / n_samples

    def validation_deviance(alpha):
        # fit on the training split, score on the validation split
        model = PoissonRegressor(alpha=alpha)
        model.fit(X_train, y_train)
        return mean_poisson_deviance(y_val, model.predict(X_val))

    deviances = [validation_deviance(alpha) for alpha in candidates]
    best_position = numpy.argmin(deviances)
    return candidates[best_position]

In [None]:
def Poisson_regression(data, target_name, train_size=0.70, C_param=100, C_validation=False, times=1, check_multicollinearity=True, return_flag='simple'):
    """Fit a Poisson regressor on repeated random splits and average the metrics.

    For each of ``times`` repetitions the data is split into training,
    validation and testing sets (the part left over after the training set
    is halved into validation and testing), pre-processed through
    ``data_analysis.data_processing``, fitted with ``PoissonRegressor`` and
    scored with ``results_summary_reg``. The per-split metric tables are
    averaged.

    Parameters
    ----------
    data
        Dataset handed to ``data_analysis.uncouping_x_y_reg``.
    target_name
        Name of the target, forwarded to ``data_analysis.uncouping_x_y_reg``.
    train_size : default 0.70
        Forwarded to ``train_test_split`` for the first split. The printed
        percentages assume it is a fraction.
    C_param : default 100
        Value passed as ``alpha`` to ``PoissonRegressor`` when
        ``C_validation`` is false.
    C_validation : bool, default False
        When true, ``alpha`` is chosen per split by ``C_validation_reg``
        and ``C_param`` is ignored.
    times : int, default 1
        Number of random splits (random states 0 .. times-1). Must be >= 1.
    check_multicollinearity : default True
        Forwarded to ``data_analysis.data_processing``.
    return_flag : default 'simple'
        'simple' returns only the averaged summary; any other value also
        returns model parameters and data from the run.

    Returns
    -------
    pandas.DataFrame
        When ``return_flag == 'simple'``: the metric table averaged over
        all splits, as float64 rounded to 4 decimals.
    tuple
        Otherwise ``(w, b, summary, X_train_enc, X_test_enc, y_train,
        y_test, y_train_preds, y_test_preds)``, where ``w`` and ``b`` are
        the coefficients and intercept averaged over the splits and the
        remaining data items come from the LAST split only.

    Raises
    ------
    ValueError
        If ``times`` is smaller than 1.
    """
    # Fail fast: with no repetition nothing is accumulated and the
    # averaging step below would hit unbound names.
    if times < 1:
        raise ValueError(f"times must be at least 1, got {times!r}")

    want_details = return_flag != 'simple'

    # uncoupling X and y
    X, y = data_analysis.uncouping_x_y_reg(data, target_name)

    summary_total = None
    w_total = None
    b_total = None

    random_states = numpy.arange(times)
    for i, random_state in enumerate(random_states):
        # split the dataset X into the training set X_train and temporary set X_temp
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=train_size, random_state=random_state)
        # split the temporary set evenly into the validation set and testing set
        # (fixed seed: only the first split varies between repetitions)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, train_size=0.5, random_state=0)
        # NOTE(review): only the X frames are re-indexed here; the y objects keep
        # their original index - confirm data_processing does not align on index.
        X_train, X_val, X_test = data_analysis.reset_index_data(data_1=X_train, data_2=X_val, data_3=X_test, data_4=None)

        # data encoding (OneHotEncoder encoding for category variables) and Standardscaler scaling
        X_train_enc, X_val_enc, X_test_enc = data_analysis.data_processing(xtrain=X_train.copy(), ytrain=y_train.copy(), xtest_1=X_val.copy(), xtest_2=X_test.copy(), xtest_3=None, check_multicollinearity=check_multicollinearity)

        if i == 0:
            print('*********************************************** The Poisson Regression ***********************************************')
            print(f'Training_set = {round((train_size * 100))}%, Validation_set = {round(((1 - train_size)/2) * 100)}%, Test_set = {round(((1 - train_size)/2) * 100)}%, C_validation = {C_validation}, times = {times}')

        # fitting model; a local name keeps the caller-supplied C_param untouched
        if C_validation:
            alpha = C_validation_reg(X_train_enc.copy(), X_val_enc.copy(), y_train.copy(), y_val.copy())
        else:
            alpha = C_param
        regressor = PoissonRegressor(alpha=alpha)
        regressor.fit(X_train_enc, y_train)

        # predictions
        y_train_preds = regressor.predict(X_train_enc)
        y_val_preds = regressor.predict(X_val_enc)
        y_test_preds = regressor.predict(X_test_enc)

        # get summary
        summary_random = results_summary_reg(y_train, y_val, y_test, y_train_preds, y_val_preds, y_test_preds)
        w = regressor.coef_
        b = regressor.intercept_

        if i == 0:
            summary_total = summary_random
            if want_details:
                # copy so later accumulation never mutates the estimator's coef_ in place
                w_total = numpy.array(w, copy=True)
                b_total = b
        else:
            summary_total = summary_total + summary_random
            if want_details:
                # NOTE(review): assumes every split yields the same encoded
                # feature count, otherwise this addition fails - confirm.
                w_total = w_total + w
                b_total = b_total + b

    # end of for loop: average the accumulated metrics over all splits
    summary = (summary_total / times).astype('float64')
    summary = summary.round(4)

    if not want_details:
        return summary

    w = w_total / times
    b = b_total / times
    return w, b, summary, X_train_enc, X_test_enc, y_train.copy(), y_test.copy(), y_train_preds.copy(), y_test_preds.copy()
        