# Running Linear Regression Experiment with Budget Deficit Data

In [1]:
import pandas as pd
from normalize import zScoreNorm
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


# Column labels given to the two-column training file when it is read.
training_x_header, training_y_header = 'Train Year', 'Train Deficit ($ millions)'

# Column labels given to the two-column test file when it is read.
test_x_header, test_y_header = 'Test Year', 'Test Deficit ($ millions)'


In [2]:
#IMPORT ALL DATA
# Both files are read as space-separated text with no header row; the column
# labels are supplied here so later cells can select columns by name.
all_train = pd.read_csv(
    "deficit_train.dat",
    sep=" ",
    header=None,
    names=[training_x_header, training_y_header],
)
all_test = pd.read_csv(
    "deficit_test.dat",
    sep=" ",
    header=None,
    names=[test_x_header, test_y_header],
)

In [3]:
#SEPARATE ATTRIBUTES FROM LABEL
# Each frame has exactly two named columns: the input (x) and the target (y).
x_train, y_train = all_train[training_x_header], all_train[training_y_header]
x_test, y_test = all_test[test_x_header], all_test[test_y_header]

In [4]:
#NECESSARY CONSTANTS
# Cross-validation layout: the training set is cut into `num_folds` holdout
# slices of `chunk` instances each (floor division, so any remainder is not
# part of a holdout slice).
num_folds = 6
num_train_instances = len(x_train)
num_test_instances = len(x_test)
chunk = num_train_instances // num_folds

# Highest polynomial degree considered, and that value plus one.
degrees = 12
n = degrees + 1  # NOTE(review): not used in the visible cells

# NOTE(review): presumably a regularization strength; not used in the visible cells.
Lambda = 0

In [5]:
#PERFORMANCE METRICS
# One zero-initialised RMSE slot per (fold, degree) pair.
# NOTE(review): the column labels run 'degree 0' .. 'degree {degrees - 1}',
# whereas the cross-validation loop fits degrees 1 .. 12 -- confirm which
# labelling is intended before filling this table.
RMSE_per_fold_deg = pd.DataFrame(
    0.0,
    index=[f'fold {i}' for i in range(num_folds)],
    columns=[f'degree {i}' for i in range(degrees)],
)

# Single-row table with the same degree columns, for the per-degree mean.
RMSE_per_fold_deg_mean = pd.DataFrame(
    0.0,
    index=['Mean'],
    columns=RMSE_per_fold_deg.columns,
)

# Scalar placeholders, all starting at zero.
RMSE_min = opt_deg = RMSE_test = 0

In [9]:
#CREATE FOLDS FOR CROSS VALIDATION
# For every fold, one contiguous slice of `chunk` instances is held out and
# all remaining instances form the training folds. Because `chunk` is a floor
# division, instances past num_folds * chunk are never held out; they always
# stay in the training folds.
for fold in range(num_folds):
    # Positional bounds of this fold's holdout slice.
    holdout_start = fold * chunk
    holdout_stop = holdout_start + chunk

    # Training folds = everything before the holdout slice + everything after it.
    # For fold 0 the leading piece is simply empty, so no special case is needed.
    # reset_index restores a regular 0..k-1 index on the partial series.
    x_train_folds = pd.concat([x_train.iloc[:holdout_start], x_train.iloc[holdout_stop:]]).reset_index(drop=True)
    y_train_folds = pd.concat([y_train.iloc[:holdout_start], y_train.iloc[holdout_stop:]]).reset_index(drop=True)
    x_holdout = x_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)
    y_holdout = y_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)

    #NORMALIZATION
    # Statistics are taken from the training folds only and reused for the
    # holdout set, so no holdout information leaks into the normalization.
    x_train_folds_mean = x_train_folds.mean()
    x_train_folds_std = x_train_folds.std()
    y_train_folds_mean = y_train_folds.mean()
    y_train_folds_std = y_train_folds.std()
    norm_stats = (x_train_folds_mean, x_train_folds_std, y_train_folds_mean, y_train_folds_std)

    # zScoreNorm is applied element-wise; its last argument is the isOutput
    # flag (False for x values, True for y values), which selects whether the
    # input or the output training statistics are used.
    x_train_folds = x_train_folds.apply(zScoreNorm, args=(*norm_stats, False))
    y_train_folds = y_train_folds.apply(zScoreNorm, args=(*norm_stats, True))
    x_holdout = x_holdout.apply(zScoreNorm, args=(*norm_stats, False))
    y_holdout = y_holdout.apply(zScoreNorm, args=(*norm_stats, True))

    #LEARN POLYNOMIALS FROM DEG 1 to `degrees`
    # The single feature as an (n_samples, 1) NumPy column; it does not change
    # with the degree, so it is built once per fold.
    x_train_column = x_train_folds.to_numpy().reshape(-1, 1)
    for d in range(1, degrees + 1):
        # include_bias=False: no constant column is generated here.
        poly = PolynomialFeatures(degree=d, include_bias=False)
        poly_train_features = poly.fit_transform(x_train_column)
        poly_reg_model = LinearRegression().fit(poly_train_features, y_train_folds)
        if d == 6:
            # Debug output: coefficients of the degree-6 fit for this fold.
            print(poly_reg_model.coef_)
        #Model has been calculated

        # TODO: the fitted model is not yet scored on x_holdout / y_holdout,
        # so RMSE_per_fold_deg is never filled in by this cell.
        


[0.31906506]
[ 0.29983781 -0.15149593]
[ 1.74213054 -0.25806653 -0.84476627]
[ 2.06578356  2.22091603 -1.11709532 -1.05853108]
[ 0.79648199  2.42287731  1.00793008 -1.20762986 -0.69959516]
[ 0.67882253  1.37992959  1.25649585 -0.06113017 -0.80052129 -0.31123958]
[ 0.30211982  1.60218413  2.51246549 -0.3805916  -1.82869733 -0.20792288
  0.23882364]
[ 0.26553495  1.09164259  2.70464993  0.64976577 -2.03311009 -0.8704705
  0.29577401  0.13302987]
[ 0.69971102  0.80094648  0.0876321   1.48887747  1.97819083 -1.57072946
 -1.92716363  0.30470601  0.40581587]
[ 0.67858239  0.60428635  0.23820205  2.09081684  1.72636682 -2.23603428
 -1.77575425  0.60917296  0.37593594 -0.04912075]
[ 0.65180004  0.62256973  0.47959615  1.98455074  1.17177168 -2.07636504
 -1.26507279  0.52054393  0.17068899 -0.0327084   0.03004332]
[ 0.60426089  0.29178364  0.93258529  3.54586468  0.01129526 -4.83239719
 -0.06868392  2.7195079  -0.36179578 -0.84028738  0.11522629  0.11115114]
[0.45245195]
[ 0.4482708 -0.0296023]