# Running Linear Regression Experiment with Budget Deficit Data

In [100]:
import pandas as pd
from normalize import zScoreNorm
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


# Column labels for the training data: year (x) and deficit (y).
training_x_header, training_y_header = 'Train Year', 'Train Deficit ($ millions)'

# Column labels for the test data, mirroring the training pair.
test_x_header, test_y_header = 'Test Year', 'Test Deficit ($ millions)'


In [101]:
#LOAD TRAINING AND TEST DATA
# Both files are read as space-separated text without a header row; the
# columns are labelled with the header constants defined earlier.
all_train = pd.read_csv(
    "deficit_train.dat",
    sep=" ",
    header=None,
    names=[training_x_header, training_y_header],
)
all_test = pd.read_csv(
    "deficit_test.dat",
    sep=" ",
    header=None,
    names=[test_x_header, test_y_header],
)

In [102]:
#SPLIT EACH TABLE INTO ITS ATTRIBUTE (x) AND LABEL (y) COLUMNS
x_train, y_train = all_train[training_x_header], all_train[training_y_header]
x_test, y_test = all_test[test_x_header], all_test[test_y_header]

In [103]:
#EXPERIMENT CONSTANTS
# Fixed settings first, derived quantities afterwards.
num_folds = 6   # number of cross-validation folds
degrees = 12    # highest polynomial degree considered
Lambda = 0      # presumably the regularisation strength -- TODO confirm where it is used

num_train_instances = len(x_train)
num_test_instances = len(x_test)
chunk = num_train_instances // num_folds   # rows per fold (floor division)
n = degrees + 1                            # count of degrees 0..degrees inclusive

In [None]:
#PERFORMANCE METRICS
# One row per cross-validation fold and one column per polynomial degree.
# The experiment evaluates degrees 0 through `degrees` inclusive, so the
# tables need `degrees + 1` columns; `range(degrees)` would leave out the
# highest degree.
fold_labels = [f'fold {i}' for i in range(num_folds)]
degree_labels = [f'degree {i}' for i in range(degrees + 1)]

# RMSE of every (fold, degree) pair, initialised to 0.0.
RMSE_per_fold_deg = pd.DataFrame(0.0, index=fold_labels, columns=degree_labels)
# Per-degree RMSE averaged over the folds (single 'Mean' row).
RMSE_per_fold_deg_mean = pd.DataFrame(0.0, index=['Mean'], columns=degree_labels)

# Placeholders for the results of model selection and final evaluation.
RMSE_min = 0
opt_deg = 0
RMSE_test = 0

In [124]:
#CREATE FOLDS FOR CROSS VALIDATION
# Each fold holds out one contiguous run of `chunk` rows and trains on the
# remaining rows. Fold 0 holds out the first `chunk` rows, fold 1 the next
# run, and so on. If the number of training rows is not a multiple of
# `num_folds`, the trailing remainder rows are always part of the training
# set and are never held out.
for fold in range(num_folds):
    holdout_start = fold * chunk
    holdout_stop = holdout_start + chunk

    # Training rows = everything before the holdout run plus everything after
    # it. For fold 0 the "before" slice is simply empty, so no special case is
    # needed. reset_index(drop=True) renumbers each partial series from 0.
    x_train_folds = pd.concat(
        [x_train.iloc[:holdout_start], x_train.iloc[holdout_stop:]]
    ).reset_index(drop=True)
    y_train_folds = pd.concat(
        [y_train.iloc[:holdout_start], y_train.iloc[holdout_stop:]]
    ).reset_index(drop=True)
    x_holdout = x_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)
    y_holdout = y_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)

    #NORMALIZATION
    # Statistics come from the training folds only, and the same statistics
    # are then applied to the holdout rows.
    x_train_folds_mean = x_train_folds.mean()
    x_train_folds_std = x_train_folds.std()
    y_train_folds_mean = y_train_folds.mean()
    y_train_folds_std = y_train_folds.std()
    norm_stats = (
        x_train_folds_mean,
        x_train_folds_std,
        y_train_folds_mean,
        y_train_folds_std,
    )

    # Apply z-score normalization element-wise. The trailing flag is the
    # isOutput argument of zScoreNorm: True for label (y) values, False for
    # attribute (x) values. zScoreNorm lives in the project's normalize
    # module -- confirm its exact contract there.
    x_train_folds = x_train_folds.apply(zScoreNorm, args=norm_stats + (False,))
    y_train_folds = y_train_folds.apply(zScoreNorm, args=norm_stats + (True,))
    x_holdout = x_holdout.apply(zScoreNorm, args=norm_stats + (False,))
    y_holdout = y_holdout.apply(zScoreNorm, args=norm_stats + (True,))

    #LEARN POLYNOMIALS FROM DEG 0 TO `degrees`
    # The upper bound is derived from `degrees` rather than hard-coded.
    # NOTE(review): degree 0 with include_bias=False produces a feature matrix
    # with zero columns; newer scikit-learn releases appear to reject this
    # combination with a ValueError -- confirm against the installed version
    # before fitting a model on it.
    for d in range(degrees + 1):
        poly = PolynomialFeatures(degree=d, include_bias=False)
        poly_features = poly.fit_transform(x_train_folds.array.reshape(-1, 1))
        




(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)

(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)

(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)

(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)

(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)

(35, 0)

(35, 1)

(35, 2)

(35, 3)

(35, 4)

(35, 5)

(35, 6)

(35, 7)

(35, 8)

(35, 9)

(35, 10)

(35, 11)

(35, 12)