# Running Linear Regression Experiment with Budget Deficit Data

In [2]:
import pandas as pd
from normalize import zScoreNorm
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt


# Column labels given to the two fields of each data file when it is loaded:
# the first field is the year (attribute), the second the deficit (label).
training_x_header, training_y_header = 'Train Year', 'Train Deficit ($ millions)'
test_x_header, test_y_header = 'Test Year', 'Test Deficit ($ millions)'


In [3]:
#IMPORT ALL DATA
# Both files are read as single-space-separated text with no header row; the
# columns are labelled with the header constants defined in the first cell.
all_train = pd.read_csv(
    "deficit_train.dat",
    names=[training_x_header, training_y_header],
    header=None,
    sep=" ",
)
all_test = pd.read_csv(
    "deficit_test.dat",
    names=[test_x_header, test_y_header],
    header=None,
    sep=" ",
)

In [4]:
#SEPARATE ATTRIBUTES FROM LABEL
# For each frame, pull out the attribute column (x) and the label column (y)
# as individual Series.
x_train, y_train = all_train[training_x_header], all_train[training_y_header]
x_test, y_test = all_test[test_x_header], all_test[test_y_header]

In [5]:
#NECESSARY CONSTANTS
# Dataset sizes (number of rows in the training / test attribute Series).
num_train_instances, num_test_instances = x_train.size, x_test.size

# Cross-validation layout: the training rows are cut into num_folds windows of
# `chunk` rows each (floor division, so a remainder is possible).
num_folds = 6
chunk = num_train_instances // num_folds

# NOTE(review): Lambda and n are not used in the cells visible here; Lambda is
# presumably a regularization strength -- confirm against later cells.
Lambda = 0
degrees = 12          # highest polynomial degree considered
n = 1 + degrees

In [6]:
#PERFORMANCE METRICS
# Row / column labels for the RMSE tables, built once and shared by both frames.
# NOTE(review): the columns are labelled 'degree 0' .. 'degree {degrees - 1}',
# while the cross-validation cell fits degrees 1 through 12 -- confirm how the
# columns are meant to map onto the fitted degrees.
_fold_labels = [f'fold {i}' for i in range(num_folds)]
_degree_labels = [f'degree {i}' for i in range(degrees)]

# One RMSE slot per (fold, degree), plus a single-row table for the per-degree mean.
RMSE_per_fold_deg = pd.DataFrame(0.0, index=_fold_labels, columns=_degree_labels)
RMSE_per_fold_deg_mean = pd.DataFrame(0.0, index=['Mean'], columns=_degree_labels)

# Scalar placeholders, all initialised to zero.
RMSE_min = opt_deg = RMSE_test = 0

In [24]:
#CREATE FOLDS FOR CROSS VALIDATION
# Fold k holds out training rows [k*chunk, (k+1)*chunk) and trains on every
# other row. Each fold is z-score normalized with statistics from its own
# training part, then one polynomial model per degree is fitted.
# NOTE(review): chunk comes from floor division, so rows at positions
# >= num_folds*chunk (if any) are always trained on and never held out.
for fold in range(num_folds):
    holdout_start = fold * chunk
    holdout_stop = holdout_start + chunk

    # Training part = rows before the holdout window + rows after it. For
    # fold 0 the leading slice is simply empty, so no special case is needed.
    # .iloc makes the positional slicing explicit; reset_index renumbers the
    # partial series from 0.
    x_train_folds = pd.concat(
        [x_train.iloc[:holdout_start], x_train.iloc[holdout_stop:]]
    ).reset_index(drop=True)
    y_train_folds = pd.concat(
        [y_train.iloc[:holdout_start], y_train.iloc[holdout_stop:]]
    ).reset_index(drop=True)
    x_holdout = x_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)
    y_holdout = y_train.iloc[holdout_start:holdout_stop].reset_index(drop=True)

    #NORMALIZATION
    # Statistics come from the training part of the fold only, and are reused
    # for the holdout rows so no holdout information enters the scaling.
    x_train_folds_mean = x_train_folds.mean()
    x_train_folds_std = x_train_folds.std()
    y_train_folds_mean = y_train_folds.mean()
    y_train_folds_std = y_train_folds.std()
    norm_stats = (x_train_folds_mean, x_train_folds_std, y_train_folds_mean, y_train_folds_std)

    # Apply z-score normalization element-wise. zScoreNorm comes from the
    # project-local `normalize` module; its last argument (isOutput) selects
    # the output (y) training mean/std instead of the input (x) ones.
    x_train_folds = x_train_folds.apply(zScoreNorm, args=norm_stats + (False,))
    y_train_folds = y_train_folds.apply(zScoreNorm, args=norm_stats + (True,))
    x_holdout = x_holdout.apply(zScoreNorm, args=norm_stats + (False,))
    y_holdout = y_holdout.apply(zScoreNorm, args=norm_stats + (True,))

    #LEARN POLYNOMIALS FROM DEG 1 TO `degrees`
    # include_bias=False: the constant term is left to LinearRegression's intercept.
    for d in range(1, degrees + 1):
        poly = PolynomialFeatures(degree=d, include_bias=False)
        # PolynomialFeatures needs a 2-D (n_samples, 1) input; it does not use y.
        poly_train_features = poly.fit_transform(x_train_folds.to_numpy().reshape(-1, 1))
        poly_reg_model = LinearRegression().fit(poly_train_features, y_train_folds)
        #Models have been confirmed with Matlab coefficients
        # Bare expressions, kept on purpose: they are only echoed as cell output
        # when the kernel displays every expression (the recorded output suggests
        # IPython's ast_node_interactivity is set to "all" -- confirm).
        poly_reg_model.coef_
        poly_reg_model.intercept_

        #Model has been calculated
        # TODO: the holdout fold is not evaluated in this cell yet; the
        # prediction step was left disabled:
        #y_pred = poly_reg_model.predict(poly_train_features)
        


array([0.31906506])

1.7036651400450347e-15

array([ 0.29983781, -0.15149593])

0.14716747178133513

array([ 1.74213054, -0.25806653, -0.84476627])

0.14654215442347115

array([ 2.06578356,  2.22091603, -1.11709532, -1.05853108])

-0.5231005727892333

array([ 0.79648199,  2.42287731,  1.00793008, -1.20762986, -0.69959516])

-0.5032069267957741

array([ 0.67882253,  1.37992959,  1.25649585, -0.06113017, -0.80052129,
       -0.31123958])

-0.37033563625933624

array([ 0.30211982,  1.60218413,  2.51246549, -0.3805916 , -1.82869733,
       -0.20792288,  0.23882364])

-0.3769217070659281

array([ 0.26553495,  1.09164259,  2.70464993,  0.64976577, -2.03311009,
       -0.8704705 ,  0.29577401,  0.13302987])

-0.3386658352397529

array([ 0.69971102,  0.80094648,  0.0876321 ,  1.48887747,  1.97819083,
       -1.57072946, -1.92716363,  0.30470601,  0.40581587])

-0.3337159771930547

array([ 0.67858239,  0.60428635,  0.23820205,  2.09081684,  1.72636682,
       -2.23603428, -1.77575425,  0.60917296,  0.37593594, -0.04912075])

-0.32254033947588123

array([ 0.65180004,  0.62256973,  0.47959615,  1.98455074,  1.17177168,
       -2.07636504, -1.26507279,  0.52054393,  0.17068899, -0.0327084 ,
        0.03004332])

-0.3223347880703631

array([ 0.60426089,  0.29178364,  0.93258529,  3.54586468,  0.01129526,
       -4.83239719, -0.06868392,  2.7195079 , -0.36179578, -0.84028738,
        0.11522629,  0.11115114])

-0.30815461268985667

array([0.45245195])

9.59375949581116e-16

array([ 0.4482708, -0.0296023])

0.028756523470877697

array([ 1.74341941, -0.20492578, -0.74719658])

0.09654891305952236

array([ 2.1794156 ,  2.08187972, -1.12505915, -0.97520813])

-0.5032938948088242

array([ 0.96629324,  2.6444309 ,  0.92682706, -1.30224365, -0.68342568])

-0.5845223535600551

array([ 0.74869925,  1.53394453,  1.41869665, -0.07703864, -0.88608509,
       -0.33709676])

-0.44617705105254496

array([ 0.48709381,  1.7924953 ,  2.30038579, -0.44132863, -1.61237145,
       -0.21764452,  0.17124978])

-0.46647186492480514

array([ 0.41243829,  1.31712571,  2.64013987,  0.54392628, -1.95980516,
       -0.86535325,  0.26744031,  0.133465  ])

-0.43243471623743607

array([ 0.77975254,  0.80820722,  0.49717288,  1.87823472,  1.3011397 ,
       -1.9245325 , -1.55053388,  0.38690337,  0.33741692])

-0.40755418038375063

array([ 0.90607987,  1.49092497, -0.34269298, -0.29932973,  2.71342426,
        0.54550286, -2.41130746, -0.7711838 ,  0.51012159,  0.19198958])

-0.44244924660979895

array([ 0.58729111,  2.25654999,  2.43029874, -3.37567043, -3.69839026,
        4.58921077,  3.5963678 , -2.87137953, -1.96580275,  0.56705334,
        0.37415458])

-0.46814819109071576

array([  0.11185725,   0.84717457,   6.90711305,   3.51799389,
       -15.05866942,  -8.05403942,  15.15438893,   7.66582511,
        -7.05196787,  -3.49543247,   1.18197297,   0.58936115])

-0.4139860001667371

array([0.35026825])

8.358478432120693e-17

array([ 0.33955341, -0.10688058])

0.10382684941826757

array([ 1.81323187, -0.24035246, -0.86135141])

0.1496014534813942

array([ 2.02829586,  2.09836848, -1.07072155, -0.9869406 ])

-0.48749145206352623

array([ 0.79230097,  2.35423029,  1.02935525, -1.14780527, -0.69206207])

-0.5211716423752419

array([ 0.72816629,  1.65575202,  1.17271348, -0.39747114, -0.75124638,
       -0.20016416])

-0.4276226262687793

array([ 0.28059408,  1.89439448,  2.62545633, -0.71908876, -1.90002991,
       -0.09895454,  0.25841277])

-0.44833600031035686

array([ 0.19824941,  1.11210322,  2.97764057,  0.84211576, -2.24176577,
       -1.08195555,  0.34799541,  0.19260023])

-0.38762363572405073

array([ 0.71069851,  0.89570054,  0.22974502,  1.48361064,  1.66485807,
       -1.61758056, -1.68753208,  0.32175345,  0.35192498])

-0.38061140386736286

array([ 0.72349817,  0.97190688,  0.15704503,  1.23911189,  1.77631745,
       -1.3452199 , -1.75097733,  0.19856178,  0.36390576,  0.01944058])

-0.3844634248381013

array([ 0.04632899,  1.32034503,  5.40073787, -0.73556061, -9.62500011,
        1.5484547 ,  8.38104178, -1.36949557, -3.58246499,  0.30267591,
        0.55980088])

-0.3794187015983432

array([ -0.065792  ,   0.75386565,   6.29292363,   2.14777714,
       -11.80457776,  -3.66415788,  10.57849727,   2.80839364,
        -4.54402367,  -1.22262985,   0.711021  ,   0.20722334])

-0.36157846931149246

array([0.15984636])

5.565508693145478e-16

array([ 0.17952242, -0.27992768])

0.2719297433028868

array([ 1.81664384, -0.0789109 , -0.96516727])

0.14255940759457758

array([ 1.86540648,  2.6707117 , -0.8644777 , -1.21314564])

-0.5191746636455701

array([ 0.53149391,  2.51660024,  1.53625236, -1.10895023, -0.8319379 ])

-0.4305660076214953

array([ 0.59841365,  1.7385304 ,  1.39057443, -0.23394818, -0.77148173,
       -0.24469256])

-0.3395797526172398

array([-0.10361096,  1.56154109,  3.93887594,  0.0858703 , -2.92025936,
       -0.36420568,  0.51148472])

-0.308254988166198

array([-3.14054371e-03,  4.06470682e-01,  3.61701930e+00,  2.57100804e+00,
       -2.58843282e+00, -2.02971215e+00,  4.15828372e-01,  3.45968974e-01])

-0.22901902651406958

array([ 0.42392245,  0.94048897,  0.83083505,  1.30254054,  1.87830769,
       -1.06183631, -2.15915812,  0.11927142,  0.48842284])

-0.2652624106635453

array([ 0.53555281, -0.68084971,  0.16305588,  6.56593778,  3.15033848,
       -7.05738889, -3.01838254,  2.92563026,  0.67360309, -0.46240311])

-0.1817998262999292

array([ 0.78908057, -0.3026562 , -2.44355631,  5.26822829,  9.67500259,
       -5.33245962, -9.39027058,  1.98246977,  3.35515828, -0.28453341,
       -0.40824405])

-0.19995221254902695

array([ 0.78365101, -0.20286183, -2.39374239,  4.79049146,  9.53883725,
       -4.48115718, -9.23588176,  1.29791706,  3.2797666 , -0.03128893,
       -0.39513873, -0.03511085])

-0.20395987250664582

array([0.28579213])

8.789076567569692e-16

array([ 0.28865715, -0.22824456])

0.2217232852400003

array([ 1.83157134, -0.206613  , -0.91452342])

0.21186126362675545

array([ 1.87026923,  2.1422319 , -0.92479535, -1.0107664 ])

-0.4128915053024175

array([ 0.53768766,  2.20851974,  1.32288888, -1.03985899, -0.74461686])

-0.42061524945560785

array([ 0.56654414,  1.02261378,  1.28627572,  0.25900403, -0.73504103,
       -0.35069356])

-0.2724273221065899

array([ 0.38287842,  1.07263775,  1.94319478,  0.21023186, -1.28910022,
       -0.33816035,  0.1302727 ])

-0.2799642220441096

array([ 0.40734879,  0.31917461,  1.9173188 ,  1.73923643, -1.28576294,
       -1.33325776,  0.13201412,  0.2013973 ])

-0.22111981095364738

array([ 1.01412576,  0.48235306, -1.79377676,  1.34593996,  4.35588453,
       -1.06823857, -2.97803117,  0.14834169,  0.56491842])

-0.21963360754367997

array([ 1.02076342,  0.88382697, -1.84960322,  0.07337355,  4.42767837,
        0.35382801, -3.0100963 , -0.50159096,  0.56971829,  0.10393954])

-0.24190233619959658

array([ 0.91218747,  0.83051939, -0.87702286,  0.18529931,  2.1839031 ,
        0.26326858, -0.93991017, -0.46855383, -0.2595048 ,  0.09942522,
        0.12030536])

-0.2399461102697881

array([ 0.9166634 ,  0.98124043, -0.90186004, -0.54175107,  2.23062876,
        1.53698606, -0.97911005, -1.4679323 , -0.24462537,  0.4583109 ,
        0.11821839, -0.04808007])

-0.2458669909215849

array([0.1170143])

3.0909227863437966e-16

array([ 0.13025401, -0.32163442])

0.31244486609517974

array([ 1.68347991, -0.12094921, -0.84551845])

0.15130388693327762

array([ 1.67177757,  2.09576183, -0.73412143, -0.88217756])

-0.4238870838929275

array([ 0.52619872,  2.0654884 ,  1.15571537, -0.83859859, -0.58513033])

-0.36442141271871636

array([ 0.53943646,  0.98393254,  1.14418872,  0.2784158 , -0.57348749,
       -0.27996739])

-0.2251949256826909

array([ 0.17249431,  0.92894306,  2.34775236,  0.33660149, -1.49476581,
       -0.29812305,  0.19648509])

-0.2062296814540301

array([ 0.20458487,  0.48980383,  2.2538772 ,  1.16905534, -1.41479845,
       -0.79135865,  0.17741699,  0.09030726])

-0.17205909875354952

array([ 0.78150631,  0.9301736 , -1.01531692,  0.25240565,  3.16329917,
       -0.16626655, -2.12097384, -0.03860782,  0.37848243])

-0.2139157456289304

array([ 0.78661662,  0.83946223, -1.04252407,  0.5351517 ,  3.20957664,
       -0.46824843, -2.14969464,  0.09094165,  0.38410139, -0.01923891])

-0.20912044889650344

array([ 0.45053584,  0.32886069,  1.79943871,  2.0724429 , -3.0371866 ,
       -2.1923537 ,  3.32338864,  0.89367189, -1.68734673, -0.14909967,
        0.28256992])

-0.1771371956985925

array([ 0.42880411,  0.87235334,  2.03439811, -0.47841385, -3.69841014,
        2.07289797,  4.03618899, -2.27595456, -2.00810702,  0.92461949,
        0.33301806, -0.13527677])

-0.19756961300413772