In [263]:
""" As usual, we start by importing the relevant packages"""

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler # scales variables to be mean=0,sd=1
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LassoLarsCV
from sklearn.pipeline import Pipeline 
from sklearn.metrics import f1_score, accuracy_score # Are not actively used, but could be a valuable extension
import pandas as pd
import numpy as np

In [376]:
""" Since, regularized regression, as a training method, is able to detect 'useless' variables there is little need for a 
    rigorous pre-selection of variables. This function creates interaction terms of every variable. Further, it creates 
    second-order polynomials for each variable."""

def CombineAttributes(data, var_list):
    for i in var_list:
        for j in var_list:
            if i == j:
                name = str(i)+ '_square'
                data[name] = data.loc[:, i] * data.loc[:, i]

            else:
                name =  str(i)+ '_' +str(j)
                data[name] = data.loc[:, i] * data.loc[:, j]

    return data

In [380]:
""" Import the relevant data and clean it"""

dataframe_1 = pd.read_excel(r'C:\Users\mariu\Desktop\Project\Historical Data\Data_mai.xlsx')
dataframe = dataframe_1.dropna().reset_index()

In [382]:
# Declare X and Y variable

X_name = ['place_list', 'incumbent', 'woman', 'doctor', 'year', 'federal_election', 'artistocracy', 'google', 'google_zero',
         'google_b1000', 'google_b100', 'google_million', 'population', 'share_students', 'unemployment', 'share_old','CDU', 'SPD', 
         'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth', 'share_migrants', 'share_pupils', 'FW', 'local_list'
         ]

X = dataframe[['place_list', 'incumbent', 'woman', 'doctor', 'year', 'federal_election', 'artistocracy', 'google', 'google_zero', 
               'google_b1000', 'google_b100', 'google_million', 'population', 'share_students', 'unemployment', 'share_old','CDU', 
               'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth', 'share_migrants', 'share_pupils', 'FW', 'local_list'
              ]]
y = dataframe['votes']



# Make use of the interaction function from above
CombineAttributes(X, X_name)



# Split dataset, not necessary when predicting with the 2019 data -> train on historical data test on contemporaneous

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [319]:
" Here, a simple Lasso regression. Using default value 1."

lasso_reg = Lasso()
lasso_reg.fit(X_train, y_train)
lasso_pred = lasso_reg.predict(X)
comparison_lasso = pd.DataFrame({'Actual': y, 'Predicted': lasso_pred})
comparison_lasso.head(25), lasso_reg.coef_, lasso_reg.score(X_test, y_test) # Score is the R_sqrd 



(    Actual     Predicted
 0     3223   6666.117111
 1     1822   1173.859576
 2     1383   4325.283645
 3      950   2195.536819
 4     3304    746.358312
 5     3940   2979.885279
 6     2865   4624.280492
 7      937    103.606557
 8     1106   1291.268410
 9     2605   3014.800495
 10   19524  16212.859423
 11    7440   4862.435291
 12   38146  26617.361598
 13    3442   2686.072907
 14   41900  24032.520061
 15   10742  14327.022289
 16    1818   1799.039872
 17   22724  25565.015607
 18    6504   6233.586953
 19   15619  13978.311911
 20    6798   5715.078601
 21    1897   2349.664891
 22    4857   6333.165479
 23    1394   2716.036356
 24    4956   5358.145238,
 array([ 1.67183525e+01, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.86133121e+02, -2.82252107e+01,  0.00000000e+00, -3.20790662e-04,
         0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.01023828e-02, -0.00000000e+00, -0.00000000e+00,  1.87334311e+02,
         0.00000000

In [359]:
""" However, it makes more sense to use cross-validation, since the parameter alpha should be selected 
    by the data. Further, normalize the data to restrict the influence of outliers."""

res_lasso_cv = LassoCV(cv=10, n_alphas=10, normalize = True).fit(X, y)
res_lasso_cv.score(X,y), res_lasso_cv.coef_, res_lasso_cv.alpha_
lasso_comp_pred = res_lasso_cv.predict(X_2019)
dataframe_2019['Votes_predicted_computer_lasso'] = lasso_comp_pred # Add the totally computerized forecast to the dataset
pred_coef_comp = res_lasso_cv.coef_
X2 = X.columns.values
list_coefficients = pd.DataFrame({'Variable': X2, 'Coefficient': pred_coef_comp}) # Create a dataset with the estimated coefficients
list_coefficients.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients.xlsx')



In [267]:
# Not really an improvement

""" Using LARS algorithm while cross validate the "alpha" parameter. (In the original paper it's actually called lambda) 
    Not sure if that's really helpful -> check that again"""

res_lasso_lars_cv = LassoLarsCV(cv=10, max_n_alphas=1000).fit(X_train, y_train)
res_lasso_lars_cv.score(X,y), res_lasso_lars_cv.coef_, res_lasso_lars_cv.alpha_



(0.8420038310366738,
 array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         2.61802003e-01,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -1.84661363e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00, -4.34963490e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -1.31546032e+02, -2.14880611e+02,  0.00000000e+00,  0.00000

In [323]:
""" Now, I'm trying to manualize the process. First, a grid of possible alpha(lambda) values is constructed. Standardize and regress 
    in a pipeline. Basically, this means you always standardize automatically before regressing when calling the pipeline. 
    Next, apply grid search to tune the hyperparameter alpha. Fit trainings data"""

param_grid = {'estimator__alpha': np.logspace(.001, 3, num=20, endpoint=False)}

lasso_pipe = Pipeline([('scale', StandardScaler()), ('estimator', Lasso())])

lin_cv = GridSearchCV(estimator=lasso_pipe,
                      param_grid = param_grid,
                      n_jobs=-1,
                      verbose=2,
                      cv=10)
lin_cv.fit(X_train, y_train)
final_prediction = lin_cv.best_estimator_.predict(X_test)
comparison_prediction = pd.DataFrame({'Actual': y_test, 'Prediction': final_prediction})
lin_cv.best_params_, lin_cv.best_score_, comparison_prediction.head(35)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   43.9s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  5.8min finished
  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


({'estimator__alpha': 63.153874537438504},
 0.7629899526844288,
       Actual    Prediction
 5162     957   1776.311131
 1396    1744   3247.999899
 7352    1152    431.303231
 1140   25158  17823.554192
 2299     639   1405.778216
 2755    2254   1671.826822
 5507    1452   1576.333755
 384     4119   2416.135917
 5439    4841   4124.611075
 1841    3797   1468.528702
 5705    4364   2526.601034
 4148    1488  -1593.540934
 3755   20091  20217.296976
 6946    1377   2031.063490
 3880    6361   7891.312754
 474     2396   1517.840525
 1818    1258   1646.511947
 5617    6732   6563.504691
 2327    2051   5343.491053
 6205    7085  12553.029336
 3808   12387  17058.663158
 5071     237    560.164591
 444     7678   1348.811910
 5368    1241   1623.801744
 2403    6656  11992.172352
 2554    1365   3525.695513
 3113   17544  16533.857953
 7424     851  -3089.165220
 5900   15574  11871.280047
 4253    1472   3892.319393
 7577    2612  17684.184817
 6838    2269   2121.256352
 3144   3586

In [351]:
" Doing what Pipeline does by hand to retrieve the exact coefficients"

X_train_scaled = StandardScaler().fit_transform(X_train)
lasso_regress = Lasso(alpha = 63.153874537438504) # from the Grid search before
lasso_regress.fit(X_train, y_train)
pred_coef = lasso_regress.coef_
X2 = X.columns.values
list_coefficients = pd.DataFrame({'Variable': X2, 'Coefficient': pred_coef})
list_coefficients.head(60)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


(                       Variable  Coefficient
 0                    place_list    -0.000000
 1                     incumbent     0.000000
 2                         woman    -0.000000
 3                        doctor     0.000000
 4                          year    -0.000000
 5              federal_election    -0.000000
 6                  artistocracy     0.000000
 7                        google    -0.000337
 8                   google_zero     0.000000
 9                  google_b1000    -0.000000
 10                  google_b100     0.000000
 11               google_million     0.000000
 12                   population    -0.037683
 13               share_students    -0.000000
 14                 unemployment     0.000000
 15                    share_old     0.000000
 16                          CDU    -0.000000
 17                          SPD     0.000000
 18                        Linke     0.000000
 19                          FDP     0.000000
 20                        Grüne  

In [392]:
""" We're now turning to make the 2019 (out-of-sample) prediction. We train on the entire historical dataset and predict on
    the new data. Old data: still X and y"""


dataframe_2019 = pd.read_excel(r'C:\Users\mariu\Desktop\Project\2019_Data\Data_BW_2019.xlsx')
dataframe_2019 = dataframe_2019
X_2019 = dataframe_2019[['place_list', 'incumbent', 'woman', 'doctor', 'year', 'federal_election', 'artistocracy', 'google', 'google_zero', 
               'google_b1000', 'google_b100', 'google_million', 'population', 'share_students', 'unemployment', 'share_old','CDU', 
               'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth', 'share_migrants', 'share_pupils', 'FW', 'local_list'
              ]]

X_2019_2 = X_2019.dropna()

CombineAttributes(X_2019_2, X_name)

#lin_cv.fit(X, y) # Use all historic data

final_prediction = lin_cv.best_estimator_.predict(X_2019_2) # predict with new set of variables
lin_cv.best_params_, final_prediction

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
  Xt = transform.transform(Xt)


({'estimator__alpha': 708.027294346627},
 array([6711.89434254, 6664.10284244, 6616.31134234, ...,  941.48416104,
         896.94224829,  852.40033555]))

In [411]:
""" Lastly, we collect the estimations and generate the data set"""


X_2019_2['votes'] = final_prediction
#X_2019_2.to_excel(r'C:\Users\mariu\Desktop\Project\Lasso_predictions_whole_data.xlsx')
data_w_pred = pd.merge(dataframe_2019, X_2019_2, left_index=True, right_index=True)
final_data = pd.DataFrame({'pred_votes': data_w_pred['votes'], 'Name': data_w_pred['Name_total'],
                          'city': data_w_pred['city'], 'party': data_w_pred['party'], 'place_list': dataframe_2019['place_list']})
final_data.to_excel(r'C:\Users\mariu\Desktop\Project\Lasso_prediction.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [413]:
data_w_pred.to_excel(r'C:\Users\mariu\Desktop\Project\Lasso_prediction_full_set.xlsx')