In [31]:
from sklearn.preprocessing import StandardScaler # scales variables to be mean=0, sd=1
from sklearn.pipeline import Pipeline # in case multiple things are performed at the same time
from sklearn.linear_model import LinearRegression # Baseline
from sklearn.ensemble import GradientBoostingRegressor # Model 3
from sklearn.ensemble import RandomForestRegressor # Model 2
from sklearn.linear_model import LassoCV # Model 1
from sklearn.linear_model import Lasso # To save time and not do the Cross-Validation every time
from sklearn.svm import LinearSVR # Model 4
from sklearn.model_selection import GridSearchCV # Hypertuning of parameters
from sklearn.ensemble import VotingRegressor # Ensemble estimator
import pandas as pd
import numpy as np
import time

In [4]:
""" Since, regularized regression, as a training method, is able to detect 'useless' variables there is little need for a 
    rigorous pre-selection of variables. This function creates interaction terms of every variable. Further, it creates 
    second-order polynomials for each variable."""

def CombineAttributes(data, var_list):
    for i in var_list:
        for j in var_list:
            if i == j:
                name = str(i)+ '_square'
                data[name] = data.loc[:, i] * data.loc[:, i]

            else:
                name =  str(i)+ '_' +str(j)
                data[name] = data.loc[:, i] * data.loc[:, j]

    return data

In [5]:

dataframe = pd.read_excel(r'C:\Users\mariu\Desktop\Project\All_Data_BW_1.xlsx')

X_name = ['place_list', 'incumbent', 'woman', 'doctor', 'time', 'federal_election', 'aristocracy', 'google_stan', 'population',
          'share_students', 'unemployment', 'share_old','CDU', 'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth',
          'share_migrants', 'share_pupils', 'FW', 'local_list', 'federal_difference', 'youth_list', 'green_alt_list', 
          'muslim_migrant', 'non_muslim_migrant', 'double_name', 'first_time'
         ]

# Generate training (pre-2019) and test (2019) datasets
dataframe_test = dataframe[dataframe['year'] == 2019].reset_index().drop(['index'], axis=1)
dataframe_train = dataframe[dataframe['year'] != 2019].dropna(subset = ['incumbent']).reset_index().drop(['index'], axis=1)

X_train = dataframe_train[['place_list', 'incumbent', 'woman', 'doctor', 'time', 'federal_election', 'aristocracy', 'google_stan', 'population',
              'share_students', 'unemployment', 'share_old','CDU', 'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth',
              'share_migrants', 'share_pupils', 'FW', 'local_list', 'federal_difference', 'youth_list', 'green_alt_list', 
              'muslim_migrant', 'non_muslim_migrant', 'double_name', 'first_time'
              ]]

y_train = dataframe_train['votes']

X_test = dataframe_test[['place_list', 'incumbent', 'woman', 'doctor', 'time', 'federal_election', 'aristocracy', 'google_stan', 'population',
              'share_students', 'unemployment', 'share_old','CDU', 'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth',
              'share_migrants', 'share_pupils', 'FW', 'local_list', 'federal_difference', 'youth_list', 'green_alt_list', 
              'muslim_migrant', 'non_muslim_migrant', 'double_name', 'first_time'
              ]]

y_test = dataframe_test['votes']

rand_for_train = dataframe_train[['place_list', 'incumbent', 'woman', 'doctor', 'time', 'federal_election', 'aristocracy', 'google_stan', 'population',
              'share_students', 'unemployment', 'share_old','CDU', 'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth',
              'share_migrants', 'share_pupils', 'FW', 'local_list', 'federal_difference', 'youth_list', 'green_alt_list', 
              'muslim_migrant', 'non_muslim_migrant', 'double_name', 'first_time'
              ]]

rand_for_test = dataframe_test[['place_list', 'incumbent', 'woman', 'doctor', 'time', 'federal_election', 'aristocracy', 'google_stan', 'population',
              'share_students', 'unemployment', 'share_old','CDU', 'SPD', 'Linke', 'FDP', 'Grüne', 'AfD', 'share_youth',
              'share_migrants', 'share_pupils', 'FW', 'local_list', 'federal_difference', 'youth_list', 'green_alt_list', 
              'muslim_migrant', 'non_muslim_migrant', 'double_name', 'first_time'
              ]]

In [192]:

CombineAttributes(X_train, X_name)
CombineAttributes(X_test, X_name)

X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit(X_train).transform(X_test)

X2 = X_train.columns.values
X2_rand_for = rand_for_test.columns.values

rand_for_train_scaled = StandardScaler().fit_transform(rand_for_train)
rand_for_test_scaled = StandardScaler().fit(rand_for_train).transform(rand_for_test)

X_train

Unnamed: 0,place_list,incumbent,woman,doctor,time,federal_election,aristocracy,google_stan,population,share_students,...,first_time_share_pupils,first_time_FW,first_time_local_list,first_time_federal_difference,first_time_youth_list,first_time_green_alt_list,first_time_muslim_migrant,first_time_non_muslim_migrant,first_time_double_name,first_time_square
0,24,0.0,0,1,2,5.2,0,0.028109,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
1,33,0.0,1,0,1,5.2,0,-0.119646,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
2,38,0.0,0,0,2,5.2,0,-0.119578,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
3,47,0.0,1,0,2,5.2,0,-0.119346,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
4,18,0.0,0,0,2,5.2,0,-0.119562,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
5,11,0.0,0,0,2,5.2,0,-0.119536,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
6,39,0.0,1,0,2,5.2,0,-0.081079,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
7,2,0.0,1,0,2,5.2,0,-0.119229,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
8,31,0.0,0,0,1,5.2,0,-0.119609,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1
9,23,0.0,1,0,2,5.2,0,-0.119335,154715,0.238516,...,0.095886,0,0,0.2,0,0,0,0,0,1


In [9]:

t1 = time.time()
res_lasso_cv = LassoCV(cv=20, n_alphas=50, n_jobs = -1)
res_lasso_cv.fit(X_train_scaled, y_train)
t_lasso = time.time() - t1

# Display results
#res_lasso_cv_alphas = -np.log10(res_lasso_cv.alphas_ + EPSILON)

res_lasso_cv.score(X_train_scaled, y_train), res_lasso_cv.coef_, res_lasso_cv.alpha_

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


(0.7322741586238914,
 array([-3.25390411e+02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        -0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00,  0.00000000e+00,  1.68807480e+02,
         0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
         0.00000000e+00, -0.00000000e+00,  5.82701926e+02, -0.00000000e+00,
         0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00,  0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -8.09906549e+02,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -0.00000000e+00, -1.41205598e+02, -0.00000000e+00,  0.00000

In [10]:

lasso = Lasso(alpha = 302.1091452688529)

lasso_comp_pred = res_lasso_cv.predict(X_test_scaled)
pred_coef_comp = res_lasso_cv.coef_
X2 = X_train.columns.values

coefficients_lasso = pd.DataFrame({'Variable': X2, 'Coefficient': pred_coef_comp}) # Create a dataset with the estimated coefficients
coefficients_lasso.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients_Lasso.xlsx')

prediction_lasso = pd.DataFrame({'Prediction': lasso_comp_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})
#prediction_lasso.to_excel(r'C:\Users\mariu\Desktop\Project\Prediction_Lasso.xlsx')

prediction_lasso.head(50), coefficients_lasso.head(70)

(      Prediction  Votes        city party                    name
 0    6253.035992   8163    Freiburg   AfD          Bernhard Lukau
 1    6344.340350   7711    Freiburg   AfD        Michael Theuring
 2    6070.427278   6504    Freiburg   AfD            Elmar Ertmer
 3    6161.731635   7703    Freiburg   AfD          Jens Bellemann
 4    5522.601135   8149    Freiburg   AfD          Martin Polheim
 5    5157.383706   4260    Freiburg   AfD         Robert Hagerman
 6    4792.166277   4721    Freiburg   AfD           Michael Braun
 7    5887.818564   6985    Freiburg   AfD             Bernd Fulde
 8    6618.253421   8931    Freiburg   AfD            Karl Schwarz
 9    5796.514206   6176    Freiburg   AfD             Jonas Möhle
 10   5066.079349   5674    Freiburg   AfD   Alessandro Bertonasco
 11   5979.122921   8008    Freiburg   AfD           Bernd Domnick
 12   6435.644707   8263    Freiburg   AfD            Jack Gelfort
 13   4974.774992   5585    Freiburg   AfD        Michael Pfei

In [11]:
param_grid_1 = {
    'bootstrap': [True],
    'max_depth': [90],
    'min_samples_leaf': [3],
    'min_samples_split': [10],
    'n_estimators': [86],
    'max_features':['auto']
}
rf = RandomForestRegressor()
random_forest = GridSearchCV(estimator = rf, param_grid = param_grid_1, 
                          cv = 5, verbose = 4, n_jobs = 1)
t2 = time.time()
random_forest.fit(rand_for_train_scaled, y_train)
t_random_forest = time.time() - t2


random_forest.score(rand_for_train_scaled,y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86, score=0.699, total=   1.4s
[CV] bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86, score=0.380, total=   1.4s
[CV] bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.6s remaining:    0.0s


[CV]  bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86, score=0.329, total=   1.4s
[CV] bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.0s remaining:    0.0s


[CV]  bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86, score=0.742, total=   1.4s
[CV] bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86 
[CV]  bootstrap=True, max_depth=90, max_features=auto, min_samples_leaf=3, min_samples_split=10, n_estimators=86, score=0.173, total=   1.5s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.0s finished


0.9479491792761721

In [195]:

rf_1 = RandomForestRegressor(bootstrap = True, max_depth = 90, min_samples_leaf = 3, min_samples_split = 10, n_estimators = 86)
rf_1.fit(rand_for_train_scaled, y_train)

random_forest_pred = rf_1.predict(rand_for_test_scaled)

feature_importances_coef_comp = rf_1.feature_importances_

list_coefficients = pd.DataFrame({'Variable': X2_rand_for, 'Coefficient': feature_importances_coef_comp}) # Create a dataset with the estimated coefficients
list_coefficients.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients_Random_Forest.xlsx')

prediction_random_forest = pd.DataFrame({'Prediction': 
random_forest_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})
prediction_random_forest.to_excel(r'C:\Users\mariu\Desktop\Project\Prediction_Random_Forest.xlsx')

prediction_random_forest.head(50), random_forest.best_estimator_, rf_1.feature_importances_, rf_1.n_features_

(     Prediction  Votes        city party                    name
 0   5855.545558   8163    Freiburg   AfD          Bernhard Lukau
 1   5947.302264   7711    Freiburg   AfD        Michael Theuring
 2   5342.404372   6504    Freiburg   AfD            Elmar Ertmer
 3   5705.850133   7703    Freiburg   AfD          Jens Bellemann
 4   4673.110880   8149    Freiburg   AfD          Martin Polheim
 5   3633.733045   4260    Freiburg   AfD         Robert Hagerman
 6   3045.624296   4721    Freiburg   AfD           Michael Braun
 7   5103.121878   6985    Freiburg   AfD             Bernd Fulde
 8   7321.717519   8931    Freiburg   AfD            Karl Schwarz
 9   5095.306755   6176    Freiburg   AfD             Jonas Möhle
 10  3728.866625   5674    Freiburg   AfD   Alessandro Bertonasco
 11  5165.838577   8008    Freiburg   AfD           Bernd Domnick
 12  6405.900374   8263    Freiburg   AfD            Jack Gelfort
 13  3076.932713   5585    Freiburg   AfD        Michael Pfeiffer
 14  5456.

In [14]:

param_test1 = {'n_estimators':range(20,200,20)}
param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,100)}
param_test3 = {'min_samples_leaf': [60] }

gradient_boosting_cv = GridSearchCV(estimator = GradientBoostingRegressor(random_state=1, min_samples_leaf = 60,
                                              n_estimators=180, loss = 'ls', learning_rate = 0.1, max_features = 'sqrt',
                                              criterion = 'mse', verbose = 10, max_depth = 5, min_samples_split = 300)
                                    , param_grid = param_test3, cv=5)

t3 = time.time()
gradient_boosting_cv.fit(X_train_scaled, y_train)
t_gradient_boosting = time.time() - t3

gradient_boosting_cv.best_params_, gradient_boosting_cv.best_score_
#gradient_boosting_cv.score(X_train,y_train), gradient_boosting_cv.feature_importances_, gradient_boosting_cv.train_score_

      Iter       Train Loss   Remaining Time 
         1    52203002.5074            2.50s
         2    45876040.6743            2.57s
         3    40987946.8284            2.59s
         4    36168260.4335            2.68s
         5    33260917.5246            2.69s
         6    30361353.0735            2.72s
         7    27694921.9411            2.76s
         8    25808744.5165            2.74s
         9    24075968.1492            2.73s
        10    22768566.3191            2.73s
        11    21330927.9576            2.73s
        12    20254714.8180            2.72s
        13    19268205.7238            2.73s
        14    18371385.9078            2.73s
        15    17704376.8239            2.73s
        16    16924361.9389            2.73s
        17    16302479.1410            2.71s
        18    15774979.1828            2.67s
        19    15225100.2784            2.66s
        20    14701721.3556            2.65s
        21    14311163.8910            2.64s
        2

        13    25412796.9571            2.66s
        14    23794952.2251            2.68s
        15    22236654.5174            2.67s
        16    21267888.9743            2.66s
        17    20405796.1375            2.63s
        18    19366856.6966            2.61s
        19    18701471.6120            2.60s
        20    17712437.4648            2.59s
        21    16795250.9038            2.59s
        22    16175459.3983            2.59s
        23    15625566.7720            2.58s
        24    15294393.0925            2.57s
        25    14790702.9998            2.55s
        26    14539936.2245            2.55s
        27    14155331.9506            2.53s
        28    13733854.0498            2.51s
        29    13297288.5241            2.49s
        30    13042676.2431            2.49s
        31    12901214.0108            2.47s
        32    12778350.5331            2.46s
        33    12630056.3402            2.47s
        34    12367887.5637            2.47s
        35

        24    10476988.4210            2.65s
        25    10107526.2183            2.63s
        26     9805632.3039            2.62s
        27     9428851.1394            2.61s
        28     9202538.0112            2.58s
        29     8940359.3969            2.56s
        30     8496323.0517            2.54s
        31     8349893.3379            2.52s
        32     8255261.0943            2.50s
        33     7929464.7363            2.47s
        34     7756653.6688            2.48s
        35     7554936.9234            2.46s
        36     7460201.0152            2.45s
        37     7332226.5719            2.44s
        38     7272339.0940            2.42s
        39     7074215.1960            2.39s
        40     6983943.6664            2.38s
        41     6828996.6028            2.36s
        42     6653919.4490            2.34s
        43     6591087.3079            2.33s
        44     6548946.9206            2.31s
        45     6316729.3446            2.29s
        46

        37    11778843.5790            2.39s
        38    11595011.7069            2.38s
        39    11371686.0465            2.36s
        40    11291120.7181            2.34s
        41    11027236.5119            2.33s
        42    10915718.5722            2.32s
        43    10846724.6171            2.31s
        44    10705225.1070            2.30s
        45    10563392.7037            2.29s
        46    10414461.3246            2.29s
        47    10358604.8765            2.28s
        48    10305499.4521            2.27s
        49    10192698.5594            2.26s
        50    10038471.0510            2.26s
        51     9988483.9010            2.24s
        52     9888364.2624            2.23s
        53     9813948.5915            2.22s
        54     9752253.6502            2.20s
        55     9632364.5956            2.19s
        56     9593616.4968            2.17s
        57     9525209.8675            2.16s
        58     9486836.3226            2.14s
        59

        49     9055324.0728            2.24s
        50     8963495.0164            2.22s
        51     8899312.6909            2.21s
        52     8849933.9329            2.19s
        53     8679634.1267            2.18s
        54     8630632.3426            2.17s
        55     8534994.3786            2.16s
        56     8479250.8423            2.15s
        57     8451737.5771            2.14s
        58     8401078.9212            2.13s
        59     8307730.3606            2.12s
        60     8264445.9414            2.11s
        61     8235455.4542            2.10s
        62     8198529.9692            2.08s
        63     8164581.2288            2.07s
        64     8135162.9013            2.05s
        65     8087378.2519            2.04s
        66     8054237.1336            2.02s
        67     8027825.5213            2.00s
        68     7952992.6025            1.98s
        69     7913855.7852            1.97s
        70     7885812.0567            1.95s
        71

        74     7416755.1568            0.80s
        75     7394423.9909            0.79s
        76     7382030.5571            0.79s
        77     7362843.6588            0.78s
        78     7344523.2951            0.77s
        79     7321372.8548            0.76s
        80     7311315.8184            0.75s
        81     7282611.8867            0.74s
        82     7251293.8379            0.73s
        83     7228323.1624            0.72s
        84     7212722.6041            0.71s
        85     7190648.9012            0.70s
        86     7176915.5604            0.70s
        87     7160185.6751            0.69s
        88     7147086.1637            0.68s
        89     7135001.0341            0.67s
        90     7125259.7850            0.66s
        91     7109356.0675            0.65s
        92     7082414.3776            0.65s
        93     7067219.9950            0.64s
        94     7057443.9488            0.63s
        95     7041562.9578            0.62s
        96

({'min_samples_leaf': 60}, 0.6367946263116123)

In [15]:

grad_boost = GradientBoostingRegressor(random_state=1, min_samples_leaf = 60,
                                              n_estimators=180, loss = 'ls', learning_rate = 0.1, max_features = 'sqrt',
                                              criterion = 'mse', verbose = 2, max_depth = 5, min_samples_split = 300)

grad_boost.fit(X_train_scaled, y_train)

gradient_boosting_pred = grad_boost.predict(X_test_scaled)
feature_importances_gradient_boost = grad_boost.feature_importances_

coef_gradient_boost = pd.DataFrame({'Variable': X2, 'Coefficient': feature_importances_gradient_boost}) # Create a dataset with the estimated coefficients
coef_gradient_boost.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients_Random_Forest.xlsx')

prediction_gradient_boost = pd.DataFrame({'Prediction': 
gradient_boosting_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})
prediction_gradient_boost.to_excel(r'C:\Users\mariu\Desktop\Project\Prediction_Gradient_Boost.xlsx')

prediction_gradient_boost.head(50)

      Iter       Train Loss   Remaining Time 
         1    70733532.1339            1.43s
         2    63217727.7487            1.42s
         3    55927581.3378            1.41s
         4    49079678.9323            1.40s
         5    44041070.1302            1.40s
         6    40362723.2276            1.39s
         7    36665760.8005            1.40s
         8    33989072.7657            1.37s
         9    31554538.2614            1.36s
        10    28656724.7455            1.37s
        11    26018417.6513            1.35s
        12    24481485.6989            1.33s
        13    23077512.5397            1.31s
        14    21767079.7378            1.29s
        15    19958684.4925            1.28s
        16    19142870.2268            1.28s
        17    18302303.6075            1.25s
        18    17226331.3017            1.24s
        19    16318322.5363            1.23s
        20    15554170.3911            1.24s
        21    14667665.6650            1.23s
        2

Unnamed: 0,Prediction,Votes,city,party,name
0,5358.045529,8163,Freiburg,AfD,Bernhard Lukau
1,6410.46506,7711,Freiburg,AfD,Michael Theuring
2,5226.157179,6504,Freiburg,AfD,Elmar Ertmer
3,5354.590421,7703,Freiburg,AfD,Jens Bellemann
4,4608.469749,8149,Freiburg,AfD,Martin Polheim
5,4170.567824,4260,Freiburg,AfD,Robert Hagerman
6,3137.534641,4721,Freiburg,AfD,Michael Braun
7,4862.036928,6985,Freiburg,AfD,Bernd Fulde
8,6973.738649,8931,Freiburg,AfD,Karl Schwarz
9,4744.433272,6176,Freiburg,AfD,Jonas Möhle


In [40]:
Cs = [1, 10, 100, 1000, 2000, 3000, 4000]
param_grid_svr = {'C': Cs}

SVR = LinearSVR

svr_cv = GridSearchCV(estimator = LinearSVR, param_grid = param_grid_svr, cv=5, n_jobs = -1, verbose = 10)
svr_cv.fit(X_train_scaled, y_train)
svr_cv.best_params_


TypeError: Cannot clone object '<class 'sklearn.svm.classes.LinearSVR'>' (type <class 'abc.ABCMeta'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' methods.

In [42]:
svr = LinearSVR(C = 4150)

svr.fit(X_train_scaled, y_train)
svr_pred = svr.predict(X_test_scaled)

svr_pred_1 = pd.DataFrame({'Prediction': 
svr_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})

svr.score(X_train_scaled, y_train), svr_pred_1.head(50)



(0.786516906590885,
       Prediction  Votes        city party                    name
 0   35857.326257   8163    Freiburg   AfD          Bernhard Lukau
 1   36070.631003   7711    Freiburg   AfD        Michael Theuring
 2   35477.644329   6504    Freiburg   AfD            Elmar Ertmer
 3   35670.631190   7703    Freiburg   AfD          Jens Bellemann
 4   34499.557690   8149    Freiburg   AfD          Martin Polheim
 5   34033.282551   4260    Freiburg   AfD         Robert Hagerman
 6   34477.430950   4721    Freiburg   AfD           Michael Braun
 7   35112.160703   6985    Freiburg   AfD             Bernd Fulde
 8   37197.414645   8931    Freiburg   AfD            Karl Schwarz
 9   34947.407742   6176    Freiburg   AfD             Jonas Möhle
 10  31304.568099   5674    Freiburg   AfD   Alessandro Bertonasco
 11  35297.685631   8008    Freiburg   AfD           Bernd Domnick
 12  36074.070146   8263    Freiburg   AfD            Jack Gelfort
 13  34110.095704   5585    Freiburg   AfD

In [43]:
svr.score(X_test_scaled, y_test)

0.5983915741164743

In [144]:
""" Hypertuning for Ensemble regression"""

ensemble_regression = VotingRegressor([('lasso', res_lasso_cv), ('random_forest', rf_1), ('gradient_boosting', grad_boost), ('SVR', svr)])

param_ensemble = {'weights': [(1, 1, 2, 1), (1, 2, 1, 1), (2, 1, 1, 1), (1, 1, 1, 2), (2, 2, 1, 1), (2, 1, 2, 1), 
                             (2, 1, 1, 2), (1, 2, 2, 1), (1, 2, 1, 2), (1, 1, 2, 2), (2, 2, 2, 1), (2, 2, 1, 2), (1, 2, 2, 2),
                             (1, 1, 1, 1), (1, 1, 1, 0), (1, 1, 0, 1), (1, 0, 1, 1), (0, 1, 1, 1), (1, 1, 0, 0), (1, 0, 1, 0),
                             (1, 0, 0, 1), (0, 1, 1, 0), (0, 1, 0, 1), (0, 0, 1, 1), (2, 2, 1, 0), (2, 2, 0, 1), (2, 1, 0, 2),
                             (2, 0, 1, 2), (1, 0, 2, 2), (0, 1, 2, 2), (0, 2, 1, 2), (1, 2, 0, 2), (2, 1, 2, 0), (2, 0, 2, 1),
                             (1, 2, 2, 0), (0, 2, 2, 1), (2, 1, 1, 0), (2, 1, 0, 1), (2, 1, 0, 0), (2, 0, 1, 1), (2, 0, 1, 0),
                             (2, 0, 0, 1), (1, 2, 1, 0), (1, 2, 0, 1), (0, 2, 1, 0), (1, 2, 0, 0), (0, 2, 1, 1), (0, 0, 2, 1),
                             (0, 1, 2, 1), (0, 1, 2, 1), (1, 0, 2, 1), (1, 1, 2, 0), (1, 0, 2, 0), (0, 1, 2, 0), (0, 0, 1, 2),
                             (0, 1, 0, 2), (1, 0, 0, 2), (1, 1, 0, 2), (1, 0, 1, 2), (0, 1, 1, 2)]}
#ensemble_cv = GridSearchCV(estimator = ensemble_regression, param_grid = param_ensemble , cv=5, n_jobs = 4, verbose = 2)

#ensemble_cv.fit(X_train_scaled, y_train)

ensemble_cv.best_params_


{'weights': (1, 2, 2, 0)}

In [145]:
"""Forming predictions"""

final_ensemble_regression = VotingRegressor([('lasso', res_lasso_cv), ('random_forest', rf_1), ('gradient_boosting', grad_boost), ('SVR', svr)], weights = [1,2,2,0], n_jobs = 4)
final_ensemble_regression.fit(X_train_scaled, y_train)
ensemble_regression_pred = final_ensemble_regression.predict(X_test_scaled)

ensemble_regression_1 = pd.DataFrame({'Prediction': 
ensemble_regression_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})

final_ensemble_regression.score(X_train_scaled, y_train), ensemble_regression_1.head(50)

(0.9286630599047928,
       Prediction  Votes        city party                    name
 0    6530.858045   8163    Freiburg   AfD          Bernhard Lukau
 1    6816.346236   7711    Freiburg   AfD        Michael Theuring
 2    6051.535332   6504    Freiburg   AfD            Elmar Ertmer
 3    6310.016404   7703    Freiburg   AfD          Jens Bellemann
 4    5123.192437   8149    Freiburg   AfD          Martin Polheim
 5    4442.378606   4260    Freiburg   AfD         Robert Hagerman
 6    3700.756215   4721    Freiburg   AfD           Michael Braun
 7    5665.457585   6985    Freiburg   AfD             Bernd Fulde
 8    8172.968692   8931    Freiburg   AfD            Karl Schwarz
 9    5513.302510   6176    Freiburg   AfD             Jonas Möhle
 10   4332.775801   5674    Freiburg   AfD   Alessandro Bertonasco
 11   5766.065518   8008    Freiburg   AfD           Bernd Domnick
 12   7686.251733   8263    Freiburg   AfD            Jack Gelfort
 13   3968.045543   5585    Freiburg   Af

In [66]:
""" Reduced Ensemble regression"""

ensemble_regression = VotingRegressor([('lasso', res_lasso_cv), ('random_forest', rf_1), ('gradient_boosting', grad_boost), ('SVR', svr)])
red_param_ensemble = {'weights': [(1, 1, 1, 0), (1, 1, 0, 1), (1, 0, 1, 1), (0, 1, 1, 1), (1, 1, 0, 0), (1, 0, 1, 0),
                             (1, 0, 0, 1), (0, 1, 1, 0), (0, 1, 0, 1), (0, 0, 1, 1)]}

red_ensemble_cv = GridSearchCV(estimator = ensemble_regression, param_grid = red_param_ensemble , cv=5, n_jobs = 4, verbose = 2)

red_ensemble_cv.fit(X_train_scaled, y_train)

red_ensemble_cv.best_params_

{'weights': (0, 1, 1, 0)}

In [72]:
ensemble_regression = VotingRegressor([('lasso', res_lasso_cv), ('random_forest', rf_1), ('gradient_boosting', grad_boost), ('SVR', svr)])
red_param_ensemble = {'weights': [(0, 1, 1, 0), (2, 2, 2, 1)]}

red_ensemble_cv_2 = GridSearchCV(estimator = ensemble_regression, param_grid = red_param_ensemble , cv=5, n_jobs = 4, verbose = 2)

red_ensemble_cv_2.fit(X_train_scaled, y_train)

red_ensemble_cv_2.best_params_

{'weights': (0, 1, 1, 0)}

In [73]:
final_ensemble_regression_2 = VotingRegressor([('lasso', res_lasso_cv), ('random_forest', rf_1), ('gradient_boosting', grad_boost), ('SVR', svr)], weights = [0, 1, 1, 0], n_jobs = 4)
final_ensemble_regression_2.fit(X_train_scaled, y_train)
ensemble_regression_pred = final_ensemble_regression_2.predict(X_test_scaled)

ensemble_regression_2 = pd.DataFrame({'Prediction': 
ensemble_regression_pred, 'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})

final_ensemble_regression_2.score(X_train_scaled, y_train), ensemble_regression_2.head(50)

(0.9431958685239916,
       Prediction  Votes        city party                    name
 0    6534.314754   8163    Freiburg   AfD          Bernhard Lukau
 1    6942.136410   7711    Freiburg   AfD        Michael Theuring
 2    6004.694941   6504    Freiburg   AfD            Elmar Ertmer
 3    6255.335921   7703    Freiburg   AfD          Jens Bellemann
 4    5105.162494   8149    Freiburg   AfD          Martin Polheim
 5    4304.600197   4260    Freiburg   AfD         Robert Hagerman
 6    3371.145338   4721    Freiburg   AfD           Michael Braun
 7    5614.606900   6985    Freiburg   AfD             Bernd Fulde
 8    8713.135043   8931    Freiburg   AfD            Karl Schwarz
 9    5488.256771   6176    Freiburg   AfD             Jonas Möhle
 10   4150.257875   5674    Freiburg   AfD   Alessandro Bertonasco
 11   5667.116998   8008    Freiburg   AfD           Bernd Domnick
 12   8086.315549   8263    Freiburg   AfD            Jack Gelfort
 13   3727.013724   5585    Freiburg   Af

In [57]:
""" General function that returns the quality of our prediction"""

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score


def display_score(reg, mse_reg, r2):
    
    """ Insert explanation """
    
    reg_rmse = np.sqrt(-mse_reg)
    
    print('MSE                                  ')
    print('Scores:', reg_rmse, reg)
    print('Mean:', reg_rmse.mean(), reg)
    print('standard Deviation:', reg_rmse.std(), reg)
    print('R2                                   ')
    print('Scores:', r2, reg)
    print('Mean:', r2.mean(), reg)
    print('standard Deviation:', r2.std(), reg)

In [152]:
"""Cross-Validating the Score Measures of training set"""

mse_lasso_train = cross_val_score(res_lasso_cv, X_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_lasso_train = cross_val_score(res_lasso_cv, X_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Lasso", mse_lasso_train, r2_lasso_train)

MSE                                  
Scores: [ 3663.59861395 19628.61658841  2796.70483513  3069.11464393
  6065.74324494  9050.80418847  2426.0731311   3519.56668207
  6280.80525216  2661.48474369] Lasso
Mean: 5916.251192385376 Lasso
standard Deviation: 4994.404655134523 Lasso
R2                                   
Scores: [ 0.78371829 -0.75050078  0.57160317  0.49800648  0.42446468  0.51381774
  0.42001559  0.61290077  0.57940586  0.52808943] Lasso
Mean: 0.4181521230818716 Lasso
standard Deviation: 0.4018113272630606 Lasso


In [153]:
""" Cross-Validating the Score measure of test set"""

mse_lasso_test = cross_val_score(res_lasso_cv, X_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_lasso_test = cross_val_score(res_lasso_cv, X_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Lasso", mse_lasso_test, r2_lasso_test)

MSE                                  
Scores: [17356.52344979  7023.00935682  4344.72518826  8048.03579191
  7627.2860799   5382.49221688  3024.46174504  3219.12195261
  5946.92825027  3050.95263777] Lasso
Mean: 6502.353666925841 Lasso
standard Deviation: 4034.1560415395393 Lasso
R2                                   
Scores: [-7.2927101   0.82113607  0.68507558  0.04748726  0.85618446  0.8883483
  0.68901382  0.65877043  0.73271644  0.61226766] Lasso
Mean: -0.13017100977113635 Lasso
standard Deviation: 2.397972098130186 Lasso


In [154]:
""" Support Vector Regression Training Score """

mse_svr_train = cross_val_score(svr, X_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_svr_train = cross_val_score(svr, X_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("SVR", mse_svr_train, r2_svr_train)

MSE                                  
Scores: [53564.6182203  17510.12809621  8185.84358988  7545.87209823
 46053.89009838  8155.29939474  2726.54992551  3486.69907599
  6114.33225281 13170.63649877] SVR
Mean: 16651.386925081795 SVR
standard Deviation: 17160.844494895413 SVR
R2                                   
Scores: [-46.41601474  -0.3046565   -3.24490149  -2.29155243 -32.96861307
   0.62768556   0.19911129   0.69980231   0.63717313  -9.67405572] SVR
Mean: -9.273602166719616 SVR
standard Deviation: 15.786605132023318 SVR


In [155]:
""" Support Vector Regression Test Score """

mse_svr_test = cross_val_score(svr, X_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_svr_test = cross_val_score(svr, X_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("SVR", mse_svr_test, r2_svr_test)

MSE                                  
Scores: [60005.66644762  6594.34122182 20739.49075087 13087.09571946
 13688.54068833 14569.22566644  4352.71022392  7569.80308237
  6631.12045485  2995.07909131] SVR
Mean: 15023.307334698267 SVR
standard Deviation: 15866.300782367385 SVR
R2                                   
Scores: [-96.07604979   0.8237574   -6.00358986  -1.55052681   0.53048069
   0.21941247   0.40922257  -1.0677633    0.67264646   0.57088177] SVR
Mean: -10.147152838249866 SVR
standard Deviation: 28.709757102237056 SVR


In [196]:
""" Checking out the Random Forest. As usual, train data first"""

mse_rf_train = cross_val_score(rf_1, rand_for_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_rf_train = cross_val_score(rf_1, rand_for_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("RF", mse_rf_train, r2_rf_train)

MSE                                  
Scores: [ 3370.15618287  8261.69326749  4047.38385773  2665.66380868
  5771.10254905  9584.60388166  1912.01015909  2721.57484861
 10165.6246253   2609.23569527] RF
Mean: 5110.9048875737235 RF
standard Deviation: 2970.9958586040125 RF
R2                                   
Scores: [ 0.80936854  0.6731196   0.10496928  0.61563711  0.47599266  0.44001036
  0.60309173  0.76429308 -0.22533435  0.55695775] RF
Mean: 0.4818105776172928 RF
standard Deviation: 0.30096509751675077 RF


In [197]:
""" Here, we consider the test data for the Random Forest"""

mse_rf_test = cross_val_score(rf_1, rand_for_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_rf_test = cross_val_score(rf_1, rand_for_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("RF", mse_rf_test, r2_rf_test)

MSE                                  
Scores: [10213.16517969 10378.0727881   2984.09451997  7710.38086063
 12200.21689223  6239.73409206  2668.40500057  4488.51958774
  5264.20880652  3259.37226337] RF
Mean: 6540.61699908741 RF
standard Deviation: 3255.0482733559948 RF
R2                                   
Scores: [-1.29165285  0.59603696  0.83696084  0.13797658  0.61928616  0.83921672
  0.75989696  0.35707118  0.78735407  0.55820468] RF
Mean: 0.42003512961178063 RF
standard Deviation: 0.6085339091829746 RF


In [158]:
""" Gradient Boosting"""

mse_gb_train = cross_val_score(grad_boost, X_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_gb_train = cross_val_score(grad_boost, X_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Gradient Boosting", mse_gb_train, r2_gb_train)

MSE                                  
Scores: [2835.99273453 8797.56061668 2598.19260953 2476.22606351 5668.4506034
 8726.43573711 1922.99695068 2258.14473168 4991.98868083 2299.0724652 ] Gradient Boosting
Mean: 4257.506119315109 Gradient Boosting
standard Deviation: 2536.6016123291806 Gradient Boosting
R2                                   
Scores: [0.87039724 0.64835256 0.63026065 0.67322226 0.49738822 0.5480415
 0.6356106  0.84065169 0.73430722 0.64785862] Gradient Boosting
Mean: 0.6726090560486385 Gradient Boosting
standard Deviation: 0.11033956076050566 Gradient Boosting


In [159]:
""" GB: test data"""

mse_gb_test = cross_val_score(grad_boost, X_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_gb_test = cross_val_score(grad_boost, X_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Gradient Boosting", mse_gb_test, r2_gb_test)

MSE                                  
Scores: [6598.68648349 7861.73255229 3910.43733371 7051.05520693 7402.35112187
 5924.54157562 3201.2327585  3162.62998626 6450.85037935 3047.80846545] Gradient Boosting
Mean: 5461.132586346712 Gradient Boosting
standard Deviation: 1821.1653445575541 Gradient Boosting
R2                                   
Scores: [-0.19863235  0.77586338  0.74488712  0.26886222  0.86454186  0.86472797
  0.65159899  0.67064173  0.6854999   0.6130664 ] Gradient Boosting
Mean: 0.5941057227397042 Gradient Boosting
standard Deviation: 0.30890704222558873 Gradient Boosting


In [160]:
""" Ensemble Regression with Training Data"""

mse_er_train = cross_val_score(final_ensemble_regression, X_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_er_train = cross_val_score(final_ensemble_regression, X_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Ensemble Regression", mse_er_train, r2_er_train)

MSE                                  
Scores: [2645.00902907 9049.95474667 2558.1216969  2562.574041   5669.40501072
 5954.85363946 2010.87487667 2282.14642394 5502.74865191 2310.06798041] Ensemble Regression
Mean: 4054.575609673228 Ensemble Regression
standard Deviation: 2237.9364195585285 Ensemble Regression
R2                                   
Scores: [0.88766635 0.62523138 0.63210663 0.64909959 0.49756902 0.78320267
 0.61400487 0.83797359 0.67705152 0.64567969] Ensemble Regression
Mean: 0.684958532154825 Ensemble Regression
standard Deviation: 0.11115623945338213 Ensemble Regression


In [161]:
""" Ensemble Regression with Test Data"""

mse_er_test = cross_val_score(final_ensemble_regression, X_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_er_test = cross_val_score(final_ensemble_regression, X_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Ensemble Regression", mse_er_test, r2_er_test)

MSE                                  
Scores: [11773.43128437  8520.31233556  4311.41790932  7365.93560315
  8013.33141934  4833.62878778  3438.85984179  3085.17240197
  5871.76420383  3151.9337471 ] Ensemble Regression
Mean: 6036.578753420143 Ensemble Regression
standard Deviation: 2701.5866154234104 Ensemble Regression
R2                                   
Scores: [-2.81599654  0.73527128  0.70195935  0.2046823   0.83175279  0.90730482
  0.59525322  0.68032597  0.74234957  0.59570276] Ensemble Regression
Mean: 0.317860553195191 Ensemble Regression
standard Deviation: 1.0598527275862153 Ensemble Regression


In [75]:
mse_er2_train = cross_val_score(final_ensemble_regression_2, X_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_er2_train = cross_val_score(final_ensemble_regression_2, X_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("reduced Ensemble Regression", mse_er2_train, r2_er2_train)

MSE                                  
Scores: [2602.80962658 9226.51180934 2846.45399533 2531.92918332 5689.22553706
 5383.86034975 2089.52054822 2223.4765704  6275.46174942 2373.19565419] reduced Ensemble Regression
Mean: 4124.244502361255 reduced Ensemble Regression
standard Deviation: 2280.325374669567 reduced Ensemble Regression
R2                                   
Scores: [0.89068069 0.61576045 0.55987285 0.65649924 0.48972129 0.82861354
 0.56627912 0.84047779 0.59432696 0.60861269] reduced Ensemble Regression
Mean: 0.6650844626231208 reduced Ensemble Regression
standard Deviation: 0.13064635766948673 reduced Ensemble Regression


In [76]:
mse_er2_test = cross_val_score(final_ensemble_regression_2, X_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_er2_test = cross_val_score(final_ensemble_regression_2, X_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Reduced Ensemble Regression", mse_er2_test, r2_er2_test)

MSE                                  
Scores: [10682.07571352  9693.71710647  4914.2259956   7247.84658337
  8552.53374941  4990.51017039  3750.44909851  3283.60570279
  6072.11149061  3226.7805082 ] Reduced Ensemble Regression
Mean: 6241.385611888166 Reduced Ensemble Regression
standard Deviation: 2557.116679170166 Reduced Ensemble Regression
R2                                   
Scores: [-2.22905035  0.66108981  0.61101546  0.228594    0.80901921  0.90592576
  0.557606    0.63329608  0.72286228  0.56475755] Reduced Ensemble Regression
Mean: 0.34651157893061696 Reduced Ensemble Regression
standard Deviation: 0.8751731517625211 Reduced Ensemble Regression


In [83]:
""" To complete things, the linear regression with the reduced coefficients """

from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

lin_reg.fit(rand_for_train_scaled, y_train)
lin_pred = lin_reg.predict(rand_for_test_scaled)

mse_linreg_train = cross_val_score(lin_reg, rand_for_train_scaled, y_train, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_linreg_train = cross_val_score(lin_reg, rand_for_train_scaled, y_train, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Linear Regression", mse_linreg_train, r2_linreg_train)

MSE                                  
Scores: [184682.56322492  15399.38175936   4138.69680795   6666.16576312
  30327.23402986   9541.28372463   3190.64633222   4376.62534453
   7003.48086228   4605.92100565] Linear Regression
Mean: 26993.199885452275 Linear Regression
standard Deviation: 53132.29134859821 Linear Regression
R2                                   
Scores: [-5.48611495e+02 -7.74310736e-02  6.18328846e-02 -1.36823124e+00
 -1.33869766e+01  4.59695741e-01 -3.15000431e-03  4.01419383e-01
  4.77049651e-01 -4.13333697e-01] Linear Regression
Mean: -56.24606195961583 Linear Regression
standard Deviation: 164.17071635437767 Linear Regression


In [191]:
import statsmodels.api as smf
results = smf.OLS(y_train, rand_for_train_scaled).fit()
print(results.summary())

                                 OLS Regression Results                                
Dep. Variable:                  votes   R-squared (uncentered):                   0.354
Model:                            OLS   Adj. R-squared (uncentered):              0.352
Method:                 Least Squares   F-statistic:                              137.8
Date:                Thu, 18 Jul 2019   Prob (F-statistic):                        0.00
Time:                        12:56:36   Log-Likelihood:                         -79439.
No. Observations:                7562   AIC:                                  1.589e+05
Df Residuals:                    7532   BIC:                                  1.591e+05
Df Model:                          30                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [163]:
mse_linreg_test = cross_val_score(lin_reg, rand_for_test_scaled, y_test, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_linreg_test = cross_val_score(lin_reg, rand_for_test_scaled, y_test, scoring = 'r2', cv = 10, n_jobs = -1)
display_score("Linear Regression", mse_linreg_test, r2_linreg_test)

MSE                                  
Scores: [3.68000176e+14 7.92680148e+09 1.80502367e+14 1.95424353e+16
 3.65120498e+16 2.61029003e+14 3.65448952e+03 5.16701358e+03
 7.86512468e+03 6.26541240e+03] Linear Regression
Mean: 5686402456527141.0 Linear Regression
standard Deviation: 1.179797003958671e+16 Linear Regression
R2                                   
Scores: [-3.72792405e+21 -2.27862177e+11 -5.43560040e+20 -5.61627496e+24
 -3.29562792e+24 -2.62588703e+20  5.45955575e-01  1.20874239e-01
  5.32482282e-01 -6.35160635e-01] Linear Regression
Mean: -8.916436951175416e+23 Linear Regression
standard Deviation: 1.85616371198575e+24 Linear Regression


In [148]:
""" Merge all predictions in one dataset """

all_pred = pd.DataFrame({'Ensemble Prediction': ensemble_regression_pred, 'Support Vector Prediction': svr_pred,
                         'Gradient Boosting Prediction': gradient_boosting_pred, 'Random Forest Prediction': random_forest_pred,
                         'Lasso Prediction': lasso_comp_pred, 'Linear Regression Prediction': lin_pred,
                         'Votes': dataframe_test['votes'], 'city': dataframe_test['city'], 'party': dataframe_test['party'], 'name': dataframe_test['Name_total']})

all_pred.to_excel(r'C:\Users\mariu\Desktop\Project\all_predictions.xlsx')

In [183]:
""" All Features/Coefficients in one dataset """

coefficients_lasso = pd.DataFrame({'Variable': X2, 'Coefficients Lasso': res_lasso_cv.coef_, 
                                   'Coefficients Gradient Boosting': grad_boost.feature_importances_,
                                   'Coefficients SVM': svr.coef_ ,
                                  }) # Create a dataset with the estimated coefficients

coefficients_lasso.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients_Estimators.xlsx')

list_coefficients = pd.DataFrame({'Variable': X2_rand_for, 'Coefficients RF': feature_importances_coef_comp,
                                 'Coefficients LR': lin_reg.coef_,
                                 }) # Create a dataset with the estimated coefficients - only the core variables (used for RF and LR)

list_coefficients.to_excel(r'C:\Users\mariu\Desktop\Project\Coefficients_Random_Forest_Lin_Reg.xlsx')


In [162]:
""" Evaluate on party seats for each estimator """

import statsmodels.tools.tools as sm

dataframe_seats = pd.read_excel(r'C:\Users\mariu\Desktop\Project\Evaluation_seats.xlsx')

estimators = ['Ensemble', 'Gradient Boosting', 'Random Forest', 'Lasso', 'Linear Regression', 'Support Vector Machine']

score = []

for i in range(len(estimators)):
    
    estimator = estimators[i]
    data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match(estimator)]
    
    X_seats = data_estimator['Predicted Seats']
    y_seats = data_estimator['True Seats']
    X = sm.add_constant(X_seats)
    
    mse_eval_seats_estimator = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
    r2_eval_seats_estimator= cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)
    display_score(estimator, mse_eval_seats_estimator, r2_eval_seats_estimator)
  

MSE                                  
Scores: [1.53323263 0.69730152 2.51026109 2.76513724 1.23844075 1.70881249
 0.80096218 0.21982417 1.34298295 0.67475326] Ensemble
Mean: 1.3491708281454362 Ensemble
standard Deviation: 0.7735852252252899 Ensemble
R2                                   
Scores: [0.82689547 0.82876268 0.69326066 0.59521262 0.76822281 0.83390221
 0.97929471 0.98982681 0.79819824 0.75513709] Ensemble
Mean: 0.8068713298483136 Ensemble
standard Deviation: 0.11259021000512717 Ensemble
MSE                                  
Scores: [1.37076549 1.07625199 2.53274236 2.05158662 1.22519696 2.29570449
 2.20362547 1.14834725 1.50147623 0.60329823] Gradient Boosting
Mean: 1.6008995095973588 Gradient Boosting
standard Deviation: 0.5997289762539889 Gradient Boosting
R2                                   
Scores: [0.86163742 0.59207049 0.68774189 0.77717018 0.77315352 0.700217
 0.84327697 0.72237865 0.74775599 0.8042521 ] Gradient Boosting
Mean: 0.7509654209505499 Gradient Boosting
stan

In [None]:
import plotly.plotly as py
import plotly.graph_objs as go
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.tools as tls

tools.set_credentials_file(username='marius92', api_key='4naxu3XiGZTGniWO02z1')

In [149]:
""" Make a nice graph """

dataframe_seats = pd.read_excel(r'C:\Users\mariu\Desktop\Project\Evaluation_seats.xlsx')

estimators = ['Ensemble', 'Gradient Boosting', 'Random Forest', 'Lasso', 'Linear Regression', 'Support Vector Machine']

colors = ['#0D76BF', '#43B02A', '#F93822', '#3E332E', '#FFD700', '#D62598', '#8D3921', '#F06400', '#c1c0c0', '#c1c0c0', '#c1c0c0',
         '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0',
         '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0',
         '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0',
         '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0', '#c1c0c0']

parties = ['AfD', 'Grüne', 'SPD', 'CDU', 'FDP', 'Linke', 'Partei', 'FW', 'Piraten', 'UFF', 'Junges Freiburg',
          'Bürger für Freiburg', 'Urbanes Freiburg', ' FL', 'Für Freiburg', 'GA Freiburg', 'Liste Teilhabe', 'nicht',
          'Urbanes Freiburg', 'GA Heidelberg', 'Bunte Linke', 'heidelberger', 'HiB', 'Für KA', 'KAL', 'BIG', 'Mannheimer Volkspartei',
          'Mittelstand für Mannheim', 'Tierschutzpartei', 'BZS 23', 'DIB', 'Fem. Liste', 'Junges Liste', 'Kein Fahrverbot',
          'öpd', 'sös', 'SchUB', 'Stadtisten', 'BLO', 'UfA', 'UVL', 'WWG', 'UWS']



for i in range(len(estimators)):
    
    estimator = estimators[i]
    
    data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match(estimator)]
        
    X_seats = data_estimator[['Predicted Seats', 'Party']]
    y_seats = data_estimator[['True Seats', 'Party']]
    
    data = []
    
    for i in range(len(parties)):
        
        party_now = parties[i]
        color_now = colors[i]
        X_data = X_seats[X_seats['Party'].str.match(party_now)]
        y_data = y_seats[y_seats['Party'].str.match(party_now)]
        
        trace = go.Scatter(
            x = X_data['Predicted Seats'],
            y = y_data['True Seats'],
            name = party_now,
            mode='markers',
            marker=dict(
                color=color_now,
                size=12,
                line=dict(
                    color='rgba(217, 217, 217, 0.14)',
                    width=0.5),
                opacity=0.8)
        )
        data.append(trace)
    
    trace_1 = go.Scatter(
        x = [0, 5, 10, 16],
        y = [0, 5, 10, 16],
        name = 'Fit',
        mode = 'lines',
        showlegend=False,
        marker = dict(
            color = 'black')
    )

    data.append(trace_1)

    filename = estimator
    
    layout = go.Layout(
        legend=dict(orientation="h")
    )
    fig = dict(data = data, layout = layout)
    fig['layout'].update(width = 600, height = 600)
    fig['layout'].update(title = filename, showlegend = False)
    fig['layout'].update(xaxis = dict(
                    title = 'Predicted Seats'))
    fig['layout'].update(yaxis = dict(
                    title = 'Actual Seats',
                    tickmode = 'auto'))

    py.iplot(fig, filename = filename)
    


Consider using IPython.display.IFrame instead



In [170]:
X_line = [1,2,3,4,5,6,7,8,9,10]

data_line = []

trace_1 = go.Scatter(
    x = X_line,
    y = r2_lasso_test,
    name = 'R2 Lasso',
    mode = 'lines',
    marker = dict(
        color = 'purple')
)

data_line.append(trace_1)

trace_2 = go.Scatter(
    x = X_line,
    y = r2_svr_test,
    name = 'R2 SVM',
    mode = 'lines',
    marker = dict(
        color = 'black')
)

#data_line.append(trace_2)

trace_3 = go.Scatter(
    x = X_line,
    y = r2_rf_test,
    name = 'R2 Random Forest',
    mode = 'lines',
    marker = dict(
        color = 'green')
)

data_line.append(trace_3)

trace_4 = go.Scatter(
    x = X_line,
    y = r2_gb_test,
    name = 'R2 Gradient Boosting',
    mode = 'lines',
    marker = dict(
        color = 'blue')
)

data_line.append(trace_4)

trace_5 = go.Scatter(
    x = X_line,
    y = r2_er_test,
    name = 'R2 Ensemble',
    mode = 'lines',
    marker = dict(
        color = 'red')
)

data_line.append(trace_5)

trace_6 = go.Scatter(
    x = X_line,
    y = r2_linreg_test,
    name = 'R2 Linear Regression',
    mode = 'lines',
    marker = dict(
        color = 'black')
)

#data_line.append(trace_6)


filename = 'R2_path'
    
layout = go.Layout(
    legend=dict(orientation="h")
)
fig = dict(data = data_line, layout = layout)
fig['layout'].update(width = 600, height = 600)
fig['layout'].update(title = filename)
fig['layout'].update(xaxis = dict(
                title = 'Part of Cross Validation'))
fig['layout'].update(yaxis = dict(
                title = 'R2 Score',
                tickmode = 'auto'))

py.iplot(fig, filename = filename)

In [179]:

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Linear Regression')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_lin_reg = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_lin_reg = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)

X_line = [1,2,3,4,5,6,7,8,9,10]

data_line = []

trace_1 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_lin_reg,
    name = 'R2 Linear Regression',
    mode = 'lines',
    marker = dict(
        color = 'purple')
)

data_line.append(trace_1)

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Lasso')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_lasso = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_lasso = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)


trace_2 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_lasso,
    name = 'R2 Lasso',
    mode = 'lines',
    marker = dict(
        color = 'black')
)

data_line.append(trace_2)

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Support Vector Machine')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_svm = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_svm = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)

trace_3 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_svm ,
    name = 'R2 Support Vector Machine',
    mode = 'lines',
    marker = dict(
        color = 'green')
)

data_line.append(trace_3)

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Random Forest')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_rf = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_rf = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)

trace_4 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_rf,
    name = 'R2 Random Forest',
    mode = 'lines',
    marker = dict(
        color = 'blue')
)

data_line.append(trace_4)

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Gradient Boosting')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_gb = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_gb = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)

trace_5 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_gb,
    name = 'R2 Gradient Boosting',
    mode = 'lines',
    marker = dict(
        color = 'red')
)

data_line.append(trace_5)

data_estimator = dataframe_seats[dataframe_seats['Estimator'].str.match('Ensemble')]
X_seats = data_estimator['Predicted Seats']
y_seats = data_estimator['True Seats']
X = sm.add_constant(X_seats)
mse_eval_seats_er = cross_val_score(lin_reg, X, y_seats, scoring = 'neg_mean_squared_error', cv = 10, n_jobs = -1)
r2_eval_seats_er = cross_val_score(lin_reg, X, y_seats, scoring = 'r2', cv = 10, n_jobs = -1)

trace_6 = go.Scatter(
    x = X_line,
    y = r2_eval_seats_er,
    name = 'R2 Ensemble',
    mode = 'lines',
    marker = dict(
        color = 'orange')
)

data_line.append(trace_6)


filename = 'R2_path_seats'
    
layout = go.Layout(
    legend=dict(orientation="h")
)
fig = dict(data = data_line, layout = layout)
fig['layout'].update(width = 600, height = 600)
fig['layout'].update(title = filename)
fig['layout'].update(xaxis = dict(
                title = 'Part of Cross Validation'))
fig['layout'].update(yaxis = dict(
                title = 'R2 Score Seats',
                tickmode = 'auto'))

py.iplot(fig, filename = filename)

In [180]:
coefficients_lasso

Unnamed: 0,Variable,Coefficient
0,place_list,-325.390411
1,incumbent,0.000000
2,woman,0.000000
3,doctor,0.000000
4,time,-0.000000
5,federal_election,0.000000
6,aristocracy,-0.000000
7,google_stan,-0.000000
8,population,-0.000000
9,share_students,-0.000000


In [198]:
tls.get_embed('https://plot.ly/~marius92/54')

'<iframe id="igraph" scrolling="no" style="border:none;" seamless="seamless" src="https://plot.ly/~marius92/54.embed" height="525" width="100%"></iframe>'