In [1]:
%matplotlib notebook
%reset -f


import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.feature_selection import VarianceThreshold

In [2]:
def regress(X, Y):
    """Fit an ordinary least squares regression of Y on X plus a constant term.

    Parameters
    ----------
    X : regressors; passed through ``sm.add_constant`` before fitting.
    Y : dependent variable handed to ``sm.OLS`` as the response.

    Returns
    -------
    The fitted results object produced by ``sm.OLS(...).fit()``.
    """
    design = sm.add_constant(X)
    return sm.OLS(Y, design).fit()


def variance_threshold_selector(data, threshold):
    """Return the columns of ``data`` that a ``VarianceThreshold`` fit retains.

    Parameters
    ----------
    data : DataFrame whose columns are the candidate features.
    threshold : value forwarded to ``VarianceThreshold``.

    Returns
    -------
    ``data`` restricted to the columns reported by ``get_support``.
    """
    fitted = VarianceThreshold(threshold).fit(data)
    kept_positions = fitted.get_support(indices=True)
    kept_labels = data.columns[kept_positions]
    return data[kept_labels]

In [3]:
# Load the Excel workbook into a pandas DataFrame.
# 'Country' is deliberately kept as an ordinary column: the cells below drop it
# by column name. The former bare `db.set_index("Country")` call was removed
# because set_index returns a new frame and that result was being discarded,
# so the call had no effect on `db`.
db = pd.read_excel('base_reduced1.xlsx')
# Total number of missing cells across the whole sheet.
db.isnull().sum().sum()

0

In [4]:
# Snapshot the current column labels and rebuild the frame on exactly that list.
# NOTE(review): nothing is excluded at this point -- the frame is reindexed on
# its own columns; 'Country' is only dropped when `df` is built further down.
columns = db.columns.tolist()
db = db.reindex(columns=columns)

In [5]:
# Every column except the 'Country' label goes into the modelling frame.
new_columns = db.columns.tolist()
new_columns.remove('Country')
# Modelling frame: same rows as db, without the 'Country' column.
df = db.loc[:, new_columns]

In [6]:
# Regression variables: 'VDEM' is the response, every other column of df is a
# candidate regressor.
Y = df['VDEM']
# The feature list is rebuilt from df instead of removing 'VDEM' from the
# existing list in place, so this cell can be re-run without a ValueError.
new_columns = [c for c in df.columns if c != 'VDEM']
X = df[new_columns]

In [7]:
# First feature-selection method: variance threshold.
# The cutoff is p * (1 - p) with p = 0.85. Assuming the features are 0/1
# indicators (TODO confirm), a column where more than 85% of the answers are
# identical falls below it and is excluded, since it explains almost nothing.
dominant_share = .85
X = variance_threshold_selector(X, dominant_share * (1 - dominant_share))

In [8]:
# Univariate screening: regress Y on each candidate feature on its own and
# record, per feature, the fitted coefficients and their p-values.
questions = []
questionsCoefficients = []
questionsTtests = []
for c in X.columns:
    clf = regress(X[c], Y)
    questions.append(c)
    # Each dict has two entries: the intercept ('const') and the feature itself.
    questionsCoefficients.append(dict(clf.params))
    # results.pvalues carries the same numbers as the 'P>|t|' column of
    # summary2().tables[1], without building the whole summary on every pass.
    questionsTtests.append(dict(clf.pvalues))

# One row per screened feature; the two dict-valued columns are flattened in
# the following cell.
dici = {
    'Questions': questions,
    'Coefficients': questionsCoefficients,
    'P>|t|': questionsTtests,
}
frame = pd.DataFrame.from_dict(dici)



In [9]:
# Flatten the per-feature dicts from the univariate screening into one
# {name: value} mapping per statistic. Every source dict also carries a 'const'
# key, so after the updates 'const' holds only the intercept of the last
# regression that was run.
aggr = {}
for e in dici['Coefficients']:
    aggr.update(e)
questions = list(aggr)
questionsCoefficients = list(aggr.values())

# Same flattening for the p-values; both mappings are filled in the same order,
# so the three lists stay aligned row by row.
aggr = {}
for e in dici['P>|t|']:
    aggr.update(e)
questionsTtests = list(aggr.values())

dici2 = {
    'Questions': questions,
    'Coefficient': questionsCoefficients,
    'P>|t|': questionsTtests,
}
frame2 = pd.DataFrame.from_dict(dici2)

# Optional significance filter, currently disabled:
#frame2 = frame2[frame2['P>|t|']<0.10]

# Keep the 35 rows with the largest coefficients.
# NOTE(review): the ranking uses the signed coefficient, so strongly negative
# effects are pushed out of the top 35 -- confirm this is intended.
frame2 = frame2.sort_values(by='Coefficient', ascending=False)
frame2 = frame2.head(35)
# 'const' is not a column of X. It is filtered out here instead of calling
# list.remove, which raised ValueError whenever 'const' did not rank in the
# top 35. When it does rank there, the selection is the same as before.
new_columns = [q for q in frame2['Questions'] if q != "const"]
X = X[new_columns]
len(frame2)

35

In [10]:
# Manual exclusion of a single question, currently disabled:
#colunas.remove("16.2 - Yes, for legal persons")
# Restrict X to the current list of selected questions.
X = X.loc[:, new_columns]

In [11]:
# Multivariate OLS of Y on all features that survived the selection steps.
results = regress(X, Y)
results.summary()

0,1,2,3
Dep. Variable:,VDEM,R-squared:,0.997
Model:,OLS,Adj. R-squared:,0.94
Method:,Least Squares,F-statistic:,17.72
Date:,"Tue, 28 Jul 2020",Prob (F-statistic):,0.0548
Time:,13:36:41,Log-Likelihood:,121.5
No. Observations:,37,AIC:,-173.0
Df Residuals:,2,BIC:,-116.6
Df Model:,34,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.4592,0.316,-1.454,0.283,-1.818,0.900
"8.1 - No, but specific limit",-0.1887,0.069,-2.727,0.112,-0.487,0.109
"16.1 - Yes, for natural persons",0.3848,0.086,4.486,0.046,0.016,0.754
31.3 - Previous election,0.2776,0.058,4.769,0.041,0.027,0.528
"7.2 - No, but specifc limit",0.2759,0.063,4.391,0.048,0.006,0.546
29,-0.1108,0.034,-3.287,0.081,-0.256,0.034
35.3 - Share of votes,0.0007,0.044,0.015,0.989,-0.188,0.189
"20.2 - Yes, for natural persons",0.2350,0.091,2.592,0.122,-0.155,0.625
37.2 - Tax Relief,0.2343,0.073,3.200,0.085,-0.081,0.549

0,1,2,3
Omnibus:,0.047,Durbin-Watson:,2.186
Prob(Omnibus):,0.977,Jarque-Bera (JB):,0.153
Skew:,0.076,Prob(JB):,0.926
Kurtosis:,2.724,Cond. No.,231.0


In [12]:

def autoFitter(x, y, threshold=0.10):
    """Backward elimination on p-values: refit until every feature passes.

    On each pass the model is fitted with ``regress``; if the largest feature
    p-value exceeds ``threshold`` that feature is dropped and the model is
    refitted. Otherwise the summary is printed and the fit is returned.

    Parameters
    ----------
    x : DataFrame of candidate regressors (columns are selected by label).
    y : dependent variable, passed unchanged to ``regress``.
    threshold : largest p-value a feature may have and still be kept.
        Defaults to 0.10, the cutoff that was previously hard-coded.

    Returns
    -------
    The fitted results object of the final model. Earlier versions returned
    None whenever at least one feature had been eliminated, because the
    result of the recursive call was not propagated.
    """
    while True:
        results = regress(x, y)
        # The intercept is never a removal candidate. Previously its p-value
        # took part in the max() and, when it was the largest, 'const' was
        # removed from the list twice, raising ValueError.
        pvalues = results.pvalues.drop('const', errors='ignore')
        worst_p = pvalues.max()  # NaN when no feature p-value is available
        if not worst_p > threshold:
            print(results.summary())
            return results
        # idxmax returns the first label holding the maximum, matching the
        # original list.index(max(...)) tie-breaking.
        worst_feature = pvalues.idxmax()
        x = x[[c for c in pvalues.index if c != worst_feature]]


# Keep the final fit so later cells can use it; the summary is printed inside.
final_results = autoFitter(X, Y)

                            OLS Regression Results                            
Dep. Variable:                   VDEM   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.977
Method:                 Least Squares   F-statistic:                     51.27
Date:                Tue, 28 Jul 2020   Prob (F-statistic):           3.83e-05
Time:                        13:36:41   Log-Likelihood:                 118.52
No. Observations:                  37   AIC:                            -175.0
Df Residuals:                       6   BIC:                            -125.1
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                                             coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------