# Imports and settings

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size used by every plot in this notebook
plt.rcParams['figure.figsize'] = (16, 8)

In [None]:
import numpy as np
import pandas as pd

In [None]:
import os, sys, inspect

# Locate the directory of the file that is currently being executed.
_current_file = inspect.getfile(inspect.currentframe())
script_dir = os.path.dirname(os.path.abspath(_current_file))

# opengrid lives two directory levels up: make it importable.
_opengrid_root = os.path.join(script_dir, os.pardir, os.pardir)
sys.path.append(_opengrid_root)

# Load mock data

In [None]:
# first day of each month in 2014 (unix timestamps in milliseconds)
date = [
    1388534400000, 1391212800000, 1393632000000, 1396310400000,
    1398902400000, 1401580800000, 1404172800000, 1406851200000,
    1409529600000, 1412121600000, 1414800000000, 1417392000000,
]
# gas in kWh
y = [
    93906.1, 74794.7, 61266.9, 24086.9, 20694.7, 1321.7,
    0, 0, 0, 23711.2, 53668.7, 148793.7,
]
# degree days
x1 = [321, 277, 228, 121, 99, 26, 6, 35, 23, 92, 224, 376]
# weekdays
x2 = [21, 20, 22, 21, 19, 21, 22, 20, 22, 22, 20, 21]
# weekend days
x3 = [10, 8, 9, 9, 12, 9, 9, 11, 8, 9, 10, 10]
# frost
x4 = [-20.3, -6.4, -29.3, -3.6, 0, 0, 0, 0, 0, 0, -7.3, -125.3]

In [None]:
# Turn the millisecond unix timestamps into a pandas DatetimeIndex:
# scale to seconds first, then let numpy interpret them as datetime64[s].
_epoch_seconds = np.array([stamp/1000 for stamp in date])
date = pd.DatetimeIndex(_epoch_seconds.astype('datetime64[s]'))

In [None]:
# Build one Series per variable, all sharing the monthly DatetimeIndex.
# NOTE(review): the name `dict` shadows the Python builtin; it is kept because
# the DataFrame construction below refers to it. Rename both together.
dict = {
    label: pd.Series(values, index=date)
    for label, values in (
        ('gas', y),
        ('graaddagen', x1),
        ('weekdagen', x2),
        ('weekenddagen', x3),
        ('vorst', x4),
    )
}

In [None]:
# Combine the individual Series into a single DataFrame, one column per variable
df = pd.DataFrame(data=dict)

In [None]:
# Draw every column in its own figure, using the column name as title.
for name in df.columns:
    plt.figure()
    series = df[name]
    series.plot(title='{}'.format(name))

# Define classes

In [None]:
from opengrid.library.analyses.analysis import Analysis

In [None]:
import statsmodels.formula.api as sm

class MultivariateLinearRegression(Analysis):
    """
        Ordinary Least Squares regression of one DataFrame column on all other columns.

        The fit that uses every other column as explanatory variable is stored in
        `self.result`; `getOLSWithSignificantVariables` re-fits the model with only
        the variables whose p-value is below a given limit.
    """

    def __init__(self, data, dependentVariable):
        """
            Calculate an Ordinary Least Squares Regression on a dataset with multiple variables

            Parameters
            ----------
            data: Pandas Dataframe
            dependentVariable: String
                name of the dependent variable. This will be the y value, all other columns will be used as x values
        """
        self.data = data
        self.dependentVariable = dependentVariable

        #select all column names that are not the dependent variable, in column order
        variables = [name for name in data.columns.tolist() if name != dependentVariable]

        self.result = self._runOLS(data=self.data, dependentVariable=dependentVariable, otherVariables=variables)

    def _constructFormula(self, dependentVariable, variables):
        """
            Take a dependent variable y and list of variables and concatenate them into
            "y ~ var1 + var2 + var3", keeping the order of the list.

            The list is only read, never modified. An empty list gives the
            intercept-only formula "y ~ 1".

            Parameters
            ----------
            dependentVariable: String
            variables: list of strings

            Returns
            -------
            string
        """
        #"1" is the formula notation for "intercept only"; an empty right hand side is not a valid formula
        rhs = " + ".join(variables) if variables else "1"

        return "{} ~ {}".format(dependentVariable, rhs)

    def _runOLS(self, data, dependentVariable, otherVariables):
        """
            Construct the formula and run the OLS

            Parameters
            ----------
            data: Pandas Dataframe
            dependentVariable: String
            otherVariables: list of strings

            Returns
            -------
            the fitted model, as returned by sm.ols(...).fit()
        """
        #The OLS calculation takes a formula of the form "y ~ x1 + x2"
        formula = self._constructFormula(dependentVariable, otherVariables)

        return sm.ols(formula=formula, data=data).fit()

    def getOLSWithSignificantVariables(self, pvalueLimit=0.05):
        """
            Re-run the OLS but only with the variables where the pValue is below the pvalueLimit (standard 0.05)
            which means that the variable is statistically significant

            Parameters
            ----------
            pvalueLimit: float

            Returns
            -------
            the fitted model, as returned by sm.ols(...).fit()
        """
        #get variables that are significant
        variables = self.getSignificantVariables(pvalueLimit)

        return self._runOLS(data=self.data, dependentVariable=self.dependentVariable, otherVariables=variables)

    def getSignificantVariables(self, pvalueLimit=0.05):
        """
            Return the names of the columns where the pValue is below the pvalueLimit (standard 0.05),
            meaning that the variable is statistically significant

            Parameters
            ----------
            pvalueLimit: float

            Returns
            -------
            list of strings
        """
        #iterate all pvalues, ignore intercept, return names where the value is smaller than pvaluelimit
        #(Series.items() is used because Series.iteritems() no longer exists in pandas 2.0+)
        return [name for name, value in self.result.pvalues.items() if name != 'Intercept' and value < pvalueLimit]

# Run test

## First with all variables

In [None]:
# Fit gas consumption against all other columns of the DataFrame
MVLR = MultivariateLinearRegression(data=df, dependentVariable='gas')

In [None]:
# Show the fitted parameters of the model that uses all variables
MVLR.result.params

In [None]:
# Show the summary of the fit that uses all variables
MVLR.result.summary()

The p-values for weekdays and weekend days are fairly high; only degree days and frost appear to be statistically significant.

## Repeat with degree days and frost only

In [None]:
# Names of the variables whose p-value is below the default limit of 0.05
MVLR.getSignificantVariables()

In [None]:
# Re-fit the model with only the significant variables (default p-value limit of 0.05)
D2 = MVLR.getOLSWithSignificantVariables()

In [None]:
# Show the summary of the reduced fit
D2.summary()

# Plot models

In [None]:
# Compare the gas data with the fitted values of both models in one figure.
fig = plt.figure()
ax1 = fig.add_subplot(111)
# Axes.plot_date is deprecated in Matplotlib; Axes.plot handles the
# DatetimeIndex on the x axis directly.
ax1.plot(df.index, df['gas'], '-', label='gas')
ax1.plot(df.index, MVLR.result.fittedvalues, '-', label='model with 4 variables')
ax1.plot(df.index, D2.fittedvalues, '-', label='model with only degreedays & frost')

ax1.legend()