In [14]:
import itertools
import pyodbc
from IPython.core.display import display
from statsmodels.tsa.stattools import kpss
from Query import Query
from configuration import Configuration
from statsmodels.tsa.arima.model import ARIMA
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)
pd.set_option('display.max_columns', 0)
quantidadeSubSets = int( Configuration.getConfigValue('QUESTION5_METRIC1_SUBSSETS'))
from sklearn.metrics import mean_absolute_percentage_error

#Referencias de publicações sobre a utilização de bibliotecas python para series temporais.
#https://towardsdatascience.com/arima-forecasting-in-python-90d36c2246d3
#https://machinelearningmastery.com/backtest-machine-learning-models-time-series-forecasting/


def melhor_aic(data):
    # Grid Search Best AIC
    p = d = q = range(0,8) # p, d, and q can be either 0, 1, or 2
    pdq = list(itertools.product(p,d,q)) # gets all possible combinations of p, d, and q
    combs = {} # stores aic and order pairs
    aics = [] # stores aics
    # Grid Search continued
    for combination in pdq:
        try:
            model = ARIMA(data, order=combination) # create all possible models
            model = model.fit()
            combs.update({model.aic : combination   }) # store combinations
            aics.append(model.aic)
        except:
           continue
    display(aics)
    best_aic = min(aics)
    display("Best AIC: {0} {1}".format(best_aic, combs[best_aic]))
    return combs[best_aic]

def testar_estacionaridade(timeseries):
    adf_test(timeseries)
    kpss_test(timeseries)

def adf_test(series):
     #Perform Dickey-Fuller test:
    display('Results of Dickey-Fuller Test:')
    dftest = adfuller(series, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    display(dfoutput)
    #O valor p é maior do que o nível de significância 0,05,
    # por isso não é estacionário e diferenciamento é como tal necessário,
    # ou seja. d > 0.
    display(f'Result adf : The series is {"not " if dftest[1] > 0.05 else ""} stationary')
    if dftest[0] > dftest[4]["5%"]:
        display ("Failed to Reject Ho - Time Series is Non-Stationary")
    else:
        display ("Reject Ho - Time Series is Stationary")


def kpss_test(series):
    statistic, p_value, n_lags, critical_values = kpss(series, regression='c', nlags="auto")
    # Format Output
    display('Results of KPSS Test:')
    display(f'KPSS Statistic: {statistic}')
    display(f'p-value: {p_value}')
    display(f'num lags: {n_lags}')
    display('Critial Values:')
    for key, value in critical_values.items():
        display(f'   {key} : {value}')
    display(f'Result kpss_test: The series is {"not " if p_value < 0.05 else ""} stationary')


def forecast_accuracy(forecast, actual):
    mapeb = mean_absolute_percentage_error(actual, forecast)
    display({'mape':mapeb})
    #mape = np.mean(np.abs(forecast - actual)/np.abs(actual))  # MAPE
    #me = np.mean(forecast - actual)             # ME
    #mae = np.mean(np.abs(forecast - actual))    # MAE
    #mpe = np.mean((forecast - actual)/actual)   # MPE
    #rmse = np.mean((forecast - actual)**2)**.5  # RMSE
    #corr = np.corrcoef(forecast, actual)[0,1]   # corr
    #mins = np.amin(np.hstack([forecast[:,None],                         actual[:,None]]), axis=1)
    #maxs = np.amax(np.hstack([forecast[:,None],actual[:,None]]), axis=1)
    #minmax = 1 - np.mean(mins/maxs)             # minmax
    #return({'mape':mape, 'me':me, 'mae': mae,'mpe': mpe, 'rmse':rmse,'corr':corr, 'minmax':minmax})


def recuperar_dados_train_test(df):
    registros = len(df)
    qtdTreino = int( registros / 100 * 80)
    qtdTest = registros - qtdTreino
    treino = df.iloc[:qtdTreino]
    teste = df.iloc[qtdTreino:registros]
    display('Observations: %d' % registros + ' Treino: %d' % qtdTreino + ' Teste: %d' % qtdTest)
    return  treino,teste

def recuperar_dataset():
    query = Query.getQuery('query_5_1')
    conn = pyodbc.connect(Configuration.getConfigValue('database_connection'))
    query_result = pd.read_sql_query(query,conn)
    df = pd.DataFrame(query_result)
    df['TOTAL_ADD_CS'].astype('float64')
    df['DATA'] = pd.to_datetime(df['DATA'], dayfirst=True)
    df.sort_values('DATA', inplace=True)
    df.set_index('DATA', inplace=True)
    df.head()
    return df

def forecast(best_aic,train,test,full):
    elements = len(test)
    _train = len(train)
    _test = len(test)

    # Build Model
    model = ARIMA(train, order=best_aic)
    model_fit = model.fit(disp=-1)

    # Forecast
    fc, se, conf = model_fit.forecast(elements,alpha=0.05)  # 95% conf

    #Exibindo os erros
    forecast_accuracy(fc,test["TOTAL_ADD_CS"].values)

    # Make as pandas series
    fc_series = pd.Series(fc, index=test.index)
    lower_series = pd.Series(conf[:, 0], index=test.index)
    upper_series = pd.Series(conf[:, 1], index=test.index)

    # Plot
    plt.figure(figsize=(12,5), dpi=100)
    plt.plot(train, label='training')
    plt.plot(test, label='actual')
    plt.plot(fc_series, label='forecast')
    plt.fill_between(lower_series.index, lower_series, upper_series,color='k', alpha=.15)
    plt.title('Forecast x Atual')
    plt.legend(loc='upper left', fontsize=8)
    plt.ylabel("Added amount of code smell")
    plt.xlabel("Month/Year")
    plt.show()

    # Actual vs Fitted
    #model_full = ARIMA(full, order=best_aic)
    #model_fit_full = model_full.fit(disp=-1)
    #model_fit_full.plot_predict(dynamic=False)
    #display(model_fit.summary())


def main():
    # load dataset
    dados = recuperar_dataset()
    if len(dados) == 0:
        display("No data to analyze")
    else:
        quantidade_total = len(dados)
        quantidade_fold =  int(quantidade_total / quantidadeSubSets)

        #check if data train is stationary]
        testar_estacionaridade(dados['TOTAL_ADD_CS'].values)

        #dados aleatoria para treinar o modelo
        best_aic = {}
        aics = []
        treino  = []
        teste  = []

        for i in range(1,quantidadeSubSets):
            df  = dados.head(quantidade_fold*i)
            #split into train and test sets
            treino, teste  = recuperar_dados_train_test(df)
             #verify best aic
            best_aic =  melhor_aic(treino)
            aics.append(best_aic)

        best_aic = min(aics)
        display("AIC Selected: {0} ".format(best_aic))

        # Train & Forecast
        forecast(best_aic,treino,teste,dados)


main()

'Results of Dickey-Fuller Test:'

Test Statistic                  -42.940346
p-value                           0.000000
Lags Used                         1.000000
Number of Observations Used    3777.000000
Critical Value (1%)              -3.432083
Critical Value (5%)              -2.862306
Critical Value (10%)             -2.567178
dtype: float64

'Result adf : The series is  stationary'

'Reject Ho - Time Series is Stationary'

'Results of KPSS Test:'

'KPSS Statistic: 0.09860594259319189'

'p-value: 0.1'

'num lags: 13'

'Critial Values:'

'   10% : 0.347'

'   5% : 0.463'

'   2.5% : 0.574'

'   1% : 0.739'

'Result kpss_test: The series is  stationary'

'Observations: 944 Treino: 755 Teste: 189'

[8184.234012589428,
 8118.291444446714,
 8119.630811354715,
 8113.155223183277,
 8109.509889096366,
 8108.264482914669,
 8109.213507370007,
 8052.12606345779,
 8341.784107268264,
 8088.333779055124,
 8073.979568547864,
 8050.595617292635,
 8046.254814417305,
 8046.581413561945,
 8031.634619207799,
 8026.6321865562395,
 9060.782995221678,
 8339.544510127467,
 8077.69896671859,
 8059.6032341961545,
 8042.148695585214,
 8041.606185724138,
 8042.8285734188685,
 8029.565138837221,
 9909.178863845984,
 9058.786048470593,
 8342.834707388993,
 8081.891599297028,
 8064.817143161223,
 8046.641701659104,
 8045.979304476224,
 8047.419043191116,
 10793.511615543095,
 9899.863611917805,
 9055.723115683757,
 8347.233433923591,
 8103.530526026395,
 8073.061316744488,
 8057.516818169532,
 8066.839913239687,
 11704.063574325533,
 10779.706180997706,
 9892.149297577842,
 9055.346677680682,
 8353.227035323756,
 8503.589379707408,
 8310.28443913405,
 3364.960913354271,
 12636.157963824138,
 11696.371497065

'Best AIC: 16.0 (3, 3, 4)'

'Observations: 1888 Treino: 1510 Teste: 378'

[16497.3627425849,
 16474.1252201241,
 16475.90135561648,
 16469.385297213346,
 16468.868196835392,
 16470.66631114262,
 16468.939636006457,
 16456.18113234401,
 17223.166625827344,
 16435.010631636607,
 16433.809640615593,
 16430.508026575386,
 16432.493773843613,
 16434.05841831147,
 16433.614636057908,
 16435.582853586795,
 18811.233282300924,
 17221.500641463936,
 16437.670214592836,
 16436.6558471194,
 16433.148204495934,
 16435.363953644483,
 16436.63546980371,
 16436.037011060485,
 20590.919594982763,
 18809.572490227587,
 17227.48525524215,
 16452.16570633752,
 16453.326566340165,
 16446.90518418426,
 16469.414211242667,
 16460.7675233492,
 22441.87835229205,
 20582.871627826946,
 18808.308766922637,
 17236.445664426188,
 6192.0329491980365,
 6470.4391671467565,
 16964.072446741768,
 16955.565491274203,
 24336.540871415607,
 22431.331447884917,
 20580.794839624003,
 18832.83379548244,
 11325.608913723367,
 3243.2262120281976,
 3130.6356478694543,
 17896.164785620284,
 26261.443

'Best AIC: 12.0 (2, 7, 3)'

'Observations: 2832 Treino: 2265 Teste: 567'

[24750.802070567435,
 24727.132161419016,
 24721.09473748735,
 24711.877821141876,
 24709.494388842653,
 24710.975869395224,
 24707.187775629107,
 24695.786214046748,
 25941.19286335093,
 24676.840998594085,
 24676.641019961557,
 24678.6394613667,
 24680.019225409284,
 24682.0183838546,
 24683.01003647711,
 24684.484431368393,
 28386.191697223676,
 25940.023984348183,
 24680.709203176095,
 24680.67138643548,
 24682.661809929104,
 24684.10358544942,
 24686.11567355725,
 24687.14458264497,
 31096.711982515633,
 28384.928601828527,
 25953.43835474759,
 24707.897560367142,
 24781.09411072226,
 24706.90266899335,
 24715.831844386354,
 24717.09034177732,
 33910.76324867541,
 31089.203027714706,
 28384.49284406467,
 25957.088400052868,
 25647.718088482718,
 8628.868924770102,
 15836.31172799401,
 25314.571698097996,
 36786.22687184162,
 33900.93269253892,
 31087.45891877293,
 28400.87224595064,
 5486.857226989972,
 27652.532796399923,
 26995.32063216465,
 26335.430627119786,
 39703.0545627540

'Best AIC: 14.0 (2, 5, 4)'

'AIC Selected: (2, 5, 4) '

ValueError: too many values to unpack (expected 3)