# Import packages and load datasets

In [10]:
import glob, os
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from joblib import dump, load
from sklearn import linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

import lime
import lime.lime_tabular

import time

# Install an 'ignore' filter so that all Python warnings are suppressed.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Dataset original
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('../dist/', "*.csv"))))
# df = pd.read_csv('../dist/df_subset.csv')

In [None]:
# %%time
# # Dates: parse 'time' and derive year / month / day / hour columns
# df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')
# df['year'] = [data.year for data in df['time']]
# df['month'] = [data.month for data in df['time']]
# df['day'] = [data.day for data in df['time']]
# df['hour'] = [data.hour for data in df['time']]

# # Secondary dataset
# # df_v2 = df[df.columns[1:]]
# df_v2 = df.copy()
# NOTE(review): this whole cell is disabled, yet cells further down use df_v2
# before any active cell in the visible notebook assigns it — confirm the
# intended run order.

# Functions & Variables

In [19]:
# Candidate regressor classes (the classes themselves, not instances).
# Other candidates tried and currently disabled:
#   linear_model: BayesianRidge, HuberRegressor, PassiveAggressiveRegressor,
#                 RANSACRegressor, SGDRegressor
#   ensemble:     AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
_modelos_lineares = (
    linear_model.LinearRegression,
    linear_model.Lasso,
    linear_model.Ridge,
)
_modelos_ensemble = (
    ensemble.GradientBoostingRegressor,
    ensemble.RandomForestRegressor,
)

# Linear models first, then ensembles.
modelos = list(_modelos_lineares + _modelos_ensemble)

# Exploratory data analysis (EDA)

In [None]:
# Count the missing values in each column of df_v2.
# NOTE(review): no active cell above this one assigns df_v2 in the visible
# notebook (the only earlier assignment is commented out) — confirm the
# intended run order.
df_v2.isna().sum()

In [None]:
# Empty-column test: keep only the columns whose share of non-null values is
# at least 99%.
fracao_preenchida = df.notna().mean()
df_na = df.loc[:, fracao_preenchida >= 0.99]

In [None]:
# Report whether df_na still has exactly the same columns, in the same order,
# as df.
mesmas_colunas = list(df.columns) == list(df_na.columns)
print("base sem null's! =)" if mesmas_colunas else "base com null's! =(")

In [None]:
# Dataset summary as reported by DataFrame.info()
df.info()

In [None]:
# Descriptive statistics of the dataset's numeric features
df.describe()

In [None]:
# Peek at the first rows of the dataset
df.head()

### Boxplot

In [None]:
# %%time
# x_list = ['day','month','year','hour']
# y_list = ['exits','entries']

# for y in y_list:
#     fig, ax = plt.subplots(int(len(x_list)/2), 2, figsize = (18, 12))

#     for i, ax in enumerate(fig.axes):
#         sns.set(style="ticks", palette="deep")
#         sns.boxplot(
#             x=x_list[i], 
#             y=y,
#             data=df,
#             ax=ax
#         ).set_title("Boxplot for '"+y+"': "+x_list[i])

### Convert features to Categorical

In [None]:
# Convert object columns to numeric: each object-dtype column of df_v2 is
# replaced, in place, by its integer category codes.
colunas_texto = df_v2.select_dtypes(include=['object']).columns
for cat_feature in colunas_texto:
    codigos = pd.Categorical(df_v2[cat_feature]).codes
    df_v2[cat_feature] = codigos

In [None]:
%%time
# Load df_v2 from 'df_v2.csv' (path relative to the working directory) and
# print its DataFrame.info() summary.
# NOTE(review): this rebinds df_v2, replacing whatever earlier cells assigned to it.
df_v2 = pd.read_csv('df_v2.csv')
df_v2.info()

In [None]:
# Summary of each feature: print describe() column by column, with blank lines
# between consecutive summaries.
for coluna, serie in df_v2.items():
    print(serie.describe())
    print('\n\n')

### Histogram

In [None]:
# Copy of df_v2 with its columns sorted by label (axis=1), used for the histograms.
df_hist = df_v2.sort_index(axis=1)

In [None]:
# Histogram grid for df_hist using 50 bins; the trailing ';' keeps the return
# value from being echoed as cell output.
df_hist.hist(figsize=(20, 20), bins=50, xlabelsize=8, ylabelsize=8);

### Correlation

In [None]:
# Pairwise correlation between the columns of df_v2.
corr = df_v2.corr()

# Mask for the upper triangle (diagonal included), so each pair is drawn once.
# The builtin `bool` replaces the `np.bool` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current NumPy).
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Figure: one figure/axes pair. A separate plt.figure() call before this would
# only leave an extra, empty figure in the output.
f, ax = plt.subplots(figsize=(11, 9))

# Colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw on the axes created above (passed explicitly rather than relying on the
# implicit "current axes"). vmax caps the colour scale at 0.3; the annotations
# still show the actual coefficients.
sns.heatmap(corr,
            cmap=cmap,
            mask=mask,
            vmax=.3,
            linewidths=.5,
            center=0,
            annot=True,
            annot_kws={"size": 8},
            square=True,
            cbar_kws={"shrink": .5},
            ax=ax);

In [None]:
# Correlation table
corr
#del df_v2['time']

### Linear Regression - entries/exits vs. each feature

In [None]:
# y_list = ['entries','exits']

# for y in y_list:
#     fig, ax = plt.subplots(round(len(df_v2.columns) / 3), 3, figsize = (20, 20))

#     for i, ax in enumerate(fig.axes):
#         if i < len(df_v2.columns) - 1:
# #             print(y, i, len(df_v2.columns))
#             sns.regplot(x=df_v2.columns[i],y=y, data=df_v2, ax=ax).set_title("LR for '"+y+"' vs "+df_v2.columns[i])

### Underfitting vs. Overfitting

In [None]:
# X = df_v2
# y = df_v2['entries']

# polynomial_features = PolynomialFeatures(degree=1, include_bias=False)
# linear_regression = linear_model.LinearRegression()
# pipeline = Pipeline([("polynomial_features", polynomial_features),
#                      ("linear_regression", linear_regression)])
# pipeline.fit(X, y)

# # Evaluate the models using crossvalidation
# scores = cross_val_score(pipeline, X, y, scoring="neg_mean_squared_error", cv=10)

# X_test = X[['ca', 'unit', 'scp', 'station', 'linename', 'division', 'desc', 'exits', 'year', 'month', 'day', 'hour']]
# plt.plot(X_test, pipeline.predict(X_test), label="Model")
# # plt.plot(X_test, X_test, label="True function")
# plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
# plt.xlabel("x")
# plt.ylabel("y")
# plt.xlim((0, 1))
# plt.ylim((-2, 2))
# plt.legend(loc="best")
# plt.show()

### Outlier Detection

In [21]:
# Disabled IsolationForest experiment, kept for reference:
# rng = np.random.RandomState(42)
# clf = ensemble.IsolationForest(max_samples=100, random_state=rng)
# clf.fit(df_v2)
# Only active statement: show df_v2 as the cell output.
df_v2

Unnamed: 0,ca,unit,scp,station,linename,division,desc,exits,year,month,day,hour
0,1,49,173,410,29,0,20,928793.0,2010,4,17,4
1,1,49,174,410,29,0,20,566683.0,2010,4,17,4
2,1,49,200,410,29,0,20,3943492.0,2010,4,17,4
3,1,49,201,410,29,0,20,3733289.0,2010,4,17,4
4,1,49,202,410,29,0,20,3021589.0,2010,4,17,4
5,1,49,203,410,29,0,20,2575465.0,2010,4,17,4
6,1,49,204,410,29,0,20,1751238.0,2010,4,17,4
7,1,49,205,410,29,0,20,907884.0,2010,4,17,4
8,1,49,206,410,29,0,20,409266.0,2010,4,17,4
9,1,49,216,410,29,0,20,0.0,2010,4,17,4


# Model Selection

In [22]:
%%time
# Dataset para model selection
df_ms = df_v2.copy()

# Predict Y
# del df_ms['entries']
# Y = df_v2['entries']

# List of results
resultados = [['status','model','mean','std','time']]

# Model selection - Score
for var in modelos:
    start = time.time()
    try:
        print(var)
        clf = var()
        scores = cross_val_score(clf, df_ms, Y, cv=10)
        print('Mean score: ',np.mean(scores), '/ Std Score: ',np.std(scores))
        resultados.append(['ok',var.__name__,np.mean(scores),np.std(scores),time.time() - start])
    except(Exception):
        print('>> Validar parâmetros.')
        resultados.append(['erro',var.__name__,None,None,time.time() - start])
        pass
    finally:            
        print('-'*100)

<class 'sklearn.linear_model.base.LinearRegression'>
Mean score:  -0.6400744823280323 / Std Score:  1.2453092239778936
----------------------------------------------------------------------------------------------------
<class 'sklearn.linear_model.coordinate_descent.Lasso'>
----------------------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [24]:
# Show the collected rows (header row first).
resultados

[['status', 'model', 'mean', 'std', 'time'],
 ['ok',
  'LinearRegression',
  -0.6400744823280323,
  1.2453092239778936,
  1476.4244894981384]]

### Export results to XLSX

In [None]:
# Tabulate the results: the first row of `resultados` is the header, the rest
# are the records. Passing `columns=` also works when there are no records yet,
# whereas assigning the column names afterwards fails on a frame with 0 columns.
df_result = pd.DataFrame(resultados[1:], columns=resultados[0])

# The context manager saves and closes the workbook on exit, even if to_excel
# raises. ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0.
with pd.ExcelWriter('resultados_modelos.xlsx', engine='xlsxwriter') as writer:
    df_result.to_excel(writer, sheet_name='Sheet1', index=False)

### Fit Model

In [None]:
# Seleciona o melhor modelo
# df_result = df_result.sort_values(by='mean', ascending=False)
# model_selected = pd.Series.tolist(df_result[:1]['model'])
# model_selected[0]

# Fit do modelo
# for item in modelos:
#     if str(item).find(model_selected[0]) > 0:
#         model_result = item.fit(X=df_ms,y=Y)
%%time
Y = df_v2['entries']
del df_v2['entries']
model_result = ensemble.GradientBoostingRegressor().fit(X=df_v2,y=Y)

In [None]:
# Adjust for the next year: same feature rows with 'year' moved one year ahead.
# drop(..., errors='ignore') returns a new frame and guarantees the target is
# not among the features handed to predict().
df_mp = df_ms.drop(columns=['entries'], errors='ignore')
df_mp['year'] = df_mp['year']+1

# Predict before adding any extra column to df_mp. astype('int64') truncates
# toward zero, as int() does, without a Python-level loop.
previsto = model_result.predict(X=df_mp)
df_mp['entries_new'] = previsto.astype('int64')
# Observed target taken from Y, the vector the model was fitted on.
df_mp['entries'] = Y

# Regression metrics. accuracy_score is a classification metric (share of
# exact matches), so it says nothing useful about a continuous target.
erro = df_mp['entries_new'] - df_mp['entries']
mae = erro.abs().mean()
ss_res = (erro ** 2).sum()
ss_tot = ((df_mp['entries'] - df_mp['entries'].mean()) ** 2).sum()
# R2 is undefined when the observed target has zero variance.
r2 = 1 - ss_res / ss_tot if ss_tot else float('nan')

print('regression metrics:\nMAE = {:f} \nR2 = {:f}'.format(mae, r2))

# PILOTO

In [3]:
%%time
df_v2 = pd.read_csv('df_v2.csv')
del df_v2['time']
del df_v2['Unnamed: 0']
df_v2 = df_v2.dropna(axis=0, how='any')
df_v2.isna().sum()

Wall time: 3min 27s


In [5]:
%%time
# df_v2.isna().sum()
# Show df_v2 as the cell output.
df_v2

Wall time: 0 ns


Unnamed: 0,ca,unit,scp,station,linename,division,desc,entries,exits,year,month,day,hour
0,1,49,173,410,29,0,20,2704717.0,928793.0,2010,4,17,4
1,1,49,174,410,29,0,20,2697632.0,566683.0,2010,4,17,4
2,1,49,200,410,29,0,20,1127722.0,3943492.0,2010,4,17,4
3,1,49,201,410,29,0,20,2425570.0,3733289.0,2010,4,17,4
4,1,49,202,410,29,0,20,2214633.0,3021589.0,2010,4,17,4
5,1,49,203,410,29,0,20,1958493.0,2575465.0,2010,4,17,4
6,1,49,204,410,29,0,20,2557379.0,1751238.0,2010,4,17,4
7,1,49,205,410,29,0,20,4325004.0,907884.0,2010,4,17,4
8,1,49,206,410,29,0,20,3281631.0,409266.0,2010,4,17,4
9,1,49,216,410,29,0,20,500.0,0.0,2010,4,17,4


In [6]:
%%time
Y = df_v2['entries']
del df_v2['entries']
model_result = ensemble.GradientBoostingRegressor().fit(X=df_v2,y=Y)

Wall time: 2h 26min 40s


In [11]:
# Save the fitted model to disk with joblib (file is written to the working
# directory).
dump(model_result, 'model_result_v1.joblib')

['model_result_v1.joblib']

In [16]:
%%time
# Reload the saved model and predict on a copy of df_v2.
# NOTE(review): joblib.load unpickles the file, so only load model files from a
# trusted source.
df_mp = df_v2.copy()
clf = load('model_result_v1.joblib')
clf.predict(df_mp)

Wall time: 2min


In [18]:
# Preview the predictions for the first 100 rows only, truncated to int.
# Predicting and printing every row repeats the full, slow predict() call and
# floods the notebook with one output line per row.
list(map(int,clf.predict(X=df_mp.head(100))))

[3456561,
 3456561,
 4199463,
 4199463,
 3902573,
 3652880,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3902573,
 3652880,
 3456561,
 3456561,
 3902573,
 3902573,
 4199463,
 5873297,
 4694079,
 3902573,
 4199463,
 5873297,
 5873297,
 22054823,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 5873297,
 3456561,
 3652880,
 3456561,
 3456561,
 3456561,
 3456561,
 5873297,
 5873297,
 3456561,
 3456561,
 3902573,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 4199463,
 3456561,
 4199463,
 3456561,
 4199463,
 4694079,
 3456561,
 3456561,
 3456561,
 5873297,
 3456561,
 3456561,
 3456561,
 3456561,
 3902573,
 5146651,
 5873297,
 5873297,
 4199463,
 3902573,
 3456561,
 3456561,
 3456561,
 3456561,
 5873297,
 3902573,
 3902573,
 4199463,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3902573,
 3456561,
 3456561,
 3456561,
 3456561,
 3902573,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 3456561,
 5873297,