In [1]:
from pyspark import *
from pyspark.sql.functions import *

# Exploratory data analysis (EDA)

In [3]:
# Original dataset: load every CSV matching /FileStore/tables/*.csv, with header='true'.
#display(dbutils.fs.ls("dbfs:/FileStore/tables/"))
df_desafio = (
  spark.read
  .format("csv")
  .options(header='true')
  .load("/FileStore/tables/*.csv")
)

In [4]:
# Build the analysis frame from the raw CSV data in a single chain:
#   1. drop rows containing any null, then drop exact duplicate rows;
#   2. cast `time` to timestamp and `entries` / `exits` to int, passing the
#      remaining columns through unchanged;
#   3. derive temporal features from `time`.
# The casts and features are applied to the cleaned rows, so the NA/duplicate
# removal is actually reflected in df_desafio_v2.
df_desafio_v2 = (
  df_desafio
  .dropna(how='any')
  .dropDuplicates()
  # Adjust column types
  .selectExpr(
    'cast(time as timestamp) time',
    'ca',
    'unit',
    'scp',
    'station',
    'linename',
    'division',
    'desc',
    'cast(entries as int) entries',
    'cast(exits as int) exits'
  )
  # Temporal features
  .withColumn("dt_year", year(col("time")))
  .withColumn("dt_month", month(col("time")))
  .withColumn("dt_day", dayofmonth(col("time")))
  .withColumn("dt_dayofy", dayofyear(col("time")))
  .withColumn("dt_hour", hour(col("time")))
  .withColumn("dt_min", minute(col("time")))
  .withColumn("dt_week_no", weekofyear(col("time")))
  # Seconds since the Unix epoch
  .withColumn("dt_int", unix_timestamp(col("time")))
)

In [5]:
# Recorded row counts: 79609191 (raw) / 79130015 (after dropna) / 79045675 (after dropna + dropDuplicates)
df_desafio_without_na = df_desafio.dropna(how='any')

count_desafio = df_desafio.count()
count_desafio_na = df_desafio_without_na.count()
count_desafio_final = df_desafio_without_na.dropDuplicates().count()

# Rows removed by each cleaning step
rows_removed_na = count_desafio - count_desafio_na
rows_removed_dup = count_desafio_na - count_desafio_final

# One row for the raw dataset ('antes') and one for the cleaned dataset ('depois')
sample_rows = [
  ('antes', count_desafio, 0, 0, 0),
  ('depois', 0, count_desafio_final, rows_removed_na, rows_removed_dup),
]
df_amostras = sc.parallelize(sample_rows).toDF(['AMOSTRAS','TOTAL','UNICO','NA','DUPLICADO'])

display(df_amostras)

AMOSTRAS,TOTAL,UNICO,NA,DUPLICADO
antes,79609191,0,0,0
depois,0,79045675,479176,84340


In [6]:
# Compact table: only the number of rows removed as NA and as duplicates
removed_na = count_desafio - count_desafio_na
removed_dup = count_desafio_na - count_desafio_final

removed_rows = [('', removed_na, removed_dup)]
df_amostras_v2 = sc.parallelize(removed_rows).toDF(['AMOSTRAS','NA','DUPLICADO'])

display(df_amostras_v2)

AMOSTRAS,NA,DUPLICADO
,479176,84340


In [7]:
# Dataset info: render df_desafio_v2 (typed columns plus the dt_* temporal features)
display(df_desafio_v2)

In [8]:
# Group by every column and count: shows how many times each distinct row occurs
display(df_desafio_v2.groupBy(df_desafio_v2.columns).count())

In [9]:
# Columns whose distinct values are turned into one count column each
pivot_cols = ['ca']
# Grouping keys for the pivot and join keys for join_all.
# NOTE(review): 'ca' is both a grouping key and the pivot column — confirm this is intended.
keys = ['time','ca']

# Input frame for the pivot (alias of df_desafio_v2, not a copy)
before = df_desafio_v2 

# Helper that inner-joins a list of dataframes on a shared set of keys.
# Can be simplified if you only need two columns.
def join_all(dfs,keys):
    """Inner-join every frame in `dfs` on `keys` and return the result.

    The joins are nested from the right, i.e. the result is
    dfs[0].join(dfs[1].join(... dfs[-1] ...)). A single-element list returns
    that element unchanged; an empty list raises IndexError.
    """
    joined = dfs[-1]
    # Walk the remaining frames from right to left so the nesting matches
    # dfs[0].join(<join of the rest>).
    for frame in reversed(dfs[:-1]):
        joined = frame.join(joined, on = keys, how = 'inner')
    return joined

dfs = []
combined = []
for pivot_col in pivot_cols:
    # Count rows per key combination, spreading the pivot column's values into columns
    pivotDF = before.groupBy(keys).pivot(pivot_col).count()

    # The first len(keys) columns are the grouping keys; the rest are pivoted values
    n_keys = len(keys)
    key_names = pivotDF.columns[:n_keys]
    value_names = pivotDF.columns[n_keys:]

    # Prefix each pivoted column with "e_<pivot column>_"
    new_names = key_names + ["e_{0}_{1}".format(pivot_col, c) for c in value_names]

    # fillna(0) replaces nulls in the renamed frame with 0
    df = pivotDF.toDF(*new_names).fillna(0)
    combined.append(df)

join_all(combined,keys).show()

In [10]:
# Rich display of the joined pivot frames (relies on `combined` and `keys` from the previous cell)
display(join_all(combined,keys))

### Convert features to Categorical

In [12]:
# Distinct values of `ca`, collected to the driver as a flat Python list
ca = df_desafio_v2.select("ca").distinct().rdd.flatMap(lambda x: x).collect()

# One 0/1 indicator column per distinct `ca` value, named "e_ca_<value>".
# `when` and `col` come from the star import of pyspark.sql.functions at the top
# of the notebook; no `F` alias is defined anywhere, so they are used unqualified.
# A null `ca` is skipped because it cannot be concatenated into a column alias.
ca_expr = [
  when(col("ca") == ty, 1).otherwise(0).alias("e_ca_" + ty)
  for ty in ca
  if ty is not None
]

display(df_desafio_v2.select("ca", *ca_expr))

In [13]:
# Same display as the previous cell; relies on `ca_expr` built there
display(df_desafio_v2.select("ca", *ca_expr))

In [14]:
# NOTE(review): `df_v2` is not assigned in any visible cell; presumably a pandas
# version of df_desafio_v2 — confirm where it is created before running this cell.
df_v2.info()

In [15]:
# Summary of each feature, followed by a blank-line separator
for coluna in df_v2.columns:
    resumo = df_v2[coluna].describe()
    print(resumo)
    print('\n\n')

### Histogram

In [17]:
# Sort along axis=1 (the column labels) before plotting;
# assumes df_v2 is a pandas DataFrame — TODO confirm
df_hist = df_v2.sort_index(axis=1)

In [18]:
# Histograms of df_hist with 50 bins each; the trailing `;` suppresses the repr output
df_hist.hist(figsize=(20, 20), bins=50, xlabelsize=8, ylabelsize=8);

### Correlation

In [20]:
# Pairwise correlation between the columns of df_v2
corr = df_v2.corr()

# Mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used as dtype: the `np.bool` alias no longer exists in
# current NumPy releases.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Figure: a single figure/axes pair, so no empty extra figure is rendered
f, ax = plt.subplots(figsize=(11, 9))

# Colormap
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Draw on the axes created above; the trailing `;` suppresses the repr output
sns.heatmap(corr,
            cmap=cmap,
            mask=mask,
            vmax=.3,
            linewidths=.5,
            center=0,
            annot=True,
            annot_kws={"size": 8},
            square=True,
            cbar_kws={"shrink": .5},
            ax=ax);

In [21]:
# Correlation table
corr

### Linear Regression - entries/exits vs. each feature

In [23]:
# Alias (not a copy) of df_v2 used for the regression plots
df_v3 = df_v2
y_list = ['entries','exits']

n_grid_cols = 3
# The loop below plots every column except the last one.
# NOTE(review): confirm that skipping the last column is intentional.
n_plots = len(df_v3.columns) - 1
# Integer ceiling division instead of `round`: the star import of
# pyspark.sql.functions at the top of the notebook rebinds `round` to a Column
# function, so it can no longer be used on plain numbers. At least one row is
# always requested so plt.subplots gets a valid grid.
n_grid_rows = max(1, -(-n_plots // n_grid_cols))

for y in y_list:
    # One figure per target, with one panel per plotted feature
    fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize = (20, 20))

    for i, ax in enumerate(fig.axes):
        if i < n_plots:
            sns.regplot(x=df_v3[df_v3.columns[i]],y=y, data=df_v3, ax=ax).set_title("LR for '"+y+"' vs "+df_v3.columns[i])

### Underfitting vs. Overfitting

In [25]:
# X = df_v2
# y = df_v2['entries']

# polynomial_features = PolynomialFeatures(degree=1, include_bias=False)
# linear_regression = linear_model.LinearRegression()
# pipeline = Pipeline([("polynomial_features", polynomial_features),
#                      ("linear_regression", linear_regression)])
# pipeline.fit(X, y)

# # Evaluate the models using crossvalidation
# scores = cross_val_score(pipeline, X, y, scoring="neg_mean_squared_error", cv=10)

# X_test = X[['ca', 'unit', 'scp', 'station', 'linename', 'division', 'desc', 'exits', 'year', 'month', 'day', 'hour']]
# plt.plot(X_test, pipeline.predict(X_test), label="Model")
# # plt.plot(X_test, X_test, label="True function")
# plt.scatter(X, y, edgecolor='b', s=20, label="Samples")
# plt.xlabel("x")
# plt.ylabel("y")
# plt.xlim((0, 1))
# plt.ylim((-2, 2))
# plt.legend(loc="best")
# plt.show()

### Outlier Detection

In [27]:
# rng = np.random.RandomState(42)
# clf = ensemble.IsolationForest(max_samples=100, random_state=rng)
# clf.fit(df_v2)

# Model Selection

In [29]:
# Dataset for model selection: a copy of df_v2 without the target column
df_ms = df_v2.copy()

# Target to predict
del df_ms['entries']
Y = df_v2['entries']

# List of results: header row first, then one row per evaluated model
resultados = [['status','model','mean','std','time']]

# Model selection - score every candidate in `modelos` with 10-fold cross-validation.
# NOTE(review): `modelos`, `cross_val_score`, `np` and `time` are not defined or
# imported in any visible cell — confirm they are provided before this cell runs.
for var in modelos:
    start = time.time()
    try:
        print(var)
        # Each entry is instantiated with its default parameters
        clf = var()
        scores = cross_val_score(clf, df_ms, Y, cv=10)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        print('Mean score: ',mean_score, '/ Std Score: ',std_score)
        resultados.append(['ok',var.__name__,mean_score,std_score,time.time() - start])
    except Exception as exc:
        # Best effort: keep evaluating the remaining models, but report why this
        # one failed instead of discarding the error.
        print('>> Validar parâmetros.')
        print('   {0}: {1}'.format(type(exc).__name__, exc))
        resultados.append(['erro',var.__name__,None,None,time.time() - start])
    finally:
        print('-'*100)

### Export results to an XLSX file

In [31]:
# Results table: the first entry of `resultados` is the header row, the rest are data rows.
# Passing the header as `columns=` also works when no data rows were collected.
df_result = pd.DataFrame(resultados[1:], columns=resultados[0])

# The context manager saves and closes the workbook on exit, replacing the
# explicit writer.save() call that no longer exists in pandas 2.x.
with pd.ExcelWriter('/dbfs/FileStore/tables/resultados_modelos.xlsx', engine='xlsxwriter') as writer:
    df_result.to_excel(writer, sheet_name='Sheet1', index=False)

### Fit Model

In [33]:
# Select the best model: highest mean cross-validation score first
df_result = df_result.sort_values(by='mean', ascending=False)
model_selected = df_result['model'].iloc[:1].tolist()
# model_selected[0]

# Fit the model
# NOTE(review): `model_selected` is not used below; the regressor is hard-coded.
# for item in modelos:
#     if str(item).find(model_selected[0]) > 0:
#         model_result = item.fit(X=df_ms,y=Y)

regressor = ensemble.GradientBoostingRegressor()
model_result = regressor.fit(X=df_ms,y=Y)

In [34]:
# Adjust for next year: same features with `year` shifted by one
df_mp = df_ms.copy()
df_mp['year'] = df_mp['year']+1

# Predictions converted to int, stored next to the observed entries from df_v2
predicted_entries = model_result.predict(X=df_mp)
df_mp['entries_new'] = [int(value) for value in predicted_entries]
df_mp['entries'] = df_v2['entries']

# accuracy_score is presumably sklearn.metrics.accuracy_score — TODO confirm;
# it is called once with its default and once with normalize=False.
score_default = accuracy_score(df_mp['entries_new'], df_mp['entries'])
score_not_normalized = accuracy_score(df_mp['entries_new'], df_mp['entries'], normalize=False)

print('accuracy_score:\nnormalize_true = {:f} \nnormalize_false = {:f}'.format(
    score_default,
    score_not_normalized
    ))