# TCC 2
* Aluno: Paulo Henrique Costa Gontijo
* Matrícula: 15/0143800


In [0]:
root_folder = 'abfss://kaggle@stadlsgen2tcc.dfs.core.windows.net/'

In [0]:
dbutils.fs.ls(root_folder)

## Importação das Bibliotecas

In [0]:
import os
import pandas as pd

file_name = 'test.jsonl/'
file_path = ''.join([root_folder, file_name])
file_path

## Configurações de Cluster

In [0]:
from pyspark import SparkConf

# Print the (key, value) parameters held by a newly constructed SparkConf
print(SparkConf().getAll())

# Restrict driver logging to FATAL messages to keep the notebook output readable.
# NOTE(review): `spark` is not created in this notebook; presumably it is injected
# by the notebook runtime — confirm when running elsewhere.
spark.sparkContext.setLogLevel("FATAL")

* Conferindo o instanciamento

In [0]:
spark

* Leitura do arquivo disponibilizado na competição Kaggle ([link](https://www.kaggle.com/competitions/otto-recommender-system/overview/)).

In [0]:
# Read the competition file from the data lake container and cache it.
# The path is derived from `root_folder` so the storage location is declared once;
# `.csv(...)` already selects the CSV source, so no separate `.format("csv")` is needed.
# NOTE(review): the steps below call F.explode('events'), which needs an array/map
# column, while a CSV read only produces flat columns — confirm that this CSV is
# really the intended input (the competition test set is read as JSON lines later).
raw_csv_path = ''.join([root_folder, 'teste_csv.csv'])
df_raw = spark.read \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .csv(raw_csv_path).persist()
df_raw.columns

* Conferindo número de partições e conteúdo

In [0]:
print(df_raw.rdd.getNumPartitions())

In [0]:
df_raw.display()

* Contagem de linhas em arquivo bruto

In [0]:
df_raw.count()

In [0]:
df_raw.printSchema()

* Processamento de Json para formato tabular

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [0]:
def explode_df_json_to_tabular(df_raw):
    """Flatten the nested ``events`` column into one row per event.

    Each element of ``events`` becomes its own row. The ``aid``, ``ts`` and
    ``type`` fields of the element are promoted to top-level columns, with
    ``session`` and ``aid`` cast to integers. The helper column and the
    original ``events`` column are removed from the result.

    Args:
        df_raw: DataFrame with a ``session`` column and an ``events`` column
            whose elements expose ``aid``, ``ts`` and ``type`` fields.

    Returns:
        DataFrame with ``session``, ``aid``, ``ts`` and ``type`` columns
        (plus any other columns already present in ``df_raw``).
    """
    int_type = T.IntegerType()

    # One output row per element of the `events` collection.
    exploded = df_raw.withColumn('events_explode', F.explode('events'))

    flattened = (
        exploded
        .withColumn('session', F.col('session').cast(int_type))
        .withColumn('aid', F.col('events_explode.aid').cast(int_type))
        .withColumn('ts', F.col('events_explode.ts'))
        .withColumn('type', F.col('events_explode.type'))
    )
    return flattened.drop('events', 'events_explode')

In [0]:
df_explode = explode_df_json_to_tabular(df_raw)
df_explode.show(10)

* Contagem de linhas após o tratamento

In [0]:
df_explode.count()

* Schema pós tratamento

In [0]:
df_explode.printSchema()

* Categorização de coluna alvo string

In [0]:
from pyspark.ml.feature import StringIndexer

def transform_indexer(df_explode):
    """Encode the event ``type`` as an integer and shape the frame for ALS.

    Steps, in order:
      1. Fit a ``StringIndexer`` on ``type`` and write the index to ``type_cat``.
      2. Cast ``type_cat`` to integer and drop the original ``type`` column.
      3. Rename ``session`` to ``userCol``.
      4. Remove fully duplicated rows (``ts`` is still part of the row here).
      5. Drop ``ts``.

    Because de-duplication runs before ``ts`` is dropped, rows that differ
    only by timestamp are kept and may end up identical in the result.

    NOTE(review): the indexer is fitted on whichever frame is passed in, so
    the label -> index mapping depends on that frame's contents.

    Args:
        df_explode: DataFrame with ``session``, ``aid``, ``ts`` and ``type``.

    Returns:
        DataFrame with ``userCol``, ``aid`` and ``type_cat`` columns.
    """
    type_indexer = StringIndexer(inputCol='type', outputCol='type_cat').fit(df_explode)
    with_category = type_indexer.transform(df_explode)

    prepared = (
        with_category
        .withColumn('type_cat', F.col('type_cat').cast(T.IntegerType()))
        .drop('type')
        .withColumnRenamed('session', 'userCol')
        .dropDuplicates()
    )
    return prepared.drop('ts')

In [0]:
df_indexed = transform_indexer(df_explode).persist()
df_indexed.show(10)

* Conferência de schema pós tratativas

In [0]:
df_indexed.printSchema()

## Análise de Clientes

* Quantidade de clientes únicos

In [0]:
df_indexed.select('userCol').distinct().count()

## Análise de Produtos

* Quantidade de produtos únicos

In [0]:
df_indexed.select('aid').distinct().count()

# Escrita de DataFrame tratado

In [0]:
# Persist the treated DataFrame as Parquet in a 'refined-dataset' folder under
# the current working directory. coalesce(20) caps the number of partitions
# written, and "overwrite" replaces any output left by a previous run.
df_indexed.coalesce(20)\
          .write.format('parquet')\
          .mode("overwrite")\
          .save('/'.join([os.getcwd(), 'refined-dataset']))

In [0]:
# Release the cached intermediates now that the refined dataset is on disk.
# df_raw and df_indexed were persisted earlier; df_explode was never persisted
# in this notebook, so its call is only a harmless safeguard.
df_raw.unpersist()
df_explode.unpersist()
df_indexed.unpersist()

# Lendo o dataset refinado

In [0]:
# Load the refined dataset from exactly the location the write step used
# (<cwd>/refined-dataset). A bare relative path may resolve against a different
# base directory, depending on the Spark file system configuration.
refined_path = '/'.join([os.getcwd(), 'refined-dataset'])
df_refined = spark.read.format('parquet').load(refined_path).persist()
df_refined.show()

# Filtrando Dataset de Treino para Clientes que Compraram (filtro atualmente desativado: todo o dataset é utilizado)

In [0]:
# df_pre_moodel = df_refined.select(df_refined.columns).where(F.col('type_cat') == 2)
df_pre_moodel = df_refined
df_pre_moodel.show()

In [0]:
df_pre_moodel.select('userCol').distinct().count()

In [0]:
df_pre_moodel.count()

### Adicionar pipeline de hiperparametrização
* http://restanalytics.com/2019-02-27-Hyperparameter-Tuning-Alternating-Least-Squares-Recommender-System/

In [0]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# (training, test) = df_pre_moodel.randomSplit([0.6, 0.4])
training = df_pre_moodel

In [0]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy='drop' removes rows with NaN predictions so that evaluation
# metrics are not NaN for users/items unseen during training.
# A fixed seed makes the random factor initialisation, and therefore the
# resulting recommendations, reproducible across runs.
# NOTE(review): checkpointInterval only takes effect when a checkpoint directory
# is set on the SparkContext; none is set in this notebook — confirm.
# NOTE(review): ratingCol is the StringIndexer output, so one event type is
# encoded as 0; with implicitPrefs=True a zero rating may carry no preference
# signal — verify that this is intended.
als = ALS(rank=10,
          maxIter=20,
          regParam=0.01,
          userCol="userCol",
          itemCol="aid",
          ratingCol="type_cat",
          coldStartStrategy="drop",
          implicitPrefs=True,
          checkpointInterval=1,
          seed=42)
model = als.fit(training)

In [0]:
# # Evaluate the model by computing the RMSE on the test data
# predictions = model.transform(test)
# evaluator = RegressionEvaluator(metricName="rmse", labelCol="type_cat",
#                                 predictionCol="prediction")
# rmse = evaluator.evaluate(predictions)
# print("Root-mean-square error = " + str(rmse))

# Utilizando dataset de teste da competição

In [0]:
file_name = 'test.jsonl'
file_path = '/'.join([os.getcwd(), file_name])

In [0]:
# Read the competition test set (JSON Lines) and cache it.
# `.json(...)` already selects the JSON source and always infers the schema;
# "inferSchema" and "header" are CSV reader options, so they are not passed here.
df_test = spark.read.json(file_path).persist()

In [0]:
df_test.show()

In [0]:
df_test.count()

# Tratamento de subset para recomendações

In [0]:
df_pre_transform = explode_df_json_to_tabular(df_test)
df_pre_transform.count()

In [0]:
df_subset = transform_indexer(df_pre_transform).select('userCol').distinct()
df_subset.count()

In [0]:
# Number of distinct users from the test subset that also occur in the training data.
(
    df_pre_moodel
    .join(df_subset, on='userCol', how='inner')
    .select('userCol')
    .distinct()
    .count()
)

In [0]:
df_recommend = model.recommendForUserSubset(df_subset, 20)
df_recommend.printSchema()

# Escrevendo Dataset de Recomendações em JSON

In [0]:
df_recommend.write.format('json')\
                  .mode("overwrite")\
                  .save('/'.join([os.getcwd(), 'raw_predictions.json']))

In [0]:
df_recommend.unpersist()

# Leitura de Dataset RAW_PREDICTIONS
Nesta etapa será realizada a conversão do objeto JSON em formato tabular, para se adequar ao formato de submissão da competição.

In [0]:
# Load the raw predictions from exactly the location the write step used
# (<cwd>/raw_predictions.json) instead of a './' relative path, which may
# resolve against a different base directory.
raw_predictions_path = '/'.join([os.getcwd(), 'raw_predictions.json'])
df_recommend_json = spark.read.format('json').load(raw_predictions_path).persist()
df_recommend_json.printSchema()

In [0]:
df_recommend_json.show(10, truncate=False)

* Tratamento JSON->Tabular

In [0]:
# Turn the nested recommendations into one row per recommended item:
# explode the `recommendations` array, keep the item id as an integer `aid`,
# then discard both the helper column and the original array.
df_recommend_json_exploded = (
    df_recommend_json
    .withColumn('recommendations_explode', F.explode('recommendations'))
    .withColumn('aid', F.col('recommendations_explode.aid').cast(T.IntegerType()))
    .drop('recommendations_explode', 'recommendations')
)
df_recommend_json_exploded.printSchema()

In [0]:
df_recommend_json_exploded.show(10)

# Agrupando top 20 itens por usuário

In [0]:
# Collect the recommended item ids of each user into a single `labels` list.
# userCol is cast to string first so that the '_clicks' / '_carts' / '_orders'
# suffixes can be appended to it in the following cells.
# NOTE(review): collect_list after a groupby does not guarantee element order,
# so `labels` may not follow the model's ranking — confirm whether order matters.
df_submission_raw = df_recommend_json_exploded.withColumn('userCol', F.col('userCol').cast(T.StringType()))\
                                                .groupby('userCol').agg(F.collect_list('aid').alias("labels")).persist()
# Rename the key column to the header used by the submission file.
df_submission_raw = df_submission_raw.withColumnRenamed('userCol', 'session_type')
df_submission_raw.show(10, truncate=False)

In [0]:
df_submission_raw.count()

# Criando dataset no formato de submissão

In [0]:
users_list_raw = [r[0] for r in df_submission_raw.select('session_type').toLocalIterator()]
users_list_raw[:5]

In [0]:
users_list_clicks = [s + '_clicks' for s in users_list_raw]
users_list_clicks[:5]

In [0]:
users_list_carts = [s + '_carts' for s in users_list_raw]
users_list_carts[:5]

In [0]:
users_list_orders = [s + '_orders' for s in users_list_raw]
users_list_orders[:5]

In [0]:
labels_list = [r[0] for r in df_submission_raw.select('labels').toLocalIterator()]
labels_list[:5]

In [0]:
# Build the '<session>_clicks' rows directly in Spark. Each session keeps its own
# labels within the same row, instead of pairing two driver-side lists by position
# (which relies on both iterations returning rows in the same order).
df_clicks = df_submission_raw.select(
    F.concat(F.col('session_type'), F.lit('_clicks')).alias('session_type'),
    'labels',
)
df_clicks.show(5)

In [0]:
# Build the '<session>_carts' rows directly in Spark. Each session keeps its own
# labels within the same row, instead of pairing two driver-side lists by position
# (which relies on both iterations returning rows in the same order).
df_carts = df_submission_raw.select(
    F.concat(F.col('session_type'), F.lit('_carts')).alias('session_type'),
    'labels',
)
df_carts.show(5)

In [0]:
# Build the '<session>_orders' rows directly in Spark. Each session keeps its own
# labels within the same row, instead of pairing two driver-side lists by position
# (which relies on both iterations returning rows in the same order).
df_orders = df_submission_raw.select(
    F.concat(F.col('session_type'), F.lit('_orders')).alias('session_type'),
    'labels',
)
df_orders.show(5)

In [0]:
from functools import reduce
from pyspark.sql import DataFrame

# Stack the clicks / carts / orders frames (same column layout) into one frame.
# DataFrame.union replaces the deprecated unionAll alias.
dfs = [df_clicks, df_carts, df_orders]

# concat_ws operates on strings, so the item ids are cast to an array of strings
# explicitly instead of relying on implicit type coercion. persist() is applied
# to the final frame so that the cached data is the one reused by the count and
# write steps that follow.
df_submission = reduce(DataFrame.union, dfs)\
    .withColumn("labels", F.concat_ws(" ", F.col("labels").cast(T.ArrayType(T.StringType()))))\
    .persist()
df_submission.show()

In [0]:
df_submission.count()

# Escrevendo dataset de submissão

In [0]:
# Write the submission as CSV with a header row into a 'submission' folder under
# the current working directory, replacing any previous output. Rows are sorted
# by session_type, and coalesce(1) funnels the data into a single partition.
# NOTE(review): presumably the single partition is meant to yield one CSV part
# file for upload — confirm.
df_submission.orderBy('session_type').coalesce(1)\
                                     .write.format('csv')\
                                     .mode("overwrite")\
                                     .option("header", True)\
                                     .save('/'.join([os.getcwd(), 'submission']))

# Referências Bibliográficas

> https://github.com/Kaggle/kaggle-api, acessado em 24/01/2023.

> https://www.youtube.com/watch?v=aBNQzWV_UmE, acessado em 24/01/2023.

> https://databricks-prod-cloudfront.cloud.databricks.com/public/4027ec902e239c93eaaa8714f173bcfc/968100988546031/157591980591166/8836542754149149/latest.html, acessado em 25/01/2023.

> https://blog.clairvoyantsoft.com/apache-spark-out-of-memory-issue-b63c7987fff, acessado em 25/01/2023.

> https://spark.apache.org/docs/latest/api/python/getting_started/quickstart_ps.html, acessado em 26/01/2023.

> https://docs.fast.ai/collab.html, acessado em 24/01/2023.

> https://course.fast.ai/Lessons/lesson7.html, acessado em 24/01/2023.

> https://github.com/fastai/fastbook/blob/master/08_collab.ipynb, acessado em 24/01/2023.

> https://www.kaggle.com/code/vchulski/tutorial-collaborative-filtering-with-pyspark

> https://medium.com/analytics-vidhya/crafting-recommendation-engine-in-pyspark-a7ca242ad40a

> https://www.freecodecamp.org/news/8-clustering-algorithms-in-machine-learning-that-all-data-scientists-should-know/

> https://www.youtube.com/watch?v=fpqa0_U4zb4

> https://www.youtube.com/watch?v=58OjaDH2FI0&t=742s

> https://spark.apache.org/docs/latest/ml-collaborative-filtering.html

> https://spark.apache.org/docs/latest/configuration.html#memory-management