In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import col, log, row_number

# Get (or create) the Spark session for this notebook with explicit
# driver/executor sizing.
spark = (
    SparkSession.builder
    .appName("Product Recommendation")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "4")
    .getOrCreate()
)

# Load the Parquet dataset from the preprocess directory and print its schema.
df = spark.read.format("parquet").load(
    "/home/m1nhd3n/Works/DataEngineer/product_recommendations/data/preprocess/created_target_col"
)
df.printSchema()

25/02/21 08:11:57 WARN Utils: Your hostname, m1nhd3n resolves to a loopback address: 127.0.1.1; using 192.168.1.10 instead (on interface wlp0s20f3)
25/02/21 08:11:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/21 08:11:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- ncodpers: integer (nullable = true)
 |-- month_idx: integer (nullable = true)
 |-- product_vec: vector (nullable = true)
 |-- feat_vec: vector (nullable = true)
 |-- prev_product_vec: vector (nullable = true)
 |-- new_product_vec: vector (nullable = true)
 |-- target_product_vec: vector (nullable = true)



In [2]:
# Preview the loaded data (defaults spelled out: 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+--------+---------+--------------------+--------------------+--------------------+---------------+------------------+
|ncodpers|month_idx|         product_vec|            feat_vec|    prev_product_vec|new_product_vec|target_product_vec|
+--------+---------+--------------------+--------------------+--------------------+---------------+------------------+
|   15951|        1|(24,[2,15,18],[1....|(17,[0,5,10,12,14...|                NULL|           NULL|              NULL|
|   15951|        2|(24,[15,18],[1.0,...|(17,[0,5,10,12,14...|(24,[2,15,18],[1....|           NULL|    (24,[2],[1.0])|
|   15951|        3|(24,[2,15,18],[1....|(17,[0,5,10,12,14...|(24,[15,18],[1.0,...| (24,[2],[1.0])|              NULL|
|   15951|        4|(24,[2,15,18],[1....|(17,[0,5,10,12,14...|(24,[2,15,18],[1....|           NULL|              NULL|
|   15951|        5|(24,[2,15,18],[1....|(17,[0,5,10,12,14...|(24,[2,15,18],[1....|           NULL|              NULL|
|   15951|        6|(24,[2,15,18],[1....|(17,[0,

In [14]:
# Hold-out split: month 16 rows that have both a target vector and a
# previous-month product vector.
test_df = df.filter(
    (col("month_idx") == 16)
    & col("target_product_vec").isNotNull()
    & col("prev_product_vec").isNotNull()
)

# Simplest way: Simple machine learning
We will use both last month's product status (`product_vec`) and the status from the month before that (`prev_product_vec`).

In [11]:
# Training split: every month except 16 and 17, restricted to rows that have
# both a target vector and a previous-month product vector.
train_df = df.filter(
    ~col("month_idx").isin(16, 17)
    & col("target_product_vec").isNotNull()
    & col("prev_product_vec").isNotNull()
)

In [15]:
# Preview the hold-out split (defaults spelled out: 20 rows, long cells truncated).
test_df.show(n=20, truncate=True)

+--------+---------+--------------------+--------------------+--------------------+---------------+--------------------+
|ncodpers|month_idx|         product_vec|            feat_vec|    prev_product_vec|new_product_vec|  target_product_vec|
+--------+---------+--------------------+--------------------+--------------------+---------------+--------------------+
|   16705|       16|(24,[2,7,17,23],[...|(17,[5,10,12,14,1...|(24,[2,7,17,18,23...|           NULL|     (24,[18],[1.0])|
|   16731|       16|(24,[2,11,12],[1....|(17,[3,4,5,10,12,...|(24,[2,11,20,23],...|(24,[12],[1.0])|     (24,[23],[1.0])|
|   17151|       16|(24,[4,7,14,17,19...|(17,[0,4,5,10,12,...|(24,[4,7,14,17,19...|           NULL|     (24,[18],[1.0])|
|   17735|       16|(24,[2,4,7,8,12,2...|(17,[3,5,10,12,14...|(24,[2,4,7,8,12,2...|           NULL|(24,[21,22],[1.0,...|
|   18204|       16|(24,[2,7,17,19],[...|(17,[5,10,12,14,1...|(24,[7,17,19,23],...| (24,[2],[1.0])|     (24,[23],[1.0])|
|   18714|       16|      (24,[2

In [16]:
# Preview the training split (defaults spelled out: 20 rows, long cells truncated).
train_df.show(n=20, truncate=True)

+--------+---------+--------------------+--------------------+--------------------+---------------+--------------------+
|ncodpers|month_idx|         product_vec|            feat_vec|    prev_product_vec|new_product_vec|  target_product_vec|
+--------+---------+--------------------+--------------------+--------------------+---------------+--------------------+
|   15951|        2|(24,[15,18],[1.0,...|(17,[0,5,10,12,14...|(24,[2,15,18],[1....|           NULL|      (24,[2],[1.0])|
|   16100|        5|(24,[8,12,18,23],...|(17,[5,10,12,14,1...|(24,[2,8,12,18,23...|           NULL|      (24,[2],[1.0])|
|   16152|        2|(24,[4,17,18,23],...|(17,[5,10,12,14,1...|(24,[4,17,18,21,2...|           NULL|(24,[21,22],[1.0,...|
|   16152|       14|(24,[4,17,18,23],...|(17,[5,10,12,14,1...|(24,[4,17,18,21,2...|           NULL|(24,[21,22],[1.0,...|
|   16193|        2|(24,[8,12,18,23],...|(17,[0,4,5,10,12,...|(24,[8,12,18,23],...|           NULL|      (24,[4],[1.0])|
|   16193|        5|(24,[8,12,18

In [17]:
# Number of rows in the month-16 hold-out split.
test_df.count()

25738

In [18]:
# Number of rows in the training split (months 16 and 17 excluded).
train_df.count()

360214

In [19]:
# Persist both splits as Parquet.
# The writer's default save mode raises if the target path already exists, so
# this cell would fail on any re-run of the notebook once the outputs have been
# written. An explicit "overwrite" mode keeps the cell re-runnable.
# NOTE: this replaces whatever is currently stored at these paths.
test_df.write.mode("overwrite").parquet("/home/m1nhd3n/Works/DataEngineer/product_recommendations/data/preprocess/test_df_include_prev_month")
train_df.write.mode("overwrite").parquet("/home/m1nhd3n/Works/DataEngineer/product_recommendations/data/preprocess/train_df_include_prev_month")

                                                                                