In [1]:
import pandas as pd
from pandas import get_dummies
import numpy as np
import logging

FORMAT = "[%(asctime)s] - [%(levelname)s] - [%(funcName)s] - %(message)s"
logging.basicConfig(level=20, format=FORMAT)


In [5]:
df = pd.read_csv("BlackFriday.csv")
df.drop(["Product_Category_1", "Product_Category_2", "Product_Category_3"],axis=1,inplace=True,)
purchase_df = (df[["User_ID", "Product_ID"]].groupby("User_ID")["Product_ID"].apply(list))
print(purchase_df)

User_ID
1000001    [P00069042, P00248942, P00087842, P00085442, P...
1000002    [P00285442, P00112842, P00293242, P00289342, P...
1000003    [P00193542, P00132842, P0098342, P00010242, P0...
1000004    [P00184942, P00346142, P0097242, P00046742, P0...
1000005    [P00274942, P00251242, P00014542, P00031342, P...
1000006    [P00231342, P00190242, P0096642, P00058442, P0...
1000007    [P00036842, P00046742, P00181842, P00117942, P...
1000008    [P00249542, P00220442, P00156442, P00213742, P...
1000009    [P00135742, P00039942, P00161442, P00078742, P...
1000010    [P00085942, P00118742, P00297942, P00266842, P...
1000011    [P00192642, P00110842, P00189642, P00265242, P...
1000012    [P00304242, P00365242, P00080342, P00076742, P...
1000013    [P00129542, P00140742, P00182342, P00034042, P...
1000014    [P00276642, P00265242, P00274942, P00220442, P...
1000015    [P00334242, P00247542, P00338442, P00275142, P...
1000016    [P00244242, P00217742, P00260142, P00248142, P...
1000017    [P000

In [4]:
def get_data():
    df = pd.read_csv("BlackFriday.csv")
    logging.info("Raw data loaded")
    #Dropping product category columns
    df.drop(
        ["Product_Category_1", "Product_Category_2", "Product_Category_3"],
        axis=1,
        inplace=True,
    )

    purchase_df = (
        df[["User_ID", "Product_ID"]].groupby("User_ID")["Product_ID"].apply(list)
    )
    #Dropping product id
    df.drop("Product_ID", axis=1, inplace=True)
    #Casting Occupation column to a string (it was originally an int)
    df["Occupation"] = df["Occupation"].astype(str)
    #Selecting everything except last column which is purchase
    gbc = list(df.columns[:-1])
    grouped = df.groupby(gbc).sum().reset_index()
    print('grouped', grouped)
    grouped["Purchase"] = grouped.Purchase / max(grouped.Purchase)

    dummies = get_dummies(grouped)
    #print("dummies:",dummies)

    dummies.index = dummies.User_ID
    #print("dummies index:",dummies.index)
    dummies.drop("User_ID", inplace=True, axis=1)

    logging.info("Data cleaned")

    return purchase_df, dummies


def get_vectors(df):
    logging.info("Collecting vectors")
    vectors = {}
    for i, uid in enumerate(df.index):
        vectors[uid] = np.array(df.iloc[i])
    logging.info("Vectors ready")
    return vectors


def main():
    purchase_df, user_df = get_data()
    vectors = get_vectors(user_df)

    print("User: 1000001 -> \n", vectors[1000001])
    print("Purchases:")
    for item in purchase_df.loc[1000001]:
        print(item)


if __name__ == "__main__":
    main()

[2019-06-09 09:39:43,765] - [INFO] - [get_data] - Raw data loaded
[2019-06-09 09:39:46,848] - [INFO] - [get_data] - Data cleaned
[2019-06-09 09:39:46,850] - [INFO] - [get_vectors] - Collecting vectors


grouped       User_ID Gender    Age Occupation City_Category  \
0     1000001      F   0-17         10             A   
1     1000002      M    55+         16             C   
2     1000003      M  26-35         15             A   
3     1000004      M  46-50          7             B   
4     1000005      M  26-35         20             A   
5     1000006      F  51-55          9             A   
6     1000007      M  36-45          1             B   
7     1000008      M  26-35         12             C   
8     1000009      M  26-35         17             C   
9     1000010      F  36-45          1             B   
10    1000011      F  26-35          1             C   
11    1000012      M  26-35         12             C   
12    1000013      M  46-50          1             C   
13    1000014      M  36-45          0             C   
14    1000015      M  26-35          7             A   
15    1000016      F  36-45          0             C   
16    1000017      M  51-55          1  

[2019-06-09 09:39:50,263] - [INFO] - [get_vectors] - Vectors ready


User: 1000001 -> 
 [0.         0.03164922 1.         0.         1.         0.
 0.         0.         0.         0.         0.         0.
 0.         1.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         1.         0.         0.         0.
 0.         1.         0.         0.        ]
Purchases:
P00069042
P00248942
P00087842
P00085442
P00085942
P00102642
P00110842
P00004842
P00117942
P00258742
P00142242
P00000142
P00297042
P00059442
P0096542
P00184942
P00051842
P00214842
P00165942
P00111842
P00178242
P00178342
P00183942
P00051442
P00248442
P00210342
P00289942
P0097142
P00255842
P00025442
P00074142
P00058142
P00220642
P00064042
