## BASKET ANALYSIS

In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from datetime import date
from datetime import time
import datetime as dt

import mlxtend.frequent_patterns 
import mlxtend.preprocessing

In [37]:
# Read the four CSV files under data/ into one DataFrame each, in the same
# order as the target names on the left-hand side.
order_detail_df, user_df, listening_detail_df, clustering_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/order_processed.csv",
        "data/user_processed.csv",
        "data/listening_processed.csv",
        "data/clustering_processed.csv",
    )
)

#### Xử lý kiểu dữ liệu

In [45]:
##### Rename the columns
# Positional rename: the n-th name below replaces the n-th existing column.
listening_detail_df.columns = [
    "PlaylistID",
    "Playlist Name",
    "Category",
    "SubCategory",
    "PlaylistType",
    "PlaylistDuration(min)",
    "ActualDuration(min)",
    "UserID",
    "ListeningTime",
    "ListeningDate",
    "ListeningDatetime",
]

In [58]:
# Parse ListeningDate into a datetime64[ns] column. pd.to_datetime is used
# instead of .astype("datetime64") because pandas 2.x raises TypeError for the
# unit-less "datetime64" dtype string.
listening_detail_df["ListeningDate"] = pd.to_datetime(listening_detail_df["ListeningDate"])
# Store playlist IDs with object dtype.
listening_detail_df["PlaylistID"] = listening_detail_df["PlaylistID"].astype("object")

In [59]:
# Display the dtype of every column to check the conversions above.
listening_detail_df.dtypes

PlaylistID                       object
Playlist Name                    object
Category                         object
SubCategory                      object
PlaylistType                     object
PlaylistDuration(min)           float64
ActualDuration(min)             float64
UserID                            int64
ListeningTime                    object
ListeningDate            datetime64[ns]
ListeningDatetime                object
dtype: object

In [None]:
##### Convert to datetime
# pd.to_datetime replaces .astype("datetime64"): pandas 2.x raises TypeError for
# the unit-less "datetime64" dtype string.
order_detail_df["Transaction Date"] = pd.to_datetime(order_detail_df["Transaction Date"])

#### Xử lý outliers của listening_detail_df
1. Drop những playlist có duration ngắn
2. Xử lý các megahit và các playlist có frequency quá thấp ==> vẽ phân phối frequency để nhìn thấy các outliers

#### Function definitions

In [None]:
def frequency_items(x, y, baskets=None):
    """Print support, confidence, lift and conviction for the rule ``x -> y``.

    Args:
        x: Antecedent item.
        y: Consequent item.
        baskets: Sized, re-iterable collection of transactions, each one a
            container of items. When omitted, the module-level ``my_basket``
            is used (the original behaviour).

    Returns:
        None. The four metrics are printed, rounded to two decimals.

    Notes:
        Metrics that are mathematically undefined (no baskets at all, or
        ``x`` / ``y`` never occurs) are printed as ``nan`` instead of raising
        ``ZeroDivisionError``. A rule with confidence 1 is reported with
        conviction ``inf``, the convention mlxtend uses for that case.
    """
    if baskets is None:
        baskets = my_basket

    nan = float("nan")
    total = len(baskets)
    count_x = sum(x in basket for basket in baskets)
    count_y = sum(y in basket for basket in baskets)
    count_xy = sum(x in basket and y in basket for basket in baskets)

    support = count_xy / total if total else nan
    # confidence = P(x and y) / P(x); the basket count cancels out.
    confidence = count_xy / count_x if count_x else nan
    lift = confidence / (count_y / total) if count_y else nan

    if confidence == 1:
        # The denominator (1 - confidence) is zero: conviction is unbounded.
        conviction = float("inf")
    elif total:
        conviction = (1 - count_y / total) / (1 - confidence)
    else:
        conviction = nan

    print("Support = {}".format(round(support, 2)))
    print("Confidence = {}".format(round(confidence, 2)))
    print("Lift= {}".format(round(lift, 2)))
    print("Conviction={}".format(round(conviction, 2)))

def frequency_item(x, df):
    """Return the share of listening sessions in ``df`` that contain item ``x``.

    ``df`` is iterated once and each element is tested with ``x in element``;
    the number of hits is divided by ``len(df)``.
    """
    hits = sum(1 for session in df if x in session)
    return hits / len(df)

### Lấy danh sách các playlists theo transaction
1. Một transaction: danh sách playlists mà users nghe trong vòng 15 ngày
2. playlist_basket: chứa các playlist theo transaction 15 ngày

##### 1. Tính danh sách nghe mỗi 15 ngày của các users

In [190]:
def get_playlists_in_timewindow(user_id, timewindow=15, listening_df=None):
    """Split one user's listening history into consecutive fixed-length baskets.

    The user's history is cut into back-to-back windows of ``timewindow`` days,
    starting at the user's first listening date. Each window is half-open
    (``start <= date < end``), so a listen that falls exactly on a boundary
    belongs to one basket only. Windows without any listens are skipped.

    Args:
        user_id: Value matched against the ``UserID`` column.
        timewindow: Window length in days; must be positive.
        listening_df: DataFrame with ``UserID``, ``ListeningDate`` and
            ``PlaylistID`` columns. Defaults to the module-level
            ``listening_detail_df``.

    Returns:
        A list of baskets in chronological order; each basket is the list of
        ``PlaylistID`` values listened to within one window. Empty when the
        user has no rows.

    Raises:
        ValueError: If ``timewindow`` is not positive.
    """
    if timewindow <= 0:
        raise ValueError("timewindow must be a positive number of days")
    if listening_df is None:
        listening_df = listening_detail_df

    user_rows = listening_df[listening_df["UserID"] == user_id]
    if user_rows.empty:
        return []

    # Work on whole days so that a time-of-day component cannot push a listen
    # across a window boundary.
    listen_days = pd.to_datetime(user_rows["ListeningDate"]).dt.normalize()

    # Index of the window each listen falls into: 0 for the first `timewindow`
    # days, 1 for the next, and so on.
    window_index = (listen_days - listen_days.min()).dt.days // timewindow

    # Group by a plain array (not a Series) so that a non-unique row index
    # cannot interfere with alignment. groupby sorts the window indices, which
    # keeps the baskets chronological, and drops rows whose date is missing.
    grouped = user_rows["PlaylistID"].groupby(window_index.to_numpy())
    return [playlists.tolist() for _, playlists in grouped]

In [191]:
# Collect the baskets of several users into one flat list of transactions.
playlist_basket = []
# NOTE(review): listening_summary_df is not defined in the visible part of this
# notebook — confirm it is created before this cell runs.
users = listening_summary_df["UserID"].tolist()
# Only the first 10 users in the list are processed here.
for user in users[:10]:
    user_basket = get_playlists_in_timewindow(user)
    # Append this user's baskets to the overall list.
    playlist_basket += user_basket

In [186]:
# Add two hand-written baskets to the collected transactions, then display them.
user_basket2 = [
    [3907, 3907],
    [3909, 3908],
]
playlist_basket.extend(user_basket2)
user_basket2

[[3907, 3907], [3909, 3908]]

#### Basket Analysis on listening detail

In [None]:
# One-hot encode the transactions collected in playlist_basket. The original
# cell passed `data`, a name that is never defined in this notebook.
encode_ = mlxtend.preprocessing.TransactionEncoder()
encode_arr = encode_.fit_transform(playlist_basket)
## 3. Converting to dataframe.

# Label each encoded column with the item the encoder assigned to it.
encode_df = pd.DataFrame(encode_arr, columns=encode_.columns_)
encode_df

In [None]:
## 4. Calculating support.

# First run: library defaults for every option.
md = mlxtend.frequent_patterns.apriori(encode_df)

# Second run: keep itemsets with support >= 0.01 and label them with the
# column names of encode_df.
md_minsup = mlxtend.frequent_patterns.apriori(
    encode_df,
    min_support=0.01,
    use_colnames=True,
)
md_minsup.head(20)

In [None]:
## 5. Creating rules (Metric: Confidence) Antecedents ⇒ Consequents

# Derive rules from the frequent itemsets, filtering on confidence >= 0.06.
rules = mlxtend.frequent_patterns.association_rules(
    md_minsup,
    metric="confidence",
    min_threshold=0.06,
    support_only=False,
)

rules.head(20)

In [None]:
## 6.Creating rules (Metric: Lift) Antecedents ⇒ Consequents

# Same itemsets as above, but filtered on lift >= 0.06.
# NOTE(review): 0.06 matches the confidence threshold used earlier; confirm it
# is the intended cut-off for lift.
rules2 = mlxtend.frequent_patterns.association_rules(
    md_minsup,
    metric="lift",
    min_threshold=0.06,
    support_only=False,
)

rules2.head(20)

In [None]:
## 7. Scatter plot
## Scatter plots help us to evaluate general tendencies of rules between antecedents and consequents

import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt 

# Scatter plot of the confidence-based rules: support on x, confidence on y,
# marker size driven by lift.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=rules,
    x="support",
    y="confidence",
    size="lift",
)
plt.margins(0.01, 0.01)
plt.show()

## Conclusion

In this post, we have had a glimpse into what Affinity Analysis is and how to implement it in Python.
Affinity Analysis or Market Basket Analysis is used to extract valuable insights from transaction data. It can be used to determine what products to discount. Also, it can increase sales and customer satisfaction. It is important to realize that there are many other areas in which it can be applied.