***
# **Libraries**
***

#### Import needed common libraries

In [36]:
import pandas as pd 
import numpy as np 

import os
# Show what the datasets folder contains before connecting to anything.
dataset_dir_entries = os.listdir('../BD00-Datasets')
print(dataset_dir_entries)
#pd.set_option('display.float', '{:.5f}'.format)

['csv-datasets', 'db-datasets', 'New folder', '__MACOSX']


***
# **Data retrieval**
***

#### Construct helper function for data connection and queries

In [37]:
import sqlite3 as sql
#conn = sqlite3.connect("../BD00-Datasets/db-datasets/chinook.db")
#c = conn.cursor()
def run_query(q, params=None, db_path="../BD00-Datasets/db-datasets/chinook.db"):
    """Run a SQL query against a SQLite database and return the rows as a DataFrame.

    Parameters
    ----------
    q : str
        SQL statement to execute.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``q``; forwarded to
        ``pandas.read_sql_query``. Defaults to ``None`` (no placeholders).
    db_path : str, optional
        Path of the SQLite database file. Defaults to the Chinook database
        used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The result set of the query.
    """
    from contextlib import closing

    # A sqlite3 connection used directly as a context manager only commits or
    # rolls back the transaction; it does not close the connection. Wrapping
    # it in closing() releases the handle once the query has been read.
    with closing(sql.connect(db_path)) as conn:
        return pd.read_sql_query(q, conn, params=params)

***
# **Data inspections**
***
#### View data structures
#### Assemble the needed data into a single dataset

In [38]:
# List every table in the database by reading SQLite's schema catalogue.
table_listing_sql = """SELECT Name,type FROM sqlite_master WHERE type='table'"""
tables = run_query(table_listing_sql)
tables

Unnamed: 0,name,type
0,Album,table
1,Artist,table
2,Customer,table
3,Employee,table
4,Genre,table
5,Invoice,table
6,InvoiceLine,table
7,MediaType,table
8,Playlist,table
9,PlaylistTrack,table


In [39]:
# Pull the whole Invoice table into a frame, then summarise its columns,
# dtypes and non-null counts.
q = """SELECT * FROM Invoice"""
invoice = run_query(q)
invoice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 412 entries, 0 to 411
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   InvoiceId          412 non-null    int64  
 1   CustomerId         412 non-null    int64  
 2   InvoiceDate        412 non-null    object 
 3   BillingAddress     412 non-null    object 
 4   BillingCity        412 non-null    object 
 5   BillingState       210 non-null    object 
 6   BillingCountry     412 non-null    object 
 7   BillingPostalCode  384 non-null    object 
 8   Total              412 non-null    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 29.1+ KB


In [40]:
# Pull the whole InvoiceLine table into a frame, then summarise its columns,
# dtypes and non-null counts.
q = """SELECT * FROM InvoiceLine"""
invoice_line = run_query(q)
invoice_line.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   InvoiceLineId  2240 non-null   int64  
 1   InvoiceId      2240 non-null   int64  
 2   TrackId        2240 non-null   int64  
 3   UnitPrice      2240 non-null   float64
 4   Quantity       2240 non-null   int64  
dtypes: float64(1), int64(4)
memory usage: 87.6 KB


In [41]:
# One row per invoice line, enriched with the track name, its genre and the
# invoice header fields (date, customer, billing country).
# The ORDER BY makes the row order deterministic: without it SQLite is free
# to return rows in any order, so df.tail() would not be reproducible.
q = """SELECT il.InvoiceId AS InvoiceNo, 
              il.TrackId AS TrackID,
              t.Name AS Tracks,
              g.GenreId AS GenreID,
              g.Name AS Genres,
              il.Quantity,
              il.UnitPrice,
              i.InvoiceDate,
              i.CustomerId AS CustomerID,
              i.BillingCountry AS Country
       FROM InvoiceLine il
       JOIN Invoice i
            ON il.InvoiceId = i.InvoiceId
       JOIN track  t
            ON il.TrackId = t.TrackId
       JOIN genre g
            ON t.GenreId = g.GenreId
       ORDER BY il.InvoiceLineId
    """
df = run_query(q)
df.tail()

Unnamed: 0,InvoiceNo,TrackID,Tracks,GenreID,Genres,Quantity,UnitPrice,InvoiceDate,CustomerID,Country
2235,411,3136,Looking For Love,3,Metal,1,0.99,2013-12-14 00:00:00,44,Finland
2236,411,3145,Sweet Lady Luck,3,Metal,1,0.99,2013-12-14 00:00:00,44,Finland
2237,411,3154,Feirinha da Pavuna/Luz do Repente/Bagaço da La...,7,Latin,1,0.99,2013-12-14 00:00:00,44,Finland
2238,411,3163,Samba pras moças,7,Latin,1,0.99,2013-12-14 00:00:00,44,Finland
2239,412,3177,Hot Girl,19,TV Shows,1,1.99,2013-12-22 00:00:00,58,India


In [42]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
InvoiceNo,2240.0,206.86875,119.134877,1.0,103.0,207.0,311.0,412.0
TrackID,2240.0,1717.734375,993.797999,1.0,874.0,1708.0,2560.0,3500.0
GenreID,2240.0,5.249554,5.690055,1.0,1.0,3.0,7.0,24.0
Quantity,2240.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
UnitPrice,2240.0,1.039554,0.217069,0.99,0.99,0.99,0.99,1.99
CustomerID,2240.0,29.974107,17.018715,1.0,15.0,30.0,45.0,59.0


In [43]:
# Count missing values per column.
df.isna().sum()

InvoiceNo      0
TrackID        0
Tracks         0
GenreID        0
Genres         0
Quantity       0
UnitPrice      0
InvoiceDate    0
CustomerID     0
Country        0
dtype: int64

***
# **Data preparation**
***

One-Hot encode data

In [44]:
#from mlxtend.preprocessing import TransactionEncoder
#te = TransactionEncoder()
#data = te.fit(basket).transform(basket)
#data = pd.DataFrame(data, columns = te.columns_)
#data

# Total quantity bought per (invoice, genre) pair.
qty_per_invoice_genre = df.groupby(['InvoiceNo', 'Genres'])['Quantity'].sum()

# Spread genres across the columns; a genre absent from an invoice becomes 0.
# unstack() already leaves InvoiceNo as the index, so no index round trip is needed.
basket = qty_per_invoice_genre.unstack().fillna(0)
basket

Genres,Alternative,Alternative & Punk,Blues,Bossa Nova,Classical,Comedy,Drama,Easy Listening,Electronica/Dance,Heavy Metal,...,Pop,R&B/Soul,Reggae,Rock,Rock And Roll,Sci Fi & Fantasy,Science Fiction,Soundtrack,TV Shows,World
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
410,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0
411,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
def encode_units(x):
    """Collapse a purchased quantity into a 0/1 presence flag.

    Parameters
    ----------
    x : number
        Quantity to encode.

    Returns
    -------
    int
        1 when ``x >= 1``, otherwise 0.
    """
    # A single comparison covers every input. Values strictly between 0 and 1,
    # and NaN, previously fell through both `if` branches and returned None.
    return 1 if x >= 1 else 0

# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
# Use the new name when this pandas version has it, and fall back otherwise.
# The check is made on the class so a column named "map" cannot shadow it.
_elementwise = basket.map if hasattr(pd.DataFrame, "map") else basket.applymap
basket_sets = _elementwise(encode_units)
basket_sets

Genres,Alternative,Alternative & Punk,Blues,Bossa Nova,Classical,Comedy,Drama,Easy Listening,Electronica/Dance,Heavy Metal,...,Pop,R&B/Soul,Reggae,Rock,Rock And Roll,Sci Fi & Fantasy,Science Fiction,Soundtrack,TV Shows,World
InvoiceNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
408,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
409,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
410,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
411,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0


***
# **Data modeling**
***

Mine frequent itemsets from the one-hot encoded baskets

In [46]:
# Import modeling libraries
from mlxtend.frequent_patterns import apriori, fpgrowth
from mlxtend.frequent_patterns import association_rules

In [47]:
# mlxtend expects a boolean one-hot frame and emits a deprecation warning for
# other dtypes. Casting the 0/1 flags to bool leaves the supports unchanged.
frequent_itemsets = apriori(basket_sets.astype(bool), min_support=0.005, use_colnames=True)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.009709,(Alternative)
1,0.225728,(Alternative & Punk)
2,0.065534,(Blues)
3,0.016990,(Bossa Nova)
4,0.036408,(Classical)
...,...,...
191,0.007282,"(Rock, Latin, Reggae, Soundtrack)"
192,0.007282,"(Pop, Rock, Reggae, Soundtrack)"
193,0.009709,"(Alternative & Punk, Latin, Easy Listening, Ro..."
194,0.007282,"(Alternative & Punk, Jazz, Metal, Rock And Rol..."


In [48]:
# Record how many genres each frequent itemset contains.
itemset_sizes = frequent_itemsets['itemsets'].map(len)
frequent_itemsets['length'] = itemset_sizes
frequent_itemsets

Unnamed: 0,support,itemsets,length
0,0.009709,(Alternative),1
1,0.225728,(Alternative & Punk),1
2,0.065534,(Blues),1
3,0.016990,(Bossa Nova),1
4,0.036408,(Classical),1
...,...,...,...
191,0.007282,"(Rock, Latin, Reggae, Soundtrack)",4
192,0.007282,"(Pop, Rock, Reggae, Soundtrack)",4
193,0.009709,"(Alternative & Punk, Latin, Easy Listening, Ro...",5
194,0.007282,"(Alternative & Punk, Jazz, Metal, Rock And Rol...",5


In [49]:
# Keep only itemsets that combine two or more genres and meet the support floor.
has_several_genres = frequent_itemsets['length'].ge(2)
meets_support = frequent_itemsets['support'].ge(0.005)
frequent_itemsets.loc[has_several_genres & meets_support]

Unnamed: 0,support,itemsets,length
24,0.007282,"(Classical, Alternative)",2
25,0.019417,"(Alternative & Punk, Blues)",2
26,0.009709,"(Alternative & Punk, Easy Listening)",2
27,0.007282,"(Alternative & Punk, Electronica/Dance)",2
28,0.007282,"(Alternative & Punk, Hip Hop/Rap)",2
...,...,...,...
191,0.007282,"(Rock, Latin, Reggae, Soundtrack)",4
192,0.007282,"(Pop, Rock, Reggae, Soundtrack)",4
193,0.009709,"(Alternative & Punk, Latin, Easy Listening, Ro...",5
194,0.007282,"(Alternative & Punk, Jazz, Metal, Rock And Rol...",5


***
# **Calculation**
***

Build association rules and compute their metrics

In [50]:
# Derive association rules from the frequent itemsets, keeping those whose
# lift is at least 1.
rules = association_rules(
    frequent_itemsets,
    min_threshold=1,
    metric='lift',
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Classical),(Alternative),0.036408,0.009709,0.007282,0.200000,20.600000,0.006928,1.237864
1,(Alternative),(Classical),0.009709,0.036408,0.007282,0.750000,20.600000,0.006928,3.854369
2,(Alternative & Punk),(Blues),0.225728,0.065534,0.019417,0.086022,1.312624,0.004625,1.022416
3,(Blues),(Alternative & Punk),0.065534,0.225728,0.019417,0.296296,1.312624,0.004625,1.100281
4,(Alternative & Punk),(Easy Listening),0.225728,0.009709,0.009709,0.043011,4.430108,0.007517,1.034799
...,...,...,...,...,...,...,...,...,...
973,(Latin),"(Rock, Reggae, Pop, Soundtrack)",0.283981,0.007282,0.007282,0.025641,3.521368,0.005214,1.018843
974,(Reggae),"(Latin, Rock, Pop, Soundtrack)",0.031553,0.009709,0.007282,0.230769,23.769231,0.006975,1.287379
975,(Pop),"(Latin, Rock, Reggae, Soundtrack)",0.031553,0.007282,0.007282,0.230769,31.692308,0.007052,1.290534
976,(Rock),"(Latin, Reggae, Pop, Soundtrack)",0.524272,0.007282,0.007282,0.013889,1.907407,0.003464,1.006700


In [51]:
# Show rules with confidence of at least 0.79, excluding those with confidence exactly 1.
# (An alternative filter tried earlier was rules['lift'] >= 12.)
rule_confidence = rules['confidence']
rules.loc[rule_confidence.ge(0.79) & rule_confidence.ne(1)]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
24,(Heavy Metal),(Blues),0.012136,0.065534,0.009709,0.8,12.207407,0.008913,4.67233
59,(Electronica/Dance),(Rock),0.01699,0.524272,0.014563,0.857143,1.634921,0.005656,3.330097
87,(World),(Latin),0.021845,0.283981,0.019417,0.888889,3.130104,0.013214,6.444175
103,(Reggae),(Rock),0.031553,0.524272,0.026699,0.846154,1.61396,0.010156,3.092233
111,(World),(Rock),0.021845,0.524272,0.019417,0.888889,1.695473,0.007965,4.281553
198,"(Alternative & Punk, Latin)",(Rock),0.048544,0.524272,0.038835,0.8,1.525926,0.013385,2.378641
261,"(Heavy Metal, Metal)",(Blues),0.012136,0.065534,0.009709,0.8,12.207407,0.008913,4.67233
263,(Heavy Metal),"(Blues, Metal)",0.012136,0.021845,0.009709,0.8,36.622222,0.009444,4.890777
266,"(Heavy Metal, Rock)",(Blues),0.012136,0.065534,0.009709,0.8,12.207407,0.008913,4.67233
269,(Heavy Metal),"(Blues, Rock)",0.012136,0.031553,0.009709,0.8,25.353846,0.009326,4.842233


In [53]:
# Identify the rule by its genres rather than by a hard-coded row position:
# a positional lookup silently picks another rule whenever the order of
# `rules` changes, while the printed genre labels stay the same.
antecedent_genres = ['Latin', 'Metal']
consequent_genres = ['Rock']

is_selected = (
    rules['antecedents'].apply(lambda items: items == frozenset(antecedent_genres))
    & rules['consequents'].apply(lambda items: items == frozenset(consequent_genres))
)
if not is_selected.any():
    raise LookupError(
        f"No rule {antecedent_genres} -> {consequent_genres} found in `rules`"
    )
# Read the rule once instead of re-indexing `rules` for every metric.
rule = rules.loc[is_selected].iloc[0]


def _sold_label(genre):
    """Format a genre name with the total of its `basket` column."""
    return f"{genre} ({basket[genre].sum():.0f} sold)"


# Continuation lines are aligned under the first genre of the list.
continuation = "\n\t\t\t  "

print("Let's check one of the combination:")
print('=' * 50)
print("\tAntecedents\t: " + continuation.join(_sold_label(g) for g in antecedent_genres))
print('-' * 50)
print("\tConsequents\t: " + continuation.join(_sold_label(g) for g in consequent_genres))
print('=' * 50)
print(f"\tMetrics value\t: Support\t{rule['support']:.3f}")
print(f"\t\t\t  Confidence\t{rule['confidence']:.3f}")
print(f"\t\t\t  Lift\t\t{rule['lift']:.3f}")
print(f"\t\t\t  Conviction\t{rule['conviction']:.3f}")
print('=' * 50)

Let's check one of the combination:
	Antecedents	: Latin (386 sold)
			  Metal (264 sold)
--------------------------------------------------
	Consequents	: Rock (835 sold)
	Metrics value	: Support	0.049
			  Confidence	0.800
			  Lift		1.526
			  Conviction	2.379
