In [None]:
# Mining Association Rules
# By Gabriel P. Oliveira and Iago A. D. Vaz

In [1]:
import pandas as pd
import fim
import ast

# Temporal

In [20]:
def temporal(market_, year_, drop_duplicate = False, target = 'c', minconf = 9, zmin = 2, report = 's'):
    """Mine genre patterns from one market's chart entries for a single year.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` (tab
    separated), keeps the rows whose ``end_date`` text contains ``year_``,
    parses every ``genre_list`` cell into a Python object and hands the
    resulting transactions to ``fim.apriori``.

    Parameters
    ----------
    market_ : str
        Market code used to build the CSV file name (e.g. ``'br'``).
    year_ : int or str
        Year to select; ``str(year_)`` is searched inside ``end_date``.
    drop_duplicate : bool, default False
        When true, only the first row of every ``song_id`` is kept.
    target, minconf, zmin, report
        Forwarded unchanged to ``fim.apriori`` as ``target``, ``conf``,
        ``zmin`` and ``report``; see the PyFIM documentation for their meaning.

    Returns
    -------
    Whatever ``fim.apriori`` returns for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')

    # na=False: a row without end_date simply does not match. Without it the
    # mask would contain NaN and .loc would refuse to index with it.
    in_year = charts['end_date'].str.contains(str(year_), na=False)
    charts = charts.loc[in_year]

    if drop_duplicate:
        # Rebind instead of inplace=True: `charts` is a slice of the frame that
        # was read, and mutating a slice in place can raise SettingWithCopyWarning.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells store Python literals as text; literal_eval parses them
    # without evaluating arbitrary code.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, conf=minconf, zmin=zmin, report=report)

In [30]:
# Mine association rules per (market, year) and keep the 50 rules with the
# highest lift of every combination.
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
years = [2017, 2018, 2019]

# Settings forwarded to fim.apriori through temporal().
target = 'r'
minconf = 20   # presumably a percentage: the reported confidence column is a fraction >= 0.2
zmin = 2
report = 'cl'  # rule tuples are unpacked as (consequent, antecedent, confidence, lift)
drop_duplicate = True
ascending = False  # sort by lift, strongest rules first

# Collect one frame per (market, year) and concatenate once at the end;
# calling pd.concat inside the loop re-copies all previous rows every iteration.
rule_frames = []
for market in markets:
    for year in years:
        freq_pat = temporal(market, year, drop_duplicate, target, minconf, zmin, report)
        df = (
            pd.DataFrame(freq_pat, columns=['consequent', 'antecedent', 'confidence', 'lift'])
            .sort_values(by="lift", ascending=ascending)
            .head(50)
            # assign() returns a new frame, so nothing is written into the head() slice
            .assign(year=year, market=market)
        )
        if not df.empty:
            rule_frames.append(df)

output_columns = ['market', 'year', 'antecedent', 'consequent', 'confidence', 'lift']
if rule_frames:
    df_temporal = pd.concat(rule_frames)[output_columns]
else:
    # No rule was found anywhere: still expose an (empty) table with the expected columns.
    df_temporal = pd.DataFrame(columns=output_columns)

In [15]:
# Save the temporal rules; the row index is left out of the file.
df_temporal.to_csv(path_or_buf='dataset/assoc_rules_temporal.csv', index=False)

In [34]:
# Rules kept for the Brazilian market ('br') in 2017.
df_temporal[(df_temporal['market'] == 'br') & (df_temporal['year'] == 2017)]

Unnamed: 0,market,year,antecedent,consequent,confidence,lift
4,br,2017,"(tropical house,)",house,0.367089,7.666908
0,br,2017,"(brazilian funk, pop)",pagode baiano,0.270833,7.614583
3,br,2017,"(tropical house, pop)",house,0.36,7.518857
1,br,2017,"(tropical house, pop)",dance,0.24,6.747692
16,br,2017,"(tropical house,)",electro house,0.493671,6.682841
15,br,2017,"(tropical house, pop)",electro house,0.493333,6.678272
2,br,2017,"(tropical house,)",dance,0.227848,6.406037
5,br,2017,"(electro, pop)",house,0.278261,5.811677
25,br,2017,"(hip hop, pop)",trap,0.472973,5.762387
19,br,2017,"(electro, pop)",electro house,0.408696,5.532528


In [35]:
# Rules kept for the Brazilian market ('br') in 2018.
df_temporal[(df_temporal['market'] == 'br') & (df_temporal['year'] == 2018)]

Unnamed: 0,market,year,antecedent,consequent,confidence,lift
1,br,2018,"(electro, pop)",house,0.267327,7.266427
7,br,2018,"(electro, pop)",tropical house,0.445545,6.773788
3,br,2018,"(electro, pop)",electro house,0.346535,6.757426
2,br,2018,"(electro,)",house,0.228571,6.212987
6,br,2018,"(hip hop,)",trap,0.496296,6.183025
5,br,2018,"(rap,)",trap,0.45045,5.611862
8,br,2018,"(electro,)",tropical house,0.364286,5.538378
0,br,2018,"(brazilian funk, pop)",pagode baiano,0.275,5.481667
4,br,2018,"(electro,)",electro house,0.278571,5.432143
12,br,2018,"(hip hop,)",pop rap,0.444444,4.803213


In [36]:
# Rules kept for the Brazilian market ('br') in 2019.
df_temporal[(df_temporal['market'] == 'br') & (df_temporal['year'] == 2019)]

Unnamed: 0,market,year,antecedent,consequent,confidence,lift
5,br,2019,"(hip hop,)",trap,0.434343,6.187489
6,br,2019,"(brazilian funk, pop)",pagode baiano,0.424658,5.473364
3,br,2019,"(hip hop,)",pop rap,0.30303,5.235332
0,br,2019,"(hip hop,)",r&b,0.262626,4.442761
4,br,2019,"(hip hop,)",rap,0.30303,4.03378
1,br,2019,"(dance pop, pop)",pop rap,0.203883,3.522413
2,br,2019,"(dance pop,)",pop rap,0.203883,3.522413
7,br,2019,"(brazilian funk,)",pagode baiano,0.225,2.9
9,br,2019,"(pop,)",dance pop,0.287709,2.268156
14,br,2019,"(brazilian funk, pop)",electro,0.246575,1.982368


# Stratified (by best chart tier reached)

In [8]:
def estratificada(market_, top_, drop_duplicate = False, target = 'c', minconf = 9, zmin = 2, report = 's'):
    """Mine genre patterns from the songs of one market that fall in a chart tier.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` and
    ``dataset/charts/{market_}-genre_transactions_top_charts.csv`` (both tab
    separated), joins them on ``song_id`` and keeps the rows belonging to the
    requested tier. A row belongs to tier N when ``top{N}_weeks > 0`` and the
    ``*_weeks`` columns of every better tier are 0. Tiers, best first, are
    10, 30, 50, 100 and 200.

    Parameters
    ----------
    market_ : str
        Market code used to build both CSV file names (e.g. ``'br'``).
    top_ : int
        Tier to select: 10, 30, 50 or 100. Any other value selects the 200 tier.
    drop_duplicate : bool, default False
        When true, only the first row of every ``song_id`` is kept.
    target, minconf, zmin, report
        Forwarded unchanged to ``fim.apriori`` as ``target``, ``conf``,
        ``zmin`` and ``report``; see the PyFIM documentation for their meaning.

    Returns
    -------
    Whatever ``fim.apriori`` returns for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')
    top_charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_top_charts.csv', sep='\t', encoding='utf-8')

    # Only the *_weeks counters are wanted from top_charts; the two dropped
    # columns are not carried over from that file.
    charts = charts.merge(top_charts.drop(['num_artists', 'genre_list'], axis=1), on='song_id')

    tiers = (10, 30, 50, 100, 200)
    try:
        position = tiers.index(top_)
    except ValueError:
        # Unknown values fall back to the last tier.
        position = len(tiers) - 1

    # In this tier at least once, and never in any better tier.
    in_tier = charts[f'top{tiers[position]}_weeks'] > 0
    for better in tiers[:position]:
        in_tier &= charts[f'top{better}_weeks'] == 0
    charts = charts.loc[in_tier]

    if drop_duplicate:
        # Rebind instead of inplace=True: `charts` is a slice here, and mutating
        # a slice in place can raise SettingWithCopyWarning.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells store Python literals as text; literal_eval parses them
    # without evaluating arbitrary code.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, conf=minconf, zmin=zmin, report=report)

In [9]:
# Mine association rules per (market, chart tier) and keep the 50 rules with
# the highest lift of every combination.
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
tops = [10, 30, 50, 100, 200]

# Settings forwarded to fim.apriori through estratificada().
target = 'r'
minconf = 80   # presumably a percentage: the reported confidence column is a fraction >= 0.8
zmin = 2
report = 'cl'  # rule tuples are unpacked as (consequent, antecedent, confidence, lift)
drop_duplicate = True
# Defined here so this cell does not rely on a variable set by another cell.
ascending = False  # sort by lift, strongest rules first

# Collect one frame per (market, tier) and concatenate once at the end;
# calling pd.concat inside the loop re-copies all previous rows every iteration.
rule_frames = []
for market in markets:
    for top in tops:
        freq_pat = estratificada(market, top, drop_duplicate, target, minconf, zmin, report)
        df = (
            pd.DataFrame(freq_pat, columns=['consequent', 'antecedent', 'confidence', 'lift'])
            .sort_values(by="lift", ascending=ascending)
            .head(50)
            # assign() returns a new frame, so nothing is written into the head() slice
            .assign(top=top, market=market)
        )
        if not df.empty:
            rule_frames.append(df)

output_columns = ['market', 'top', 'antecedent', 'consequent', 'confidence', 'lift']
if rule_frames:
    df_estrat = pd.concat(rule_frames)[output_columns]
else:
    # No rule was found anywhere: still expose an (empty) table with the expected columns.
    df_estrat = pd.DataFrame(columns=output_columns)

In [23]:
# Save the per-tier rules; the row index is left out of the file.
df_estrat.to_csv(path_or_buf='dataset/assoc_rules_estrat.csv', index=False)

In [10]:
# Rules kept for the Brazilian market ('br'), all chart tiers.
df_estrat[df_estrat['market'] == 'br']

Unnamed: 0,market,top,antecedent,consequent,confidence,lift
0,br,10,"(dance pop,)",pop,1.0,1.926316
1,br,10,"(pagode baiano, brazilian funk)",pop,1.0,1.926316
2,br,10,"(pagode baiano, pop)",brazilian funk,1.0,1.926316
3,br,10,"(pagode baiano,)",brazilian funk,1.0,1.926316
4,br,10,"(pagode baiano,)",pop,1.0,1.926316
0,br,30,"(rap,)",hip hop,0.826087,6.278261
3,br,30,"(dance pop,)",pop,1.0,1.740458
2,br,30,"(pop rap,)",pop,0.846154,1.472695
1,br,30,"(electro,)",pop,0.8,1.392366
0,br,50,"(dance pop,)",pop,1.0,2.056075


# Collaboration (by number of artists)

In [25]:
def colab(market_, num_artists_, drop_duplicate = False, target = 'c', minconf = 9, zmin = 2, report = 's'):
    """Mine genre patterns from one market's songs with enough credited artists.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` (tab
    separated), keeps the rows whose ``num_artists`` is at least
    ``num_artists_``, parses every ``genre_list`` cell into a Python object
    and hands the resulting transactions to ``fim.apriori``.

    Parameters
    ----------
    market_ : str
        Market code used to build the CSV file name (e.g. ``'br'``).
    num_artists_ : int
        Minimum (inclusive) value of the ``num_artists`` column.
    drop_duplicate : bool, default False
        When true, only the first row of every ``song_id`` is kept.
    target, minconf, zmin, report
        Forwarded unchanged to ``fim.apriori`` as ``target``, ``conf``,
        ``zmin`` and ``report``; see the PyFIM documentation for their meaning.

    Returns
    -------
    Whatever ``fim.apriori`` returns for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')

    charts = charts.loc[charts['num_artists'] >= num_artists_]

    if drop_duplicate:
        # Rebind instead of inplace=True: `charts` is a slice of the frame that
        # was read, and mutating a slice in place can raise SettingWithCopyWarning.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells store Python literals as text; literal_eval parses them
    # without evaluating arbitrary code.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, conf=minconf, zmin=zmin, report=report)

In [26]:
# Mine association rules per (market, minimum number of artists). For every
# combination keep the 10 highest-lift rules whose confidence is below 0.5.
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
num_artists = [1, 2, 5]

# Settings forwarded to fim.apriori through colab().
target = 'r'
minconf = 20   # presumably a percentage: the reported confidence column is a fraction >= 0.2
zmin = 2
report = 'cl'  # rule tuples are unpacked as (consequent, antecedent, confidence, lift)
drop_duplicate = True
# Defined here so this cell does not rely on a variable set by another cell.
ascending = False  # sort by lift, strongest rules first

# Collect one frame per (market, colab_size) and concatenate once at the end;
# calling pd.concat inside the loop re-copies all previous rows every iteration.
rule_frames = []
for market in markets:
    for colab_size in num_artists:
        freq_pat = colab(market, colab_size, drop_duplicate, target, minconf, zmin, report)
        rules = pd.DataFrame(
            freq_pat, columns=['consequent', 'antecedent', 'confidence', 'lift']
        ).sort_values(by="lift", ascending=ascending)
        df = (
            rules.loc[rules['confidence'] < 0.5]
            .head(10)
            # assign() returns a new frame, so nothing is written into the filtered slice
            .assign(colab_size=colab_size, market=market)
        )
        if not df.empty:
            rule_frames.append(df)

output_columns = ['market', 'colab_size', 'antecedent', 'consequent', 'confidence', 'lift']
if rule_frames:
    df_colab = pd.concat(rule_frames)[output_columns]
else:
    # No rule was found anywhere: still expose an (empty) table with the expected columns.
    df_colab = pd.DataFrame(columns=output_columns)

In [24]:
# Save the collaboration rules; the row index is left out of the file.
df_colab.to_csv(path_or_buf='dataset/assoc_rules_colab.csv', index=False)

In [27]:
# Rules kept for the Brazilian market ('br'), all collaboration sizes.
df_colab[df_colab['market'] == 'br']

Unnamed: 0,market,colab_size,antecedent,consequent,confidence,lift
0,br,1,"(electro, pop)",house,0.246637,7.566057
3,br,1,"(electro, pop)",electro house,0.372197,7.421614
2,br,1,"(brazilian funk, pop)",pagode baiano,0.336508,6.212934
1,br,1,"(electro,)",house,0.201954,6.19534
6,br,1,"(hip hop,)",trap,0.455172,6.010687
5,br,1,"(rap,)",trap,0.451613,5.963683
4,br,1,"(electro,)",electro house,0.289902,5.780651
8,br,1,"(electro,)",tropical house,0.397394,5.502805
20,br,1,"(hip hop,)",pop rap,0.455172,4.751905
10,br,1,"(dance pop,)",tropical house,0.289157,4.004016
