In [None]:
# Mining Frequent Itemsets
# By Gabriel P. Oliveira and Iago A. D. Vaz

In [186]:
import pandas as pd
import fim
import ast

# Temporal

In [187]:
def temporal(market_, year_, drop_duplicate = False, target = 'c', minsup = 9, zmin = 2, report = 's'):
    """Mine frequent genre itemsets from one market's chart rows of a given year.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` (tab
    separated), keeps the rows whose ``end_date`` text matches ``year_``,
    parses each ``genre_list`` cell with ``ast.literal_eval`` and hands the
    resulting transactions to ``fim.apriori``.

    Parameters
    ----------
    market_ : str
        Market code interpolated into the CSV file name.
    year_ : int or str
        Year searched for inside the ``end_date`` strings.
    drop_duplicate : bool, default False
        When true, keep only the first selected row of every ``song_id``.
    target, minsup, zmin, report
        Forwarded to ``fim.apriori`` as ``target``, ``supp``, ``zmin`` and
        ``report`` respectively (see the PyFIM documentation for their meaning).

    Returns
    -------
    The value returned by ``fim.apriori`` for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')

    # na=False: a row without an end_date simply does not belong to the year,
    # instead of putting NaN into the boolean mask and making .loc raise.
    in_year = charts['end_date'].str.contains(str(year_), na=False)
    charts = charts.loc[in_year]

    if drop_duplicate:
        # Rebind instead of mutating in place: `charts` is a slice of the frame
        # that was read, and in-place edits on a slice are a chained-assignment hazard.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells are stored as Python-literal text, one transaction per row.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, supp=minsup, zmin=zmin, report=report)

In [198]:
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
years = [2017, 2018, 2019]

# Collect one small frame per (market, year) and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every pass.
temporal_frames = []

for market in markets:
    for year in years:
        freq_pat = temporal(market_=market, year_=year, drop_duplicate=False, target='c', minsup=6, zmin=2, report='s')
        # Keep the five highest-support patterns and tag them with their origin.
        # .assign returns a new frame, so no column is written onto a .head() slice.
        df = (
            pd.DataFrame(freq_pat, columns=['pattern', 'support'])
            .sort_values(by="support", ascending=False)
            .head(5)
            .assign(year=year, market=market)
        )
        temporal_frames.append(df)

# pd.concat refuses an empty list, so fall back to an empty, correctly-labelled frame.
if temporal_frames:
    df_temporal = pd.concat(temporal_frames)
else:
    df_temporal = pd.DataFrame(columns=['pattern', 'support', 'year', 'market'])

In [None]:
# Save the temporal results. The original wrote `df_colab` here, which is only
# created in a later cell and holds the collaboration results, not the temporal ones.
df_temporal.to_csv('dataset/freq_set_temp.csv', index=False)

In [88]:
# Rows for the 'ca', 'us' and 'global' markets.
df_temporal[df_temporal['market'].isin(['ca', 'us', 'global'])]

# Rows for the 'gb' and 'au' markets.
df_temporal[df_temporal['market'].isin(['gb', 'au'])]

# Rows for the 'br' market.
df_temporal[df_temporal['market'].isin(['br'])]

# Rows for the 'jp' market.
df_temporal[df_temporal['market'].isin(['jp'])]

# Rows for the 'de' and 'fr' markets.
df_temporal[df_temporal['market'].isin(['de', 'fr'])]

Unnamed: 0,pattern,support,year,market
86,"(dance pop, pop)",0.3927,2017,global
71,"(rap, hip hop)",0.2495,2017,global
76,"(pop rap, hip hop)",0.2166,2017,global
65,"(rap, pop rap)",0.2133,2017,global
66,"(rap, pop rap, hip hop)",0.1942,2017,global
52,"(dance pop, pop)",0.294904,2018,global
53,"(rap, hip hop)",0.285769,2018,global
45,"(pop rap, rap)",0.22625,2018,global
40,"(trap, hip hop)",0.202981,2018,global
49,"(pop rap, hip hop)",0.198846,2018,global


# Stratified (by chart-position tier)

In [154]:
def estratificada(market_, top_, drop_duplicate = False, target = 'c', minsup = 9, zmin = 2, report = 's'):
    """Mine frequent genre itemsets for the songs of one chart-position tier.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` and
    ``dataset/charts/{market_}-genre_transactions_top_charts.csv`` (both tab
    separated), joins them on ``song_id`` and keeps the rows that belong to
    the requested tier: the tier's own ``topN_weeks`` counter is positive and
    the counters of all better tiers (smaller N) are zero.

    Parameters
    ----------
    market_ : str
        Market code interpolated into both CSV file names.
    top_ : int
        Tier to select: 10, 30, 50 or 100. Any other value selects the
        200 tier (this fallback is kept from the original if/elif chain).
    drop_duplicate : bool, default False
        When true, keep only the first selected row of every ``song_id``.
    target, minsup, zmin, report
        Forwarded to ``fim.apriori`` as ``target``, ``supp``, ``zmin`` and
        ``report`` respectively (see the PyFIM documentation for their meaning).

    Returns
    -------
    The value returned by ``fim.apriori`` for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')
    top_charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_top_charts.csv', sep='\t', encoding='utf-8')

    # genre_list is taken from `charts` further down, so the copies carried by
    # the top-charts file are dropped before joining.
    tier_weeks = top_charts.drop(columns=['num_artists', 'genre_list'])
    charts = charts.merge(tier_weeks, on='song_id')

    # Counters ordered from the best tier to the widest one.
    tier_columns = ['top10_weeks', 'top30_weeks', 'top50_weeks', 'top100_weeks', 'top200_weeks']
    tier_index = {10: 0, 30: 1, 50: 2, 100: 3}.get(top_, len(tier_columns) - 1)

    # In the tier itself, and never in any better tier.
    in_tier = charts[tier_columns[tier_index]] > 0
    for better_tier in tier_columns[:tier_index]:
        in_tier &= charts[better_tier] == 0
    charts = charts.loc[in_tier]

    if drop_duplicate:
        # Rebind instead of mutating in place: `charts` is a filtered slice,
        # and in-place edits on a slice are a chained-assignment hazard.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells are stored as Python-literal text, one transaction per row.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, supp=minsup, zmin=zmin, report=report)

In [214]:
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
tops = [10, 30, 50, 100, 200]

# Collect one small frame per (market, tier) and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies it on every pass.
estrat_frames = []

for market in markets:
    for top in tops:
        freq_pat = estratificada(market_=market, top_=top, drop_duplicate=True, target='c', minsup=6, zmin=2, report='s')
        # Keep the five highest-support patterns and tag them with their origin.
        # .assign returns a new frame, so no column is written onto a .head() slice.
        df = (
            pd.DataFrame(freq_pat, columns=['pattern', 'support'])
            .sort_values(by="support", ascending=False)
            .head(5)
            .assign(top=top, market=market)
        )
        estrat_frames.append(df)

# pd.concat refuses an empty list, so fall back to an empty, correctly-labelled frame.
if estrat_frames:
    df_estrat = pd.concat(estrat_frames)
else:
    df_estrat = pd.DataFrame(columns=['pattern', 'support', 'top', 'market'])

In [217]:
# Write the stratified results to disk, leaving the row index out of the file.
df_estrat.to_csv(path_or_buf='dataset/freq_set_estratif.csv', index=False)

In [215]:
# Stratified results for the 'br' market only.
df_estrat[df_estrat['market'] == 'br']

Unnamed: 0,pattern,support,top,market
9,"(brazilian funk, pop)",0.322404,10,br
8,"(pagode baiano, brazilian funk, pop)",0.153005,10,br
5,"(sertanejo, brazilian funk)",0.142077,10,br
1,"(dance pop, pop)",0.131148,10,br
4,"(electro, pop)",0.120219,10,br
14,"(brazilian funk, pop)",0.223684,30,br
13,"(dance pop, pop)",0.179825,30,br
7,"(electro, pop)",0.105263,30,br
12,"(pop rap, pop)",0.096491,30,br
2,"(rap, hip hop)",0.083333,30,br


# Collaboration (by number of artists)

In [218]:
def colab(market_, num_artists_, drop_duplicate = False, target = 'c', minsup = 9, zmin = 2, report = 's'):
    """Mine frequent genre itemsets for songs with a given number of artists.

    Reads ``dataset/charts/{market_}-genre_transactions_charts.csv`` (tab
    separated) and filters it on the ``num_artists`` column: exactly one
    artist when ``num_artists_`` is 1, otherwise at least ``num_artists_``.

    Parameters
    ----------
    market_ : str
        Market code interpolated into the CSV file name.
    num_artists_ : int
        1 selects rows with exactly one artist; any other value is used as
        a lower bound (``num_artists >= num_artists_``).
    drop_duplicate : bool, default False
        When true, keep only the first selected row of every ``song_id``.
    target, minsup, zmin, report
        Forwarded to ``fim.apriori`` as ``target``, ``supp``, ``zmin`` and
        ``report`` respectively (see the PyFIM documentation for their meaning).

    Returns
    -------
    The value returned by ``fim.apriori`` for the selected transactions.
    """
    charts = pd.read_csv(f'dataset/charts/{market_}-genre_transactions_charts.csv', sep='\t', encoding='utf-8')

    if num_artists_ == 1:
        wanted = charts['num_artists'] == num_artists_
    else:
        wanted = charts['num_artists'] >= num_artists_
    charts = charts.loc[wanted]

    if drop_duplicate:
        # Rebind instead of mutating in place: `charts` is a filtered slice,
        # and in-place edits on a slice are a chained-assignment hazard.
        charts = charts.drop_duplicates(subset='song_id', keep='first')

    # genre_list cells are stored as Python-literal text, one transaction per row.
    transactions = [ast.literal_eval(genres) for genres in charts['genre_list']]

    return fim.apriori(transactions, target=target, supp=minsup, zmin=zmin, report=report)

In [219]:
markets = ['global', 'au', 'br', 'ca', 'de', 'fr', 'gb', 'jp', 'us']
num_artists = [1, 2, 5]

# Collect one small frame per (market, collaboration size) and concatenate once at
# the end; growing a DataFrame with pd.concat inside the loop copies it on every pass.
colab_frames = []

for market in markets:
    for colab_size in num_artists:
        freq_pat = colab(market_=market, num_artists_=colab_size, drop_duplicate=True, target='c', minsup=3, zmin=2, report='s')
        # Keep the five highest-support patterns and tag them with their origin.
        # .assign returns a new frame, so no column is written onto a .head() slice.
        df = (
            pd.DataFrame(freq_pat, columns=['pattern', 'support'])
            .sort_values(by="support", ascending=False)
            .head(5)
            .assign(colab_size=colab_size, market=market)
        )
        colab_frames.append(df)

# pd.concat refuses an empty list, so fall back to an empty, correctly-labelled frame.
if colab_frames:
    df_colab = pd.concat(colab_frames)
else:
    df_colab = pd.DataFrame(columns=['pattern', 'support', 'colab_size', 'market'])

In [222]:
# Write the collaboration results to disk, leaving the row index out of the file.
df_colab.to_csv(path_or_buf='dataset/freq_set_colab.csv', index=False)

In [211]:
# Collaboration results for the 'br' market only.
df_colab[df_colab['market'] == 'br']

Unnamed: 0,pattern,support,colab_size,market
8,"(dance pop, pop)",0.170543,1,br
6,"(brazilian funk, pop)",0.091301,1,br
5,"(brazilian funk, sertanejo)",0.047373,1,br
3,"(rap, hip hop)",0.042205,1,br
7,"(sertanejo, pop)",0.040482,1,br
158,"(dance pop, pop)",0.255702,2,br
154,"(brazilian funk, pop)",0.2509,2,br
157,"(electro, pop)",0.230492,2,br
151,"(hip hop, pop)",0.160864,2,br
146,"(pop rap, pop)",0.154862,2,br
