  # Frequent Pattern Mining

### Imports

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

In [None]:
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [None]:
# Restore the variable `predict_dataset` from IPython's %store database into this session.
%store -r predict_dataset

In [None]:
# Plain alias, not a copy: both names refer to the same object, so later
# writes through predict_df are also visible through predict_dataset.
predict_df = predict_dataset

In [None]:
def generate_appwords():
    """Return the custom stop-word list used to filter review tokens.

    The list is built from two groups that are concatenated in order:
    app-related names first, then additional tokens to discard.
    """
    app_tokens = [
        "app", "alexa", "facebook", "googlehome", "instagram", "linkedin",
        "tiktok", "tik", "tok", "uber", "youtube", "fb",
    ]
    extra_tokens = [
        "dont", "yall", "kinda", "lot", "anymore", "doesnt", "tube", "blm", "thing",
    ]
    return app_tokens + extra_tokens

In [None]:
def remove_stopwords(text, stpwrds):
    """Return the items of ``text`` that do not occur in ``stpwrds``.

    ``text`` is iterated as given (it is not split here), so it is expected
    to be an already tokenised sequence; the original order is preserved.
    """
    kept = []
    for token in text:
        if token in stpwrds:
            continue
        kept.append(token)
    return kept

In [None]:
# Strip the custom stop words from every review's token list.
stpwrds = generate_appwords()
token_lists = predict_df['clean_content']
predict_df['clean_content'] = token_lists.apply(lambda tokens: remove_stopwords(tokens, stpwrds))

In [None]:
# Display the 'clean_content' column to inspect the result of the stop-word filtering.
predict_df['clean_content']

In [None]:
# Split the reviews into one sub-dataframe per app, keyed by the app name.
app_subdfs = {
    app_name: predict_df[predict_df['app_name'] == app_name]
    for app_name in predict_df['app_name'].unique()
}

### One-hot encoding transaction data
Here, `corpus_list` is transformed into a one-hot encoded data frame in which each column holds True/False values indicating whether a word was included in a review.

In [None]:
app_corpus_list = {}


def create_corpus(df, concern: bool = False):
    """Build the list of transactions (token lists) for every app.

    Parameters
    ----------
    df : mapping of app name -> DataFrame
        Each DataFrame must have a 'clean_content' column holding one
        iterable of tokens per review and, when ``concern`` is true, a
        'predicted' column holding one label per review.
    concern : bool, default False
        When true, each review's 'predicted' value is added as the last
        item of its transaction.

    Returns
    -------
    dict
        App name -> list of transactions, one transaction per review.

    Notes
    -----
    The mapping passed as ``df`` is the only input that is read, and a new
    dictionary is returned on every call. With ``concern=True`` the
    transactions are fresh lists, so the token lists stored in the
    dataframes are never modified and calling the function repeatedly does
    not append the label more than once.
    """
    corpus_by_app = {}
    for app, sub_df in df.items():
        transactions = sub_df["clean_content"].tolist()
        if concern:
            labels = sub_df["predicted"].tolist()
            # Build new lists instead of appending to the dataframe's cells.
            transactions = [
                [*tokens, label] for tokens, label in zip(transactions, labels)
            ]
        corpus_by_app[app] = transactions
    return corpus_by_app

# Build the per-app transactions; concern=True adds each review's predicted label as an item.
app_corpus_list = create_corpus(df=app_subdfs, concern=True)

In [None]:
# One-hot encode each app's transactions into a dataframe with one column per distinct item.
app_corpus_subdfs = {}
te = TransactionEncoder()
for app in app_subdfs:
    transactions = app_corpus_list[app]
    print(app)
    # Re-fit for every app so the columns only contain that app's items.
    te.fit(transactions)
    encoded = te.transform(transactions)
    corpus_df = pd.DataFrame(encoded, columns=te.columns_)
    print(corpus_df.shape)
    app_corpus_subdfs[app] = corpus_df

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Number of distinct items (one-hot columns) per app, largest first.
unique_words_app = dict(
    sorted(
        ((app, len(sub_df.columns)) for app, sub_df in app_corpus_subdfs.items()),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Number of reviews (one-hot rows) per app, largest first.
no_review_app = dict(
    sorted(
        ((app, len(sub_df.index)) for app, sub_df in app_corpus_subdfs.items()),
        key=lambda item: item[1],
        reverse=True,
    )
)

# Plot order: ascending number of unique words.
unique_words_app_list = sorted(unique_words_app.items(), key=lambda item: item[1])
apps_unique_words = [app for app, _ in unique_words_app_list]
counts_unique_words = [count for _, count in unique_words_app_list]

# Fix: both bar series share the same y positions and the same tick labels,
# so the review counts must follow the SAME app order as the word counts.
# Sorting them independently paired review bars with the wrong app label
# whenever the two rankings differed.
apps_reviews = list(apps_unique_words)
counts_reviews = [no_review_app[app] for app in apps_reviews]
no_review_app_list = list(zip(apps_reviews, counts_reviews))

# Height of each horizontal bar
bar_width = 0.45

# One y position per app
ind = np.arange(len(apps_unique_words))

# Create the figure and axes
fig, ax = plt.subplots(figsize=(8, 6))

# Two bars per app: unique words, and reviews stacked right above it
bars_unique_words = ax.barh(ind, counts_unique_words, bar_width, color='#1f77b4', label='Unique Words')
bars_reviews = ax.barh(ind + bar_width, counts_reviews, bar_width, color='#0f3652', label='Reviews')

# Write the count next to the end of every bar (unique words first, then reviews)
for bars, counts in ((bars_unique_words, counts_unique_words), (bars_reviews, counts_reviews)):
    for count, bar in zip(counts, bars):
        ax.text(bar.get_width() + 50, bar.get_y() + bar.get_height() / 2, str(count),
                ha='right', va='center', fontsize=12)

# Centre each app label between its two bars
ax.set_yticks(ind + bar_width / 2)
ax.set_yticklabels(apps_unique_words)

plt.xlabel('Count', fontsize=13)
plt.ylabel('Apps', fontsize=13)

# Tick labels use the same font size as the axis labels
ax.tick_params(axis='x', labelsize=13)
ax.tick_params(axis='y', labelsize=13)

# Fixed x range for the chart
plt.xlim(0, 899)

ax.legend()

# Display the chart
plt.show()


### Remove some apps

In [None]:
# Drop these apps from the encoded data; a missing key raises KeyError.
for excluded_app in ('googlehome', 'zoom', 'linkedin', 'instagram', 'alexa', 'vinted'):
    del app_corpus_subdfs[excluded_app]

In [None]:
# Display the app names still present in app_corpus_subdfs.
app_corpus_subdfs.keys()

In [None]:
# NOTE(review): same expression as the preceding cell; it displays the remaining app names again.
app_corpus_subdfs.keys()

### Frequency of the words & Support metric

In [None]:
te = TransactionEncoder()
app_word_counts_subdfs = {}
app_support_counts_subdfs = {}
for app, encoded_df in app_corpus_subdfs.items():
    # Column sums of the one-hot frame = number of reviews containing each word
    occurrences = encoded_df.sum()
    n_transactions = len(encoded_df.index)

    # One row per word: raw count and support (share of reviews containing it)
    word_stats = pd.DataFrame({'Word': occurrences.index, 'Count': occurrences.values})
    word_stats['Support'] = word_stats['Count'] / n_transactions
    word_stats = word_stats.sort_values('Support', ascending=False)
    app_word_counts_subdfs[app] = word_stats

    # How many distinct words share each support value (rounded to 3 decimals)
    rounded_support = word_stats['Support'].round(3)
    words_per_support = word_stats.groupby(rounded_support)['Word'].nunique()
    app_support_counts_subdfs[app] = words_per_support.reset_index()


# Print the DataFrame with word counts and their respective supports
#word_counts_df
#app_support_counts_subdfs['youtube']


In [None]:
# Plot the number of unique words for each support value, one line per app
import matplotlib.ticker as mticker

plt.figure(figsize=(8, 6))
# Fix: bind `ax` to the axes of THIS figure. The tick-size calls below used
# to go through an `ax` left over from an earlier figure, so they had no
# effect on this plot.
ax = plt.gca()

color_palette = sns.color_palette("tab20", len(app_support_counts_subdfs.keys()))
for app, color in zip(app_support_counts_subdfs.keys(), color_palette):
    app_df = app_support_counts_subdfs[app]
    plt.plot(app_df['Support'], app_df['Word'], color=color, label=app)

plt.xlabel('Support', fontsize=12)
plt.ylabel('Number of unique words', fontsize=12)

# Larger tick labels on both axes of the current plot
ax.tick_params(axis='x', labelsize=14)
ax.tick_params(axis='y', labelsize=14)

# Only the low-support range is shown
plt.xlim(0, 0.15)
plt.legend(title='Apps')
plt.show()


### Finding frequent itemsets with Apriori

In [None]:
frequent_itemsets_subdfs = {}
start_time = time.time()
# Minimum support per app, paired by position with the iteration order of
# app_corpus_subdfs; zip stops at the shorter of the two.
support_app = [0.02, 0.04, 0.04, 0.04]
for app, support in zip(app_corpus_subdfs.keys(), support_app):
    encoded_df = app_corpus_subdfs[app]
    print(app, support, len(encoded_df))
    # max_len=2 limits the search to single items and pairs.
    # NOTE: the original author recorded a runtime error at a threshold of 0.001.
    frequent_itemsets = apriori(
        encoded_df,
        min_support=support,
        use_colnames=True,
        max_len=2,
        low_memory=True,
    )

    # Size of every itemset, used later to count itemsets per length
    frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)
    print("the number of frequent itemsets generated:", len(frequent_itemsets))
    frequent_itemsets_subdfs[app] = frequent_itemsets
elapsed = time.time() - start_time
print("---Runtime: %s seconds ---" % elapsed)

In [None]:
# Number of frequent itemsets of each length, per app.
frequent_itemsets_counts_subdfs = {
    app: itemsets.groupby(['length']).size().reset_index(name='no. itemsets')
    for app, itemsets in frequent_itemsets_subdfs.items()
}

# Stack the per-app tables; the dictionary keys become the outer index level 'app'.
frequent_itemsets_all_df = pd.concat(frequent_itemsets_counts_subdfs, names=['app'])

# Fix: the inner index level is only the row counter of each per-app table.
# It used to be moved into a column and renamed 'app', which put row numbers
# under the 'app' header. Drop it so the app names (the remaining index,
# named 'app') are what identifies each row in the table and in the CSV.
frequent_itemsets_all_df = frequent_itemsets_all_df.reset_index(level=1, drop=True)
frequent_itemsets_all_df.to_csv('frequent_itemsets_counts.csv')
frequent_itemsets_all_df


### Generating association rules
At this step, association rules are generated without any additional pruning.

In [None]:
# Derive rules from every app's frequent itemsets; a support threshold of 0 keeps all of them.
rules_subdfs = {}
for app, itemsets in frequent_itemsets_subdfs.items():
    app_rules = association_rules(itemsets, metric='support', min_threshold=0.0)
    print(app, len(app_rules))
    rules_subdfs[app] = app_rules

In [None]:
# Write the summary statistics of each app's rules to its own CSV file.
for app, app_rules in rules_subdfs.items():
    print(app)
    summary = app_rules.describe()
    summary.to_csv(f'rules_description_two_itemsets{app}.csv')
    print(len(app_rules))
    
    #print(rules_subdfs[app].describe())


In [None]:
# 2x2 grid, one panel per app (row by row): antecedent vs. consequent support,
# with marker size and colour both driven by confidence.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for panel, app_name in zip(axes.flat, ('tiktok', 'facebook', 'uber', 'youtube')):
    sns.scatterplot(
        ax=panel,
        x="antecedent support",
        y="consequent support",
        size="confidence",
        hue="confidence",
        data=rules_subdfs[app_name],
        palette=sns.color_palette("ch:start=.2,rot=-.3", as_cmap=True),
        s=50,
    )
    panel.set_xlabel("antecedent support", fontsize=14)
    panel.set_ylabel("consequent support", fontsize=14)
    # Match the tick label size to the axis labels
    panel.tick_params(axis='both', which='major', labelsize=14)

plt.show()


In [None]:
# Keep only the rules whose antecedent support does not exceed their consequent support.
for app in rules_subdfs.keys():
    all_rules = rules_subdfs[app]
    print(app, len(all_rules))
    keep = all_rules['antecedent support'] <= all_rules['consequent support']
    rules = all_rules[keep]
    rules_subdfs[app] = rules
    print('greater consequent support than antecedent support', app, len(rules))
#rules_subdfs  

### Optimality of the support-confidence border

In [None]:
# 2x2 grid, one panel per app (row by row): support vs. confidence,
# with marker size and colour both driven by lift.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for panel, app_name in zip(axes.flat, ('tiktok', 'facebook', 'uber', 'youtube')):
    sns.scatterplot(
        ax=panel,
        x="support",
        y="confidence",
        size="lift",
        hue="lift",
        data=rules_subdfs[app_name],
        palette=sns.color_palette("ch:start=.2,rot=-.3", as_cmap=True),
    )
    panel.set_xlabel("support", fontsize=14)
    panel.set_ylabel("confidence", fontsize=14)
    panel.tick_params(axis='both', which='major', labelsize=14)

plt.show()


In [None]:
# 2x2 grid, one panel per app (row by row): Zhang's metric on x, lift on y.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

for panel, app_name in zip(axes.flat, ('tiktok', 'facebook', 'uber', 'youtube')):
    sns.scatterplot(ax=panel, x="zhangs_metric", y="lift", color='#1f77b4', data=rules_subdfs[app_name])
    panel.set_xlabel('zhang', fontsize=14)
    panel.set_ylabel('lift', fontsize=14)
    panel.tick_params(axis='both', which='major', labelsize=14)

# Fix: the title previously named support and confidence, which are not the
# quantities plotted in this figure.
fig.suptitle("Relationship Between Zhang's Metric and Lift in Association Rules", fontsize=16)

plt.show()


## Disassociation rules

In [None]:
#rules_subdfs['tiktok'][rules_subdfs['tiktok']['antecedent support'] <= rules_subdfs['tiktok']['consequent support']]
#rules_subdfs['tiktok'][rules_subdfs['tiktok']['zhangs_metric'] <= 0] 
#rules_subdfs['facebook'][rules_subdfs['facebook']['zhangs_metric'] <= 0]  
#rules_subdfs['uber'][rules_subdfs['uber']['zhangs_metric'] <= 0]  
#rules_subdfs['youtube'][rules_subdfs['youtube']['zhangs_metric'] <= 0] 

# Export, per app, the rules whose Zhang's metric is zero or negative, and print how many there are.
for app, app_rules in rules_subdfs.items():
    print(app)
    non_positive = app_rules[app_rules['zhangs_metric'] <= 0]
    non_positive.to_csv(f'disassociation_rules_{app}.csv')
    print(len(non_positive))


# Strong Association Rules

## TikTok

In [None]:
# TikTok: a rule is strong when all three thresholds are met.
tiktok_rules = rules_subdfs['tiktok']
is_strong = (
    (tiktok_rules['zhangs_metric'] >= 0.5)
    & (tiktok_rules['lift'] >= 2)
    & (tiktok_rules['confidence'] >= 0.4)
)
strong_rules_tiktok = tiktok_rules[is_strong]
print('All rules: ', len(tiktok_rules))
print('Strong rules: ', len(strong_rules_tiktok))

# Show the strong rules, weakest Zhang's metric first
strong_rules_tiktok.sort_values('zhangs_metric', ascending=True)

## Facebook

In [None]:
# Facebook: a rule is strong when all three thresholds are met.
facebook_rules = rules_subdfs['facebook']
is_strong = (
    (facebook_rules['zhangs_metric'] >= 0.4)
    & (facebook_rules['lift'] >= 2)
    & (facebook_rules['confidence'] >= 0.3)
)
strong_rules_facebook = facebook_rules[is_strong]
print('All rules: ', len(facebook_rules))
print('Strong rules: ', len(strong_rules_facebook))

# Show the strong rules, lowest confidence first
strong_rules_facebook.sort_values('confidence', ascending=True)


## Uber

In [None]:
# Uber: a rule is strong when all three thresholds are met.
uber_rules = rules_subdfs['uber']
is_strong = (
    (uber_rules['zhangs_metric'] >= 0.4)
    & (uber_rules['confidence'] >= 0.3)
    & (uber_rules['lift'] >= 2)
)
strong_rules_uber = uber_rules[is_strong]
print('All rules: ', len(uber_rules))
print('Strong rules: ', len(strong_rules_uber))

# Show the strong rules, lowest confidence first
strong_rules_uber.sort_values('confidence', ascending=True)


## Youtube

In [None]:
# YouTube: a rule is strong when all three thresholds are met.
youtube_rules = rules_subdfs['youtube']
is_strong = (
    (youtube_rules['zhangs_metric'] >= 0.7)
    & (youtube_rules['confidence'] >= 0.5)
    & (youtube_rules['lift'] >= 2)
)
strong_rules_youtube = youtube_rules[is_strong]

print('All rules: ', len(youtube_rules))
print('Strong rules: ', len(strong_rules_youtube))

# Show the strong rules, weakest Zhang's metric first
strong_rules_youtube.sort_values('zhangs_metric', ascending=True)


In [None]:
def _unwrap_single_item(itemset):
    """Return one element of a set/frozenset; return any other value unchanged.

    Leaving non-set values alone makes the conversion safe to repeat: a word
    that has already been unwrapped is not reduced to its first character.
    """
    if isinstance(itemset, (set, frozenset)):
        return next(iter(itemset))
    return itemset


def _flatten_rule_sides(rules):
    """Return a copy of ``rules`` with 'antecedents' and 'consequents' unwrapped.

    A new dataframe is returned instead of assigning into ``rules``, because
    ``rules`` is a filtered selection of another dataframe.
    """
    return rules.assign(
        antecedents=rules['antecedents'].apply(_unwrap_single_item),
        consequents=rules['consequents'].apply(_unwrap_single_item),
    )


# Replace each itemset by one of its items and write every app's strong rules to CSV.
strong_rules_tiktok = _flatten_rule_sides(strong_rules_tiktok)
strong_rules_tiktok.to_csv('strong_rules_tiktok.csv')

strong_rules_facebook = _flatten_rule_sides(strong_rules_facebook)
strong_rules_facebook.to_csv('strong_rules_facebook.csv')

strong_rules_uber = _flatten_rule_sides(strong_rules_uber)
strong_rules_uber.to_csv('strong_rules_uber.csv')

strong_rules_youtube = _flatten_rule_sides(strong_rules_youtube)
strong_rules_youtube.to_csv('strong_rules_youtube.csv')