In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from mlxtend.frequent_patterns import apriori, association_rules
from google.colab import files
import os

# Mine word-level association rules from review texts, separately for each
# rating value, and print the resulting rule tables.

# Name of the review dataset expected in the working directory.
file_name = 'PartB-review-text-setB.csv'

# Apriori / rule-generation thresholds, named so they can be tuned in one place.
MIN_SUPPORT = 0.1
MIN_CONFIDENCE = 0.3

# Fall back to an interactive Colab upload when the file is not already present.
if not os.path.exists(file_name):
    print("Please upload the file:")
    uploaded = files.upload()

    # A cancelled upload yields no entries; fail with a clear message instead
    # of an opaque IndexError on the lookup below.
    if not uploaded:
        raise FileNotFoundError(
            f"No file was uploaded and '{file_name}' does not exist."
        )

    # Use whichever file was uploaded, even if its name differs from the default.
    file_name = next(iter(uploaded))

# load the dataset
data = pd.read_csv(file_name)

# binary=True records word presence/absence per review (not counts), which is
# the transaction encoding apriori expects; English stop words are dropped.
vectorizer = CountVectorizer(binary=True, stop_words="english")

# Maps each rating value to the DataFrame of rules mined for that rating.
rating_association_rules = {}

# process data for each rating group
for rating, group in data.groupby("Rating"):
    review_texts = group["Review Text"].dropna()

    # A group without any review text cannot be vectorized; record an empty
    # result rather than letting fit_transform raise.
    if review_texts.empty:
        rating_association_rules[rating] = pd.DataFrame()
        continue

    # The vectorizer is refit per group, so each rating has its own vocabulary.
    X = vectorizer.fit_transform(review_texts)
    feature_names = vectorizer.get_feature_names_out()

    # One row per review, one boolean column per word.
    review_matrix = pd.DataFrame(X.toarray(), columns=feature_names).astype(bool)

    # apply Apriori
    frequent_itemsets = apriori(
        review_matrix, min_support=MIN_SUPPORT, use_colnames=True
    )

    # association_rules raises ValueError on an empty itemset table, so a
    # rating with nothing above MIN_SUPPORT is stored as "no rules" instead.
    if frequent_itemsets.empty:
        rating_association_rules[rating] = pd.DataFrame()
        continue

    # NOTE(review): num_itemsets is passed explicitly, presumably because the
    # installed mlxtend version expects it -- confirm against that version.
    rules = association_rules(
        frequent_itemsets,
        metric="confidence",
        min_threshold=MIN_CONFIDENCE,
        num_itemsets=None,
    )

    # save rules in dictionary
    rating_association_rules[rating] = rules

# display association rules
for rating, rules in rating_association_rules.items():
    print(f"Association Rules for Rating {rating}:\n")
    print(rules)
    print("\n" + "="*50 + "\n")  # separator for each rating

  and should_run_async(code)


Association Rules for Rating 1:

  antecedents consequents  antecedent support  consequent support   support  \
0      (like)     (looks)            0.340909            0.164336  0.104895   
1     (looks)      (like)            0.164336            0.340909  0.104895   

   confidence     lift  representativity  leverage  conviction  zhangs_metric  \
0    0.307692  1.87234               1.0  0.048872    1.207071       0.706897   
1    0.638298  1.87234               1.0  0.048872    1.822193       0.557531   

    jaccard  certainty  kulczynski  
0  0.262009   0.171548    0.472995  
1  0.262009   0.451211    0.472995  


Association Rules for Rating 2:

  antecedents consequents  antecedent support  consequent support   support  \
0     (dress)      (like)            0.246765            0.355823  0.104436   
1      (just)      (like)            0.254159            0.355823  0.106285   

   confidence      lift  representativity  leverage  conviction  \
0    0.423221  1.189416           

Minimum support: 0.1,
Minimum confidence: 0.3

# Findings and Interpretation:
**Low Rating (Rating 1 and 2)**
- Frequent Association: looks like, just like
- Interpretation: Customers who gave low ratings frequently used the phrases "just like" and "looks like", which usually reflects their dissatisfaction with the product's style.


**Medium Rating (Rating 3)**
- Frequent Association: size fit
- Interpretation: Customers indicate that the size of the product fits, but there may be room for further improvement to the product.


**High Rating (Rating 4 and 5)**
- Frequent Association: size fit, love fit, great love
- Interpretation: Besides "size fit", which suggests general satisfaction with sizing, there are also words that indicate a strong positive impression of the product.