In [15]:
import ast
import time

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder




In [13]:
# Load the per-movie transactions; each 'tags' cell is parsed with
# ast.literal_eval so it can be used as a Python list of tags below.
df_transactions = pd.read_csv('datasets/transactions.csv')
df_transactions['tags'] = df_transactions['tags'].apply(ast.literal_eval)

# One transaction (list of tags) per movie, in row order.
transactions = list(df_transactions['tags'])

# One-hot encode: one boolean column per distinct tag, one row per movie.
te = TransactionEncoder()
te.fit(transactions)
df_onehot = pd.DataFrame(te.transform(transactions), columns=te.columns_)

# Quick sanity check of the encoded matrix.
print(df_onehot.head())
print(f"Total tags (columns): {df_onehot.shape[1]}")


   1930s  1970s  1980s    80s  absurd  action  action packed  adaptation  \
0  False  False  False  False   False   False          False       False   
1  False  False  False  False   False   False          False       False   
2  False  False  False  False   False   False          False       False   
3  False  False  False  False   False   False          False       False   
4  False  False  False  False   False   False          False       False   

   adapted from:book  adolescence  ...  visuals    war  wartime  weapons  \
0              False        False  ...    False  False    False    False   
1              False        False  ...    False  False    False    False   
2              False        False  ...    False  False    False    False   
3              False        False  ...    False  False    False    False   
4              False        False  ...    False  False    False    False   

   weird  whimsical  witty  women  world politics  writers  
0  False      False  Fals

In [25]:
# Mine frequent itemsets from the one-hot matrix, derive association rules,
# and summarise the rules by confidence.
# `time` is imported in the notebook's import cell; time.perf_counter() is
# used because it is a monotonic clock intended for measuring elapsed time.

print("Mining frequent itemsets...")
start = time.perf_counter()

frequent_itemsets = apriori(
    df_onehot,
    min_support=0.01,     # ~138 movies
    max_len=3,
    use_colnames=True,
    verbose=1
)

elapsed = time.perf_counter() - start
print(f"‚úì Found {len(frequent_itemsets)} frequent itemsets in {elapsed:.2f}s")

print("\nGenerating association rules...")
start = time.perf_counter()

rules = association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.3  # Lower threshold to get more rules
)

elapsed = time.perf_counter() - start
print(f"‚úì Generated {len(rules)} rules in {elapsed:.2f}s")

# Histogram of rule confidences: half-open bins [lower, upper) plus an open
# top bin. The lowest edge matches the min_threshold used above.
rule_confidence = rules['confidence']
print("\nRules by confidence range:")
for lower, upper in [(0.3, 0.4), (0.4, 0.5), (0.5, 0.6), (0.6, 0.7)]:
    in_bin = ((rule_confidence >= lower) & (rule_confidence < upper)).sum()
    print(f"  {lower}-{upper}: {in_bin}")
print(f"  0.7+:    {(rule_confidence >= 0.7).sum()}")

print("\nTop 10 rules by confidence:")
print(rules.nlargest(10, 'confidence')[['antecedents', 'consequents', 'confidence', 'support', 'lift']])

Mining frequent itemsets...
Processing 69081 combinations | Sampling itemset size 32
‚úì Found 824 frequent itemsets in 7.24s

Generating association rules...
‚úì Generated 916 rules in 0.01s

Rules by confidence range:
  0.3-0.4: 246
  0.4-0.5: 167
  0.5-0.6: 120
  0.6-0.7: 120
  0.7+:    263

Top 10 rules by confidence:
                        antecedents   consequents  confidence   support  \
384   (action packed, fight scenes)      (action)    1.000000  0.016798   
0                   (action packed)      (action)    0.997442  0.028238   
380          (chase, action packed)      (action)    0.996063  0.018319   
469  (computer animation, animated)   (animation)    0.993007  0.010282   
721            (teen, girlie movie)  (teen movie)    0.993007  0.010282   
463             (cartoon, animated)   (animation)    0.991667  0.017233   
38                       (animated)   (animation)    0.981873  0.023532   
594              (splatter, creepy)      (horror)    0.979310  0.010282   
8

## Objective 1: Binary Transactional Format

The continuous relevance scores (0-1) have been transformed into binary transactions:
- **Method**: Select top N tags per movie based on relevance threshold (0.4-1.0)
- **Result**: Each movie → list of present tags (binary: present=1, absent=0)
- **One-hot encoding**: TransactionEncoder creates the binary matrix for Apriori

This enables pattern mining on semantic relationships rather than just popularity.

## Objective 2: High-Confidence & High-Lift Rule Analysis

Find semantic relationships between descriptive tags (e.g., "Tarantino-esque" → "non-linear timeline").

In [26]:
# Split the mined rules by two quality thresholds: confidence >= 0.7 and lift >= 2.0.
is_confident = rules['confidence'] >= 0.7
has_high_lift = rules['lift'] >= 2.0

high_confidence = rules[is_confident]
high_lift = rules[has_high_lift]
quality_rules = rules[is_confident & has_high_lift]

print(f"Rules with confidence >= 0.7: {len(high_confidence)}")
print(f"Rules with lift >= 2.0: {len(high_lift)}")
print(f"Rules with BOTH: {len(quality_rules)}")

# Ten strongest rules under each metric, restricted to the columns that are printed.
rule_columns = ['antecedents', 'consequents', 'confidence', 'support', 'lift']
top_lift = rules.nlargest(10, 'lift')[rule_columns]
top_conf = rules.nlargest(10, 'confidence')[rule_columns]

# Print both rankings in the same IF/THEN layout: by lift first, then by confidence.
ranked_sections = (
    ("\n=== Top 10 Most Surprising Relationships (by Lift) ===", top_lift),
    ("\n\n=== Top 10 Most Reliable Relationships (by Confidence) ===", top_conf),
)
for heading, ranked_rules in ranked_sections:
    print(heading)
    for _, rule in ranked_rules.iterrows():
        lhs = ', '.join(rule['antecedents'])
        rhs = ', '.join(rule['consequents'])
        print(f"\nIF [{lhs}]")
        print(f"THEN [{rhs}]")
        print(f"  Confidence: {rule['confidence']:.2%} | Lift: {rule['lift']:.2f} | Support: {rule['support']:.2%}")

Rules with confidence >= 0.7: 263
Rules with lift >= 2.0: 907
Rules with BOTH: 263

=== Top 10 Most Surprising Relationships (by Lift) ===

IF [french]
THEN [france]
  Confidence: 73.08% | Lift: 42.05 | Support: 1.24%

IF [france]
THEN [french]
  Confidence: 71.25% | Lift: 42.05 | Support: 1.24%

IF [biopic]
THEN [biography]
  Confidence: 56.84% | Lift: 36.86 | Support: 1.17%

IF [biography]
THEN [biopic]
  Confidence: 76.06% | Lift: 36.86 | Support: 1.17%

IF [gay]
THEN [gay character]
  Confidence: 56.88% | Lift: 35.54 | Support: 1.11%

IF [gay character]
THEN [gay]
  Confidence: 69.23% | Lift: 35.54 | Support: 1.11%

IF [us history]
THEN [history]
  Confidence: 87.77% | Lift: 35.34 | Support: 1.19%

IF [history]
THEN [us history]
  Confidence: 48.10% | Lift: 35.34 | Support: 1.19%

IF [cartoon, animals]
THEN [animated]
  Confidence: 84.00% | Lift: 35.05 | Support: 1.06%

IF [animated]
THEN [cartoon, animals]
  Confidence: 44.41% | Lift: 35.05 | Support: 1.06%


=== Top 10 Most Relia

## Objective 3: Recommendation Diversity Demonstration

Use association rules to recommend diverse films based on semantic trait linkage

In [27]:
def recommend_by_rules(movie_title, rules_df, transactions_df, top_n=10, min_confidence=0.6):
    """
    Recommend movies based on association rules from a seed movie's tags.

    Every rule with confidence >= `min_confidence` whose antecedents are all
    among the seed movie's tags contributes its consequent tags (only those the
    seed does not already have). Each such tag is scored as
    ``average lift * number of contributing rules``; every other movie is scored
    by the sum of the scores of the recommended tags it carries. A summary of
    each stage is printed as a side effect.

    Parameters
    ----------
    movie_title : str
        Value looked up in ``transactions_df['title']``; the first matching row
        is the seed, and all rows with this title are excluded from the results.
    rules_df : pandas.DataFrame
        Rules with 'antecedents', 'consequents', 'confidence' and 'lift' columns.
    transactions_df : pandas.DataFrame
        One row per movie with 'title' and 'tags' (iterable of tags) columns.
    top_n : int, default 10
        Maximum number of recommendations printed and returned.
    min_confidence : float, default 0.6
        Rules with lower confidence are ignored.

    Returns
    -------
    list of dict or None
        ``None`` when the title is not found, ``[]`` when no rule yields a new
        tag, otherwise up to `top_n` dicts with keys 'title', 'score',
        'matching_tags' (a set) and 'tag_count', highest score first.
    """
    # Get seed movie tags
    seed_movie = transactions_df[transactions_df['title'] == movie_title]
    if seed_movie.empty:
        print(f"Movie '{movie_title}' not found")
        return None

    seed_tags = set(seed_movie.iloc[0]['tags'])
    print(f"Seed movie: {movie_title}")
    # Sets of strings iterate in a hash-dependent order that changes between
    # kernel sessions; sort so the displayed sample is reproducible.
    print(f"Tags: {', '.join(sorted(seed_tags)[:10])}\n")

    # Filter rules by minimum confidence
    filtered_rules = rules_df[rules_df['confidence'] >= min_confidence]

    # Aggregate every applicable rule per consequent tag (not just the best one).
    recommended_tags = {}  # tag -> {'lifts': [...], 'confidences': [...]}
    rule_count = 0
    rule_fields = zip(
        filtered_rules['antecedents'],
        filtered_rules['consequents'],
        filtered_rules['lift'],
        filtered_rules['confidence'],
    )
    for antecedents, consequents, lift, confidence in rule_fields:
        # A rule applies only if the seed movie has all of its antecedent tags.
        if not set(antecedents).issubset(seed_tags):
            continue
        rule_count += 1
        # Skip tags the seed already has; sorted so dict insertion order
        # (and therefore tie-breaking below) is deterministic.
        for tag in sorted(set(consequents) - seed_tags):
            stats = recommended_tags.setdefault(tag, {'lifts': [], 'confidences': []})
            stats['lifts'].append(lift)
            stats['confidences'].append(confidence)

    print(f"Applied {rule_count} rules from seed tags")
    print(f"Discovered {len(recommended_tags)} new related tags\n")

    if not recommended_tags:
        print("‚ö†Ô∏è  No rules found. Try lowering min_confidence or use more rules.")
        return []

    # tag -> (score, avg confidence, avg lift, rule count)
    tag_scores = {}
    for tag, stats in recommended_tags.items():
        count = len(stats['lifts'])
        avg_lift = sum(stats['lifts']) / count
        avg_conf = sum(stats['confidences']) / count
        # Combined score: average lift * rule count (tags appearing in more rules rank higher)
        tag_scores[tag] = (avg_lift * count, avg_conf, avg_lift, count)

    # Show top recommended tags
    top_rec_tags = sorted(tag_scores.items(), key=lambda item: item[1][0], reverse=True)[:10]
    print("Top recommended tags (by aggregated score):")
    for tag, (_, avg_conf, avg_lift, count) in top_rec_tags:
        print(f"  ‚Ä¢ {tag}: {count} rules, avg lift={avg_lift:.2f}, avg conf={avg_conf:.2f}")

    # Find movies with recommended tags (excluding the seed movie).
    candidates = []
    recommended_tag_set = set(recommended_tags)
    for title, tags in zip(transactions_df['title'], transactions_df['tags']):
        if title == movie_title:
            continue

        overlap = recommended_tag_set.intersection(tags)
        if not overlap:
            continue

        candidates.append({
            'title': title,
            # Summed in sorted tag order so movies with the same matching tags
            # get bit-identical float scores regardless of set iteration order.
            'score': sum(tag_scores[tag][0] for tag in sorted(overlap)),
            'matching_tags': overlap,
            'tag_count': len(overlap),
        })

    # Highest score first; the sort is stable, so ties keep transactions_df order.
    candidates.sort(key=lambda rec: rec['score'], reverse=True)
    top_candidates = candidates[:top_n]

    print(f"\n=== Top {top_n} Diverse Recommendations ===")
    for i, rec in enumerate(top_candidates, 1):
        print(f"\n{i}. {rec['title']}")
        print(f"   Score: {rec['score']:.2f} ({rec['tag_count']} matching tags)")
        print(f"   Via tags: {', '.join(sorted(rec['matching_tags'])[:5])}")

    return top_candidates


# Run the rule-based recommender for one seed movie, with the confidence
# cut-off lowered to 0.3 (the function's default is 0.6).
example_movie = "Pulp Fiction (1994)"
recommendations = recommend_by_rules(
    movie_title=example_movie,
    rules_df=rules,
    transactions_df=df_transactions,
    top_n=10,
    min_confidence=0.3,
)

Seed movie: Pulp Fiction (1994)
Tags: masterpiece, gratuitous violence, dark humor, storytelling, violence, gangsters, stylish, violent, hit men, imdb top 250

Applied 11 rules from seed tags
Discovered 5 new related tags

Top recommended tags (by aggregated score):
  ‚Ä¢ brutality: 4 rules, avg lift=8.81, avg conf=0.45
  ‚Ä¢ gangster: 1 rules, avg lift=26.61, avg conf=0.63
  ‚Ä¢ crime: 1 rules, avg lift=9.36, avg conf=0.39
  ‚Ä¢ oscar (best directing): 1 rules, avg lift=8.47, avg conf=0.41
  ‚Ä¢ criterion: 1 rules, avg lift=4.95, avg conf=0.57

=== Top 10 Diverse Recommendations ===

1. Kiss of Death (1995)
   Score: 71.22 (3 matching tags)
   Via tags: brutality, crime, gangster

2. Truth or Consequences, N.M. (1997)
   Score: 71.22 (3 matching tags)
   Via tags: brutality, crime, gangster

3. General, The (1998)
   Score: 71.22 (3 matching tags)
   Via tags: brutality, crime, gangster

4. Chopper (2000)
   Score: 71.22 (3 matching tags)
   Via tags: brutality, crime, gangster

5. Ac

In [None]:
# Demonstrate diversity: Compare rule-based vs simple tag overlap
def simple_tag_overlap_recommendations(movie_title, transactions_df, top_n=10):
    """
    Baseline: Recommend by direct tag overlap (no rules).

    Each other movie is scored by how many tags it shares with the seed movie.

    Parameters
    ----------
    movie_title : str
        Value looked up in ``transactions_df['title']``; the first matching row
        is the seed, and all rows with this title are excluded from the results.
    transactions_df : pandas.DataFrame
        One row per movie with 'title' and 'tags' (iterable of tags) columns.
    top_n : int, default 10
        Maximum number of recommendations returned.

    Returns
    -------
    list of dict or None
        ``None`` when the title is not found (a message is printed), otherwise
        up to `top_n` dicts with keys 'title', 'score' (number of shared tags)
        and 'matching_tags' (a set), highest score first.
    """
    seed_movie = transactions_df[transactions_df['title'] == movie_title]
    if seed_movie.empty:
        # Report the miss with the same message recommend_by_rules prints,
        # instead of returning None silently.
        print(f"Movie '{movie_title}' not found")
        return None

    seed_tags = set(seed_movie.iloc[0]['tags'])

    candidates = []
    for title, tags in zip(transactions_df['title'], transactions_df['tags']):
        if title == movie_title:
            continue

        overlap = seed_tags.intersection(tags)
        if overlap:
            candidates.append({
                'title': title,
                'score': len(overlap),
                'matching_tags': overlap,
            })

    # Highest overlap first; the sort is stable, so ties keep transactions_df order.
    candidates.sort(key=lambda rec: rec['score'], reverse=True)
    return candidates[:top_n]

# Compare approaches: run the baseline and the rule-based recommender on the
# same seed movie and report how many rule-based titles the baseline missed.
print("=" * 80)
print("COMPARISON: Rule-Based vs Simple Overlap")
print("=" * 80)

test_movie = "Pulp Fiction (1994)"  # Choose a well-known movie

print("\n### BASELINE: Simple Tag Overlap ###")
baseline_recs = simple_tag_overlap_recommendations(test_movie, df_transactions, top_n=5)
if baseline_recs:
    print(f"Seed: {test_movie}")
    for i, rec in enumerate(baseline_recs, 1):
        print(f"{i}. {rec['title']} (overlap: {rec['score']})")

print("\n### RULE-BASED: Semantic Association Rules ###")
rule_recs = recommend_by_rules(test_movie, rules, df_transactions, top_n=5, min_confidence=0.6)

# Calculate diversity: unique movies between approaches.
# Skipped when either recommender returned None or an empty list.
if baseline_recs and rule_recs:
    baseline_titles = {r['title'] for r in baseline_recs}
    rule_titles = {r['title'] for r in rule_recs}
    unique_rule = rule_titles - baseline_titles

    print(f"\nüéØ Diversity Gain: {len(unique_rule)}/{len(rule_recs)} recommendations are NEW (not in baseline)")
    if unique_rule:
        # Set iteration order for strings depends on the per-process hash seed;
        # sort so the three titles shown are the same on every run.
        print(f"   Unique discoveries: {', '.join(sorted(unique_rule)[:3])}")

COMPARISON: Rule-Based vs Simple Overlap

### BASELINE: Simple Tag Overlap ###

### RULE-BASED: Semantic Association Rules ###
Movie 'The Matrix (1999)' not found
