In [2]:
%load_ext autoreload

In [3]:
%autoreload 2

# Imports

In [336]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import torch.optim as optim
from os import listdir
from os.path import isfile, join
from tracin.tracin import (
    save_tracin_checkpoint,
    load_tracin_checkpoint,
    calculate_tracin_influence,
)
import pandas as pd
from LSTM_clean.utils import filter_and_split_data, sequence_generator, load_community_dict, get_communities, shannon_index, gini_simpson_index, num_unique, average_diversity

from LSTM_clean import utils
from LSTM_clean.model import LSTM 
from collections import Counter
import numpy as np
import pickle
from collections import defaultdict
import copy
from torch.optim import SGD
from lstm_wrapper import train_model, get_topk_predictions, train_model2
import matplotlib.pyplot as plt
from copy import deepcopy
import math
import seaborn as sns
import random
import sys
sns.set(style='darkgrid')

# Analysis Tools

In [196]:
def plot_counts(counts: dict[str, int], title='placholder_title', xlabel='x', ylabel='count') -> plt.Figure:
    """Draw a bar chart with one bar per key of ``counts``.

    Args:
        counts: Mapping from bar label to bar height.
        title: Title shown above the chart.
        xlabel: Label for the x-axis.
        ylabel: Label for the y-axis.

    Returns:
        The newly created figure that holds the chart. In a notebook, end the
        calling line with ``;`` if the figure should not be echoed a second
        time as the cell's result.
    """
    fig = plt.figure()

    # Materialise the dict views so matplotlib receives plain sequences.
    plt.bar(list(counts.keys()), list(counts.values()))
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)

    # The signature promises a Figure; hand it back so callers can save or tweak it.
    return fig

def line_plot(hm):
    """Plot the values of ``hm`` against its keys, visiting keys in ascending order."""
    fig = plt.figure()
    axis = plt.axes()
    ordered_keys = sorted(hm.keys())
    ordered_values = [hm[key] for key in ordered_keys]
    axis.plot(ordered_keys, ordered_values)
    
    
class HiddenPrints:
    """Context manager that discards everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is pointed at ``os.devnull``; on exit the stream
    that was active at entry is put back, whether or not the body raised.
    Exceptions raised inside the ``with`` body are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own handle on the sink so __exit__ closes exactly the file
        # opened here, even if the body rebinds sys.stdout in the meantime.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if closing the sink fails.
        sys.stdout = self._original_stdout
        self._devnull.close()

# Loading data

In [197]:
# Data location.
# SAVE_PREFIX distinguishes dataset variants: it is prepended to every file name below.
SAVE_PREFIX = ""
SAVE_FOLDER = os.path.join(os.getcwd(), "data/twitch_sequence/")
SAVE_TRAIN_NAME = f"{SAVE_PREFIX}train.data"
SAVE_VALID_NAME = f"{SAVE_PREFIX}valid.data"
SAVE_TEST_NAME = f"{SAVE_PREFIX}test.data"
SAVE_COMMUNITY_NAME = f"{SAVE_PREFIX}lstm_idx_communities.data"

In [287]:
# Load the three splits from SAVE_FOLDER in train / valid / test order.
# allow_pickle=True unpickles the files, so only load data you produced yourself.
train_data, valid_data, test_data = (
    np.load(os.path.join(SAVE_FOLDER, split_name), allow_pickle=True)
    for split_name in (SAVE_TRAIN_NAME, SAVE_VALID_NAME, SAVE_TEST_NAME)
)

# Experiment Prep

In [288]:
# Lookup that the cells below index by label (community_dict[label]) and pass to get_communities.
community_dict = load_community_dict(os.path.join(SAVE_FOLDER, SAVE_COMMUNITY_NAME))

In [289]:
# Split every training point into its input sequence (index 0) and its ground-truth label (index 1).
# NOTE: `sequences` and `gt` are rebound to the validation and test splits further down,
# so re-run this cell before re-running any of the experiment-building cells.
sequences = [point[0] for point in train_data]
gt = [point[1] for point in train_data]

# Experiment #1 - Remove Filter Bubble Points

In [290]:
def is_filter_bubble(sequence, label):
    """Flag a training point whose label stays inside the sequence's dominant community.

    Returns 1.0 when the community of ``label`` accounts for at least 95% of the
    communities that ``get_communities`` yields for ``sequence``; otherwise 0.0.
    """
    seq_communities = get_communities(sequence, community_dict)
    threshold = 0.95 * len(seq_communities)
    for community, count in Counter(seq_communities).items():
        if count < threshold:
            continue
        # Only look the label up once a community is dominant enough to matter.
        if community == community_dict[label]:
            return 1.0
    return 0.0

In [291]:
train_data_exp1 = []

# Keep a point when its sequence yields at most 30 communities, or when it is
# not flagged as a filter-bubble point.
for idx, seq in enumerate(sequences):
    label = gt[idx]
    is_short = len(get_communities(seq, community_dict)) <= 30
    if is_short or not is_filter_bubble(seq, label):
        train_data_exp1.append(deepcopy([seq, label]))

In [292]:
# Fraction of the original training points dropped by the Experiment #1 filter.
1 - len(train_data_exp1) / len(train_data)

0.12626630233751357

In [293]:
# Absolute number of training points dropped by the Experiment #1 filter.
len(train_data) - len(train_data_exp1)

10582

# Experiment #2 - Remove a random ~12.6% of training data (same number of points as Experiment #1)

In [294]:
# Drop exactly as many points at random as the Experiment #1 filter removed, so the
# two training sets stay size-matched even if the filter (and its count) changes.
num_to_remove = len(train_data) - len(train_data_exp1)
# replace=False guarantees num_to_remove distinct indices.
idx_set = set(np.random.choice(len(train_data), size=num_to_remove, replace=False))

In [295]:
# Keep every training point whose position was not drawn for removal.
train_data_exp2 = [
    train_data[position]
    for position in range(len(train_data))
    if position not in idx_set
]

In [296]:
# Size of the Experiment #2 training set after the random removal.
len(train_data_exp2)

73225

# Experiment #3 - Augment with Breaking Bubble Points (x10)

In [297]:
# Checks label is a brand new community
def is_breaking_bubble(sequence, label):
    """Flag a training point whose label leaves a strongly dominant community.

    Returns 1.0 when some community makes up at least 80% of the communities that
    ``get_communities`` yields for ``sequence`` and the community of ``label`` does
    not occur among them at all; otherwise 0.0.
    """
    seq_communities = get_communities(sequence, community_dict)
    dominance_threshold = 0.8 * len(seq_communities)
    for count in Counter(seq_communities).values():
        if count >= dominance_threshold and community_dict[label] not in seq_communities:
            return 1.0
    return 0.0

In [298]:
# Breaking bubble: collect the training points flagged by is_breaking_bubble.
# Sequences that yield fewer than 8 communities are not considered.
nums = []
train_pts_breaking_bubble = []
for idx, seq in enumerate(sequences):
    if len(get_communities(seq, community_dict)) >= 8:
        flag = is_breaking_bubble(seq, gt[idx])
        nums.append(flag)
        if flag == 1:
            train_pts_breaking_bubble.append([seq, gt[idx]])
len(train_pts_breaking_bubble)

819

## x10

In [299]:
# Start from a private copy of the training data, then append ten independent
# copies of the breaking-bubble points.
train_data_exp3 = deepcopy(train_data)
for repeat in range(10):
    train_data_exp3.extend(deepcopy(train_pts_breaking_bubble))

In [300]:
# Size of the Experiment #3 training set after the ten copies were appended.
len(train_data_exp3)

91997

# Experiment #4 - Augment with random selection of points

In [301]:
# Draw as many random training points as there are breaking-bubble points, so that
# Experiments #3 and #4 add the same amount of data even if that count changes.
# NOTE: np.random.choice samples with replacement by default, so the same point
# can be drawn more than once.
idx_list = np.random.choice(len(train_data), size=len(train_pts_breaking_bubble))
addtl_data = [deepcopy(train_data[val]) for val in idx_list]

## x10

In [302]:
# Start from a private copy of the training data, then append ten independent
# copies of the randomly drawn points.
train_data_exp4 = deepcopy(train_data)
for repeat in range(10):
    train_data_exp4.extend(deepcopy(addtl_data))

In [303]:
# Experiments #3 and #4 must end up the same size so that only the choice of added points differs.
assert len(train_data_exp3) == len(train_data_exp4)

In [304]:
# Size of the Experiment #4 training set after the ten copies were appended.
len(train_data_exp4)

91997

# Experiment #5 - Remove Filter Bubble Points + Augment with Breaking Bubble Points

In [305]:
train_data_exp5 = []

# Remove filter bubble: skip a point only when its sequence yields more than 30
# communities and it is flagged as a filter-bubble point.
for idx, seq in enumerate(sequences):
    label = gt[idx]
    if len(get_communities(seq, community_dict)) > 30 and is_filter_bubble(seq, label):
        continue
    train_data_exp5.append([seq, label])

In [306]:
# Breaking bubble: collect the training points flagged by is_breaking_bubble.
# Sequences that yield fewer than 8 communities are not considered.
nums = []
augment_breaking_bubble = []
for idx, seq in enumerate(sequences):
    if len(get_communities(seq, community_dict)) >= 8:
        flag = is_breaking_bubble(seq, gt[idx])
        nums.append(flag)
        if flag == 1:
            augment_breaking_bubble.append([seq, gt[idx]])

In [307]:
# Putting it together: append ten independent copies of the breaking-bubble
# points to the filtered training set.
for repeat in range(10):
    train_data_exp5.extend(deepcopy(augment_breaking_bubble))

# MODEL TRAINING

In [103]:
# Shared settings for the five experiment training runs below.
EPOCHS = 200
# Passed to train_model2 as output_size; matches the 1743-wide embedding and output
# layers printed in the training logs.
OUTPUT_SIZE = 1743

In [None]:
# Experiment #1 run: train on train_data_exp1, passing valid_data as the second argument.
# The result is indexed with [0] later in the notebook to obtain the trained model.
model_exp1 = train_model2(
    train_data_exp1,
    valid_data,
    epochs=EPOCHS,
    output_size=OUTPUT_SIZE,
    checkpoints_folder_name='checkpoints_exp1',
)

Device is cuda
Train: 67307, Valid: 10464
Model is  LSTM(
  (item_emb): Embedding(1743, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1743, bias=True)
)

Training and testing
train # = 67307, test # = 10464

Epoch 0	Train Loss: 0.014642574947001875	Test MRR: 0.00794307617083256	Test Recall@10: 0.011372324159021407	Elapsed time: 1.3929903507232666
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp1/0.pt
Epoch 5	Train Loss: 0.014374292675939126	Test MRR: 0.049587439012427494	Test Recall@10: 0.10311544342507645	Elapsed time: 6.133145093917847
Epoch 10	Train Loss: 0.013792065473022034	Test MRR: 0.05292938096440965	Test Recall@10: 0.1180237003058104	Elapsed time: 6.514334678649902
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp1/10.pt
Epoch 15	Train Loss: 0.013198858752178338	Test MRR: 0.06017428369712951	Test Re

In [None]:
# Experiment #2 run: train on train_data_exp2, passing valid_data as the second argument.
# The result is indexed with [0] later in the notebook to obtain the trained model.
model_exp2 = train_model2(
    train_data_exp2,
    valid_data,
    epochs=EPOCHS,
    output_size=OUTPUT_SIZE,
    checkpoints_folder_name='checkpoints_exp2',
)

Device is cuda
Train: 67307, Valid: 10464
Model is  LSTM(
  (item_emb): Embedding(1743, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1743, bias=True)
)

Training and testing
train # = 67307, test # = 10464

Epoch 0	Train Loss: 0.01463501072996859	Test MRR: 0.007797949159827661	Test Recall@10: 0.012710244648318042	Elapsed time: 1.4163739681243896
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp1/0.pt
Epoch 5	Train Loss: 0.014391997094753495	Test MRR: 0.04570655887111048	Test Recall@10: 0.09537461773700306	Elapsed time: 6.468673467636108
Epoch 10	Train Loss: 0.013833914789689539	Test MRR: 0.050879581103864216	Test Recall@10: 0.10904051987767584	Elapsed time: 5.817203521728516
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp1/10.pt
Epoch 15	Train Loss: 0.013248563232562433	Test MRR: 0.05902298817970654	Test R

In [178]:
# Experiment #3 run: train on train_data_exp3, passing valid_data as the second argument.
# The result is indexed with [0] later in the notebook to obtain the trained model.
model_exp3 = train_model2(
    train_data_exp3,
    valid_data,
    epochs=EPOCHS,
    output_size=OUTPUT_SIZE,
    checkpoints_folder_name='checkpoints_exp3',
)

Device is cuda
Train: 91997, Valid: 10464
Model is  LSTM(
  (item_emb): Embedding(1743, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1743, bias=True)
)

Training and testing
train # = 91997, test # = 10464

Epoch 0	Train Loss: 0.014578206653935609	Test MRR: 0.01404339200359645	Test Recall@10: 0.02570718654434251	Elapsed time: 1.8234646320343018
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp3/0.pt
Epoch 5	Train Loss: 0.014182381525399074	Test MRR: 0.0491101980748597	Test Recall@10: 0.10712920489296636	Elapsed time: 8.246484994888306
Epoch 10	Train Loss: 0.01334685893771827	Test MRR: 0.05128082649508877	Test Recall@10: 0.10693807339449542	Elapsed time: 8.829949617385864
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp3/10.pt
Epoch 15	Train Loss: 0.012929109832129417	Test MRR: 0.058167599095286064	Test Reca

In [179]:
# Experiment #4 run: train on train_data_exp4, passing valid_data as the second argument.
# The result is indexed with [0] later in the notebook to obtain the trained model.
model_exp4 = train_model2(
    train_data_exp4,
    valid_data,
    epochs=EPOCHS,
    output_size=OUTPUT_SIZE,
    checkpoints_folder_name='checkpoints_exp4',
)

Device is cuda
Train: 91997, Valid: 10464
Model is  LSTM(
  (item_emb): Embedding(1743, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1743, bias=True)
)

Training and testing
train # = 91997, test # = 10464

Epoch 0	Train Loss: 0.014600723027180037	Test MRR: 0.006168047508904193	Test Recall@10: 0.008983180428134556	Elapsed time: 1.8328452110290527
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp4/0.pt
Epoch 5	Train Loss: 0.014209012676208204	Test MRR: 0.04920008655442122	Test Recall@10: 0.11142966360856268	Elapsed time: 8.445865392684937
Epoch 10	Train Loss: 0.013389912196052725	Test MRR: 0.05525328660619977	Test Recall@10: 0.1162079510703364	Elapsed time: 9.025402545928955
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp4/10.pt
Epoch 15	Train Loss: 0.012821518613049296	Test MRR: 0.0754050587558352	Test Rec

In [180]:
# Experiment #5 run: train on train_data_exp5, passing valid_data as the second argument.
# The result is indexed with [0] later in the notebook to obtain the trained model.
model_exp5 = train_model2(
    train_data_exp5,
    valid_data,
    epochs=EPOCHS,
    output_size=OUTPUT_SIZE,
    checkpoints_folder_name='checkpoints_exp5',
)

Device is cuda
Train: 81415, Valid: 10464
Model is  LSTM(
  (item_emb): Embedding(1743, 128, padding_idx=0)
  (LSTM): LSTM(128, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=1743, bias=True)
)

Training and testing
train # = 81415, test # = 10464

Epoch 0	Train Loss: 0.01466070652037341	Test MRR: 0.010603990195566897	Test Recall@10: 0.01930428134556575	Elapsed time: 1.7866673469543457
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp5/0.pt
Epoch 5	Train Loss: 0.014225517490356273	Test MRR: 0.04874048679926645	Test Recall@10: 0.10607798165137615	Elapsed time: 6.662287712097168
Epoch 10	Train Loss: 0.013349653305047152	Test MRR: 0.050466908222527744	Test Recall@10: 0.10196865443425077	Elapsed time: 6.513912200927734
saving checkpoint to /raid/home/myang349/mitigating-filter-bubbles-final/recsys-filterbubbles/checkpoints_exp5/10.pt
Epoch 15	Train Loss: 0.012998475221951236	Test MRR: 0.05462736083650436	Test Re

# Evaluating Recall and Diversity of All Models

## Loading baseline

In [184]:
# Parameter may need to change if you re-ran all the prior pre-processing experiments
OUTPUT_SIZE=1743

# NOTE(review): the device is hard-coded to CUDA, so this cell needs a GPU — confirm that is intended.
device = torch.device("cuda")
print("Device is", device)
# Baseline model m0, built with the same layer sizes the training logs print (128 -> 64 -> OUTPUT_SIZE).
m0 = LSTM(input_size=128, output_size=OUTPUT_SIZE, hidden_dim=64, n_layers=1, device=device) 
m0.LSTM.flatten_parameters()
# An optimizer is built because load_tracin_checkpoint takes one; the second value it returns is discarded below.
optimizer = optim.SGD(m0.parameters(), lr=5e-2, momentum=0.9)
m0.to(device)
# Presumably restores the baseline weights from the epoch-600 checkpoint file — verify against
# load_tracin_checkpoint. `epoch` and `loss` are not read again in the cells visible here.
m0, _, epoch, loss = load_tracin_checkpoint(m0, optimizer, os.path.join(os.getcwd(), "checkpoints/lstm_checkpoint_epoch600.pt"))

Device is cuda


## Loading Others

In [185]:
# Each train_model2 result is indexable; position 0 is used as the trained model.
m1, m2, m3, m4, m5 = (
    run[0] for run in (model_exp1, model_exp2, model_exp3, model_exp4, model_exp5)
)
# Index 0 is the baseline; indices 1-5 line up with Experiments #1-#5.
models = [m0, m1, m2, m3, m4, m5]

## Performance Metrics

In [331]:
def recall(predictions, sequences, gt, k=10):
    """Fraction of examples whose ground-truth item is among the first ``k`` predictions.

    Args:
        predictions: Per-example ranked predictions, aligned with ``gt``; every
            entry that is visited must hold at least ``k`` items.
        sequences: Input sequences the predictions were made for. Only its
            length is used, as a sanity check against ``predictions``.
        gt: Ground-truth item for each example.
        k: Number of leading predictions to search.

    Returns:
        hits / number of ground-truth items, or 0.0 when ``gt`` is empty.

    Raises:
        AssertionError: If ``sequences`` and ``predictions`` differ in length.
        ValueError: If a visited prediction holds fewer than ``k`` items.
    """
    assert len(sequences) == len(predictions)
    hit = 0
    total = 0
    for i, val in enumerate(gt):
        top_k = predictions[i]
        if len(top_k) < k:
            # Report the offending example directly instead of masking it behind
            # an unrelated error.
            raise ValueError(
                f"prediction {i} holds {len(top_k)} items, fewer than k={k} "
                f"(predictions={len(predictions)}, gt={len(gt)})"
            )
        if val in top_k[:k]:
            hit += 1
        total += 1
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    # Original author's note: should be around .505
    return hit / total

def MRR(predictions, sequences, gt):
    """Mean reciprocal rank of the ground-truth items within ``predictions``.

    Args:
        predictions: Per-example ranked predictions, aligned with ``gt``. Each
            entry must support ``in`` and ``.index`` (e.g. a list).
        sequences: Input sequences the predictions were made for. Only its
            length is used, as a sanity check against ``predictions``.
        gt: Ground-truth item for each example.

    Returns:
        The average of 1 / rank (rank counted from 1) over all ground-truth
        items, where an item missing from its prediction entry contributes 0.
        Returns 0.0 when ``gt`` is empty.

    Raises:
        AssertionError: If ``sequences`` and ``predictions`` differ in length.
    """
    assert len(sequences) == len(predictions)
    sum_rr = 0
    total = 0
    for i, val in enumerate(gt):
        ranked = predictions[i]
        if val in ranked:
            sum_rr += 1 / (1 + ranked.index(val))
        total += 1
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    return sum_rr / total

## On Validation Dataset

In [211]:
# Rebind `sequences` / `gt` to the validation split: index 0 of each point is the
# input sequence, index 1 the ground-truth label.
sequences = [point[0] for point in valid_data]
gt = [point[1] for point in valid_data]

In [217]:
# Score every model on the current `sequences` / `gt` and report accuracy and diversity.
for model_idx, candidate in enumerate(models):
    topk = get_topk_predictions(candidate, sequences, k=10)
    gini_scores = average_diversity(topk, community_dict, diversity_metric=gini_simpson_index, k=10)
    print(f"\nModel {model_idx}")
    recall_at_10 = recall(topk, sequences, gt, 10)
    print(f"Recall@10: {recall_at_10}")
    mrr = MRR(topk, sequences, gt)
    print(f"MRR: {mrr}")
    mean_gini = sum(gini_scores) / len(gini_scores)
    print(f"Average Gini-Simpson Index: {mean_gini}")


Model 0
Recall@10: 0.4969418960244648
MRR: 0.22120315761370749
Average Gini-Simpson Index: 0.16448967889908378

Model 1
Recall@10: 0.45670871559633025
MRR: 0.2093223250691707
Average Gini-Simpson Index: 0.1736792813455677

Model 2
Recall@10: 0.465118501529052
MRR: 0.2092364675865241
Average Gini-Simpson Index: 0.15562691131498574

Model 3
Recall@10: 0.44801223241590216
MRR: 0.19728934578418406
Average Gini-Simpson Index: 0.2699216360856218

Model 4
Recall@10: 0.4625382262996942
MRR: 0.2047840138221433
Average Gini-Simpson Index: 0.1619495412844053

Model 5
Recall@10: 0.41771788990825687
MRR: 0.18098624611669176
Average Gini-Simpson Index: 0.324461009174305


## On Test Dataset

In [221]:
# Rebind `sequences` / `gt` to the test split: index 0 of each point is the
# input sequence, index 1 the ground-truth label.
sequences = [point[0] for point in test_data]
gt = [point[1] for point in test_data]

In [222]:
# Score every model on the current `sequences` / `gt` and report accuracy and diversity.
for model_idx, candidate in enumerate(models):
    topk = get_topk_predictions(candidate, sequences, k=10)
    gini_scores = average_diversity(topk, community_dict, diversity_metric=gini_simpson_index, k=10)
    print(f"\nModel {model_idx}")
    recall_at_10 = recall(topk, sequences, gt, 10)
    print(f"Recall@10: {recall_at_10}")
    mrr = MRR(topk, sequences, gt)
    print(f"MRR: {mrr}")
    mean_gini = sum(gini_scores) / len(gini_scores)
    print(f"Average Gini-Simpson Index: {mean_gini}")


Model 0
Recall@10: 0.5000871535645808
MRR: 0.22734512258217984
Average Gini-Simpson Index: 0.16546975771309233

Model 1
Recall@10: 0.4730695485445355
MRR: 0.21869551172976312
Average Gini-Simpson Index: 0.17008366742200012

Model 2
Recall@10: 0.4696705595258846
MRR: 0.2156470045458185
Average Gini-Simpson Index: 0.15761896461565386

Model 3
Recall@10: 0.45633606414502353
MRR: 0.20196015975386666
Average Gini-Simpson Index: 0.27464005577827433

Model 4
Recall@10: 0.47219801289872754
MRR: 0.2109810032897005
Average Gini-Simpson Index: 0.1628656092034182

Model 5
Recall@10: 0.4308000697228517
MRR: 0.18594026245673329
Average Gini-Simpson Index: 0.33233048631687884


# Run Massive Experiment — 10 iterations each

In [317]:
# Reload the community lookup from disk (same call as in the "Experiment Prep" section).
community_dict = load_community_dict(os.path.join(SAVE_FOLDER, SAVE_COMMUNITY_NAME))

In [334]:
# One accumulator per metric and split. The repeated-run loop appends one score per
# run under the dataset's index, so each maps index -> list of scores.
all_valid_recall, all_valid_mrr, all_valid_gini_simpson = (defaultdict(list) for _ in range(3))
all_test_recall, all_test_mrr, all_test_gini_simpson = (defaultdict(list) for _ in range(3))

## We use this to check that our data doesn't get severely messed up over time

In [319]:
# Training sets in experiment order: index 0 is the unmodified data, indices 1-5
# are Experiments #1-#5.
datasets = [
    train_data,
    train_data_exp1,
    train_data_exp2,
    train_data_exp3,
    train_data_exp4,
    train_data_exp5,
]

# These are the correct sizes (shouldn't change) of our experiment datasets,
# keyed by position in `datasets`.
correct_sizes = dict(enumerate([83807, 73225, 73225, 91997, 91997, 81415]))

In [None]:
# Prepare valid/test data once: these lists depend on neither the run nor the dataset.
valid_sequences = [pt[0] for pt in valid_data]
valid_gt = [pt[1] for pt in valid_data]
test_sequences = [pt[0] for pt in test_data]
test_gt = [pt[1] for pt in test_data]

# Ten repeated runs; each run trains one fresh model per dataset and records its
# validation and test metrics under the dataset's index.
# A named loop variable is used because `_` is IPython's "last output" shortcut.
for iteration in range(10):
    for i, data in enumerate(datasets):
        # Shuffle data (in place: the list held in `datasets` is reordered)
        np.random.shuffle(data)
        # Guard against the dataset having grown or shrunk between runs.
        assert correct_sizes[i] == len(data)

        # Train Model (training logs are silenced to keep the cell output short)
        with HiddenPrints():
            model = train_model2(
                data,
                valid_data,
                epochs=EPOCHS,
                output_size=OUTPUT_SIZE,
                checkpoints_folder_name='temp_checkpoints',
            )[0]

        # Compute statistics on valid
        predictions = get_topk_predictions(model, valid_sequences, k=10)
        diversity_scores = average_diversity(predictions, community_dict, diversity_metric=gini_simpson_index, k=10)
        all_valid_recall[i].append(recall(predictions, valid_sequences, valid_gt, 10))
        all_valid_mrr[i].append(MRR(predictions, valid_sequences, valid_gt))
        all_valid_gini_simpson[i].append(sum(diversity_scores)/len(diversity_scores))

        # Compute statistics on test
        predictions = get_topk_predictions(model, test_sequences, k=10)
        diversity_scores = average_diversity(predictions, community_dict, diversity_metric=gini_simpson_index, k=10)
        all_test_recall[i].append(recall(predictions, test_sequences, test_gt, 10))
        all_test_mrr[i].append(MRR(predictions, test_sequences, test_gt))
        all_test_gini_simpson[i].append(sum(diversity_scores)/len(diversity_scores))

        print(f"Finished Iteration {iteration}/10, for Model {i}")
        print(f"Validation Recall@10: {all_valid_recall[i][-1]}")

Finished Iteration 0/10, for Model 0
Validation Recall@10: 0.4644495412844037
Finished Iteration 0/10, for Model 1
Validation Recall@10: 0.4496368501529052
Finished Iteration 0/10, for Model 2
Validation Recall@10: 0.45451070336391436
Finished Iteration 0/10, for Model 3
Validation Recall@10: 0.4518348623853211
Finished Iteration 0/10, for Model 4
Validation Recall@10: 0.4617737003058104
Finished Iteration 0/10, for Model 5
Validation Recall@10: 0.4243119266055046
Finished Iteration 1/10, for Model 0
Validation Recall@10: 0.472381498470948


In [None]:
# Persist the per-dataset metric lists, one pickle per split, each holding
# [recall, MRR, Gini-Simpson] in that order.
for out_path, payload in (
    ('results_on_valid.pickle', [all_valid_recall, all_valid_mrr, all_valid_gini_simpson]),
    ('results_on_test.pickle', [all_test_recall, all_test_mrr, all_test_gini_simpson]),
):
    with open(out_path, 'wb') as f:
        pickle.dump(payload, f)