In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import sparse
import seaborn as sns
import sys
import os
import math
import time
import random
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
import ast
warnings.filterwarnings("ignore")

"append parent directory so that we can import sibling module"

parent_dir = os.path.dirname(os.path.abspath('.'))
sys.path.append(parent_dir)

In [2]:
movies = pd.read_csv('../data/movies_w_description.csv')
movies.drop_duplicates('movieId', inplace=True)
movies['genres'] = movies['genres'].map(ast.literal_eval)
movies['keywords'] = movies['keywords'].map(ast.literal_eval)
movies.head(2)

Unnamed: 0,movieId,overview,genres,title,keywords
0,1,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva..."
1,2,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'..."


In [3]:
from sklearn.model_selection import train_test_split

In [4]:
data_tr, data_te = train_test_split(movies, test_size=0.2)
len(data_tr), len(data_te)

(33575, 8394)

In [6]:
data_tr = data_tr.reset_index().drop('index', 1)
data_tr.head()

Unnamed: 0,movieId,overview,genres,title,keywords
0,106168,"Set in 1960, the story follows the efforts of ...","[TV Movie, Drama, History, War]",The Man Who Captured Eichmann,[]
1,150882,Life and adventures of a few Polish immigrants...,[Comedy],Happy New York,[]
2,133828,"When a cop-killer goes free, a detective's sea...","[Thriller, Drama, Action]",Mistrial,[]
3,148404,The President is the story of a dictator of an...,[Drama],The President,[dictator]
4,119946,In the early days of the industrial revolution...,"[Drama, History, War]",The New Babylon,[soviet union]


In [7]:
data_tr.tail()

Unnamed: 0,movieId,overview,genres,title,keywords
33570,61352,When straight arrow FBI agent Roy Clayton head...,"[Drama, Action, Thriller, Crime]",Traitor,"[bomb, prison, alcohol, kidnapping, traitor, d..."
33571,97913,"Wreck-It Ralph is the 9-foot-tall, 643-pound v...","[Family, Animation, Comedy, Adventure]",Wreck-It Ralph,"[support group, product placement, bullying, r..."
33572,7408,Abbott and Costello's version of the famous fa...,"[Comedy, Family, Fantasy]",Jack and the Beanstalk,"[problem child, falling from height, black and..."
33573,101448,My Last Five Girlfriends traces the romantic j...,"[Romance, Comedy]",My Last Five Girlfriends,[independent film]
33574,161032,A young boy plants some strange seeds and they...,"[Thriller, Action, Drama]",The Grandmother,"[surreal, surrealism, dark]"


In [3]:
movies.shape

(41969, 5)

In [4]:
unique_sid = list()
with open('../data/processed/4_5_5_False_15000/unique_sid.txt', 'r') as f:
    for line in f:
        unique_sid.append(int(line.strip()))
        
len(unique_sid)

17493

In [5]:
movies = movies[movies['movieId'].isin(unique_sid)]
movies.shape

(17493, 5)

In [7]:
from torch.utils.data import DataLoader, random_split

In [8]:
train_data, val_data = random_split(movies, [15000, 2493])

In [9]:
train_data.shape, val_data.shape

AttributeError: 'Subset' object has no attribute 'shape'

In [12]:
movies.head(2)
movies.rename(columns={'overview': 'text'}, inplace=True)
movies.head(2)

Unnamed: 0,movieId,text,genres,title,keywords
0,1,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva..."
1,2,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'..."


In [13]:
genres_list = [x for sublist in movies.genres.tolist() for x in sublist]
genres = set(genres_list)
genre2id = dict((pid, i) for (i, pid) in enumerate(genres))
genre2id

{'Thriller': 0,
 'Foreign': 1,
 'Adventure': 2,
 'TV Movie': 3,
 'Mystery': 4,
 'Music': 5,
 'War': 6,
 'Horror': 7,
 'Science Fiction': 8,
 'Fantasy': 9,
 'Documentary': 10,
 'Comedy': 11,
 'Animation': 12,
 'Romance': 13,
 'Drama': 14,
 'Family': 15,
 'Western': 16,
 'Action': 17,
 'Crime': 18,
 'History': 19}

In [14]:
num_genre = len(genres)
num_genre

20

In [16]:
def gen(x, num_genre):
    tmp = [0.0]*num_genre
    for g in x:
        tmp[genre2id[g]] = 1.0
        
    return tmp

movies['label'] = movies['genres'].map(lambda x : gen(x, genre2id, num_genre))
movies.head(2)

Unnamed: 0,movieId,text,genres,title,keywords,label
0,1,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,2,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [17]:
movies.iloc[0].genres

['Animation', 'Comedy', 'Family']

In [18]:
movies.iloc[0].label

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [8]:
data = movies['overview']
data.head(2)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
Name: overview, dtype: object

In [9]:
data.name = 'text'
data.head(2)

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
Name: text, dtype: object

In [10]:
data.to_csv('../data/mask_language_model/text.csv', index=False)

In [11]:
from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'sklearn'

In [28]:
index = list(range(data.shape[0]))

In [29]:
np.random.shuffle(index)

In [36]:
train_data = data.iloc[index[:-3000]]
train_data.shape

(14493,)

In [37]:
train_data.head(2)

15748    The Fighter, is a drama about boxer "Irish" Mi...
14494    Three bad girls (a down-and-out stripper, a dr...
Name: text, dtype: object

In [38]:
vad_data = data.iloc[index[-3000:]]
vad_data.shape

(3000,)

In [39]:
train_data.to_csv('../data/mask_language_model/train_data.csv', index=False)
vad_data.to_csv('../data/mask_language_model/vad_data.csv', index=False)

In [8]:
14493 + 3000

17493

In [10]:
17493*0.15

2623.95

In [1]:
from sklearn.metrics import classification_report

In [2]:
import torch

In [31]:
y_pred = torch.FloatTensor( [0.6, 0.1, 0.8, 0.9]).to('cuda')
y_pred

tensor([0.6000, 0.1000, 0.8000, 0.9000], device='cuda:0')

In [32]:
torch.mean(y_pred)

tensor(0.6000, device='cuda:0')

In [18]:
label = torch.FloatTensor([[0.0, 0.0, 1.0, 0.0], [0.0, 0.0, 1.0, 0.0]]).to('cuda')
label

tensor([[0., 0., 1., 0.],
        [0., 0., 1., 0.]], device='cuda:0')

In [25]:
bi_pred = (y_pred > 0.5).float()
bi_pred

tensor([[0., 1., 1., 0.],
        [1., 0., 1., 1.]], device='cuda:0')

In [21]:
bi_pred += label
bi_pred

tensor([[0., 1., 2., 0.],
        [1., 0., 2., 1.]], device='cuda:0')

In [30]:
((bi_pred == 2).sum() + (bi_pred == 0).sum()).float() / (y_pred.shape[0] * y_pred.shape[1])

tensor(0.3750, device='cuda:0')

In [28]:
print(classification_report(label.cpu(), bi_pred.cpu(), digits=3))

              precision    recall  f1-score   support

           0      0.000     0.000     0.000         0
           1      0.000     0.000     0.000         0
           2      1.000     1.000     1.000         2
           3      0.000     0.000     0.000         0

   micro avg      0.400     1.000     0.571         2
   macro avg      0.250     0.250     0.250         2
weighted avg      1.000     1.000     1.000         2
 samples avg      0.417     1.000     0.583         2



  _warn_prf(average, modifier, msg_start, len(result))


## Load trained model and compute embedding

In [1]:
start = 17000
end = start + 1000

In [1]:
import os
import sys

"append parent directory so that we can import sibling module"

parent_dir = os.path.dirname(os.path.abspath('.'))
sys.path.append(parent_dir)

In [2]:
from data_loader.movie_loader import MovieDataset
from model.model import MovieEmbedding
from model.batch import TokenizersCollateFn
from model.metric import compute_accuracy
from utils.utils import plot_results

import numpy as np
import pandas as pd
import random
import argparse
import json
import os
import logging
import time
import ast
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
import pickle
import torch
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

In [4]:
save_path = "../saved/movie_embedding_1"

In [5]:
config = json.load(open(os.path.join(save_path, "config/config.json")))
config

{'pretrained_model_name_or_path': 'roberta-base',
 'pretrained_dir_cache': 'data/pretrained_model/roberta-base',
 'pretrained_tokenizer_name_or_path': 'roberta-base',
 'tokenizer_dir_cache': 'data/tokenizer/roberta-base',
 'tokenizer_use_fast': 1,
 'sequence_max_length': 256,
 'padding': 'longest',
 'truncation': 'longest_first',
 'classifier_hidden_size': 512,
 'dropout': 0.2,
 'data_dir': 'data/movies_w_description.csv',
 'selected_id_dir': 'data/processed/4_5_5_False_15000/unique_sid.txt',
 'save_dir': 'saved/movie_embedding_1',
 'n_classes': 20,
 'device': 'cuda',
 'seed': 2021,
 'tr_batch_size': 16,
 'vad_batch_size': 16,
 'learning_rate': 8e-05,
 'l2_reg': 0,
 'epochs': 20,
 'warmup_steps': 500,
 'freeze_encoder_after_epochs': 2,
 'accumulate_grad_batches': 8,
 'msg': 'try freeze_encoder_after_steps + RReLU'}

In [6]:
config['tokenizer_dir_cache'] = '../data/tokenizer/roberta-base'
config['pretrained_tokenizer_name_or_path'] = 'roberta-base'

config['pretrained_model_name_or_path'] = 'roberta-base'
config['pretrained_dir_cache'] = '../data/pretrained_model/roberta-base'

In [7]:
movies = pd.read_csv("../" + config['data_dir'])
movies.drop_duplicates('movieId', inplace=True)
movies.rename(columns={'overview': 'text'}, inplace=True)
movies['genres'] = movies['genres'].map(ast.literal_eval)
movies['keywords'] = movies['keywords'].map(ast.literal_eval)
movies.shape

(41969, 5)

In [8]:
unique_sid = list()
with open("../" + config['selected_id_dir'], 'r') as f:
    for line in f:
        unique_sid.append(int(line.strip()))

In [9]:
mid = dict((i, s) for (s, i) in enumerate(unique_sid))

In [10]:
movies = movies[movies['movieId'].isin(unique_sid)]
movies.shape

(17493, 5)

In [11]:
movies['mid'] = movies['movieId'].map(mid)
movies.tail(2)

Unnamed: 0,movieId,text,genres,title,keywords,mid
41831,175795,When the pressure to be royal becomes too much...,"[TV Movie, Family, Action, Comedy, Music, Adve...",Descendants 2,"[fairy tale, villain, musical, teen movie, tee...",17185
41977,176211,A closeted boy runs the risk of being outed by...,"[Family, Animation, Romance, Comedy]",In a Heartbeat,"[love, teenager, lgbt, short]",11276


In [12]:
movies.sort_values('mid', inplace=True)
movies = movies.reset_index().drop('index', 1)
movies.tail(2)

Unnamed: 0,movieId,text,genres,title,keywords,mid
17491,69187,A remake of the hugely successful Malayalam fi...,"[Comedy, Drama]",Billu,"[barbershop, actors, remake of malayalam film]",17491
17492,135759,A historical drama set in the Koryo dynasty an...,[Drama],A Frozen Flower,"[adultery, sword fight, romance, betrayal, tra...",17492


In [13]:
tokenizer = TokenizersCollateFn(config)

In [14]:
model = MovieEmbedding(config)

model.load_state_dict(torch.load(os.path.join(save_path, "model/model.pt")))

<All keys matched successfully>

In [15]:
df = movies[start:end]
df.shape

(493, 6)

In [16]:
embs = []

for index, row in df.iterrows():
    text = row['text']
    encoded = tokenizer.tokenizer.encode_plus(text, 
                                              add_special_tokens=True, 
                                              return_tensors='pt', 
                                              return_attention_mask=True)

    out = model.encoder(encoded.input_ids, encoded.attention_mask)
    embs.append(out[0][:, 0, :].tolist())

In [17]:
name = save_path + "/embs/" + str(start) + ".pck"
with open(name, 'wb') as fp:
    pickle.dump(embs, fp)

In [18]:
# with open (name, 'rb') as fp:
#     hehe = pickle.load(fp)

In [78]:
embs = []

In [116]:
start = embs.shape[0]
name = save_path + "/embs/" + str(start) + ".pck"
with open (name, 'rb') as fp:
    hehe = np.array(pickle.load(fp))

In [117]:
embs = np.concatenate((embs, hehe.reshape(hehe.shape[0], 768)))
embs.shape

(17493, 768)

In [118]:
name = save_path + "/embs/embedding.pck"
with open(name, 'wb') as fp:
    pickle.dump(embs, fp)

In [131]:
name

'../saved/movie_embedding_1/embs/embedding.pck'

In [134]:
with open (name, 'rb') as fp:
    hehe = torch.tensor(pickle.load(fp))

In [135]:
hehe.shape

torch.Size([17493, 768])

In [121]:
hehe[0][:10]

array([ 0.33821532, -1.37877941,  0.34697649, -0.93654454, -0.07133052,
       -1.56192374, -0.08415685, -0.55216378,  0.861265  ,  0.15836471])

In [122]:
haha = torch.from_numpy(hehe)
haha[0][:10]

tensor([ 0.3382, -1.3788,  0.3470, -0.9365, -0.0713, -1.5619, -0.0842, -0.5522,
         0.8613,  0.1584], dtype=torch.float64)

In [123]:
from torch import nn

In [128]:
embs = nn.Embedding(17493, 768)

In [126]:
hehe.shape

(17493, 768)

In [130]:
embs.weight.data = torch.from_numpy(hehe)

# Test movie embedding

In [2]:
movies = pd.read_csv('../data/movies_w_description.csv')
movies.drop_duplicates('movieId', inplace=True)
movies['genres'] = movies['genres'].map(ast.literal_eval)
movies['keywords'] = movies['keywords'].map(ast.literal_eval)
movies.head(2)

Unnamed: 0,movieId,overview,genres,title,keywords
0,1,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva..."
1,2,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'..."


In [3]:
unique_sid = list()
with open('../data/processed/4_5_5_False_15000/unique_sid.txt', 'r') as f:
    for line in f:
        unique_sid.append(int(line.strip()))
        
len(unique_sid)

17493

In [4]:
movies = movies[movies['movieId'].isin(unique_sid)]
movies.shape

(17493, 5)

In [5]:
mid = dict((i, s) for (s, i) in enumerate(unique_sid))

In [6]:
movies['mid'] = movies['movieId'].map(mid)
movies.sort_values('mid', inplace=True)
movies = movies.reset_index().drop('index', 1)
movies.tail(2)

Unnamed: 0,movieId,overview,genres,title,keywords,mid
17491,69187,A remake of the hugely successful Malayalam fi...,"[Comedy, Drama]",Billu,"[barbershop, actors, remake of malayalam film]",17491
17492,135759,A historical drama set in the Koryo dynasty an...,[Drama],A Frozen Flower,"[adultery, sword fight, romance, betrayal, tra...",17492


In [7]:
movies.head(2)

Unnamed: 0,movieId,overview,genres,title,keywords,mid
0,147,Film adaptation of street tough Jim Carroll's ...,"[Drama, Crime]",The Basketball Diaries,"[sport, basketball, addiction, friends, drug]",0
1,858,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]",The Godfather,"[italy, love at first sight, loss of father, p...",1


In [21]:
with open("../saved/movie_embedding_1/embs/embedding.pck", 'rb') as fp:
    weights = pickle.load(fp)
    
weights = [x.tolist() for x in weights] 

In [22]:
movies['embe'] = weights

In [23]:
movies.head(2)

Unnamed: 0,movieId,overview,genres,title,keywords,mid,embe
0,147,Film adaptation of street tough Jim Carroll's ...,"[Drama, Crime]",The Basketball Diaries,"[sport, basketball, addiction, friends, drug]",0,"[0.33821532130241394, -1.378779411315918, 0.34..."
1,858,"Spanning the years 1945 to 1955, a chronicle o...","[Drama, Crime]",The Godfather,"[italy, love at first sight, loss of father, p...",1,"[0.4388054609298706, -1.2216815948486328, -0.0..."


In [26]:
def flatten(lol):
    return [s for a in lol for s in a]

def find_similar_movie(movieId, k=20):
    print('Similar movies to {}:'.format(movies[movies['mid']==movieId]['title'].values))
    print('')
    new_id_emb = movies[movies['mid']==movieId]['embe'].values.tolist()
    item_scores = cosine_similarity(weights, new_id_emb)
    item_preds = flatten((-item_scores).argsort(axis=0))
    for i in item_preds[:k]:
        print(movies.iloc[i]['title'], item_scores[i], movies.iloc[i]['genres'])

In [35]:
test_id = 15

find_similar_movie(test_id)

Similar movies to ['Harry Potter and the Half-Blood Prince']:

Harry Potter and the Half-Blood Prince [1.] ['Adventure', 'Fantasy', 'Family']
Harry Potter and the Order of the Phoenix [0.96065018] ['Adventure', 'Fantasy', 'Family', 'Mystery']
Harry Potter and the Deathly Hallows: Part 1 [0.9467894] ['Adventure', 'Fantasy', 'Family']
Harry Potter and the Goblet of Fire [0.92461667] ['Adventure', 'Fantasy', 'Family']
Hansel & Gretel: Witch Hunters [0.91186996] ['Fantasy', 'Horror', 'Action']
Harry Potter and the Prisoner of Azkaban [0.90970572] ['Adventure', 'Fantasy', 'Family']
Tales from Earthsea [0.90525908] ['Adventure', 'Fantasy', 'Animation', 'Science Fiction']
Leprechaun [0.90328595] ['Comedy', 'Horror', 'Thriller']
Harry Potter and the Philosopher's Stone [0.90239072] ['Adventure', 'Fantasy', 'Family']
Dragon Ball: Sleeping Princess in Devil's Castle [0.90127954] ['Action', 'Animation']
Halloweentown II: Kalabar's Revenge [0.90045283] ['Adventure', 'Family', 'Fantasy', 'TV Movie'

In [36]:
movies[movies['mid']==test_id]['overview'].values

array(["As Harry begins his sixth year at Hogwarts, he discovers an old book marked as 'Property of the Half-Blood Prince', and begins to learn more about Lord Voldemort's dark past."],
      dtype=object)

In [37]:
movies[movies['title']=='Harry Potter and the Order of the Phoenix']['overview'].values

      dtype=object)

In [34]:
movies.head(40).title

0                           The Basketball Diaries
1                                    The Godfather
2                           The Godfather: Part II
3                               Dead Poets Society
4                               The Breakfast Club
5                                  The Sixth Sense
6                         Ferris Bueller's Day Off
7                                       Fight Club
8                                          Memento
9                                     Donnie Darko
10                                  Igby Goes Down
11                                   Batman Begins
12                                 The Dark Knight
13                                        Iron Man
14                                       Star Trek
15          Harry Potter and the Half-Blood Prince
16                                 Sherlock Holmes
17    Harry Potter and the Deathly Hallows: Part 1
18              Sherlock Holmes: A Game of Shadows
19                           Th