In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import itertools
from imdbUtils import *
import csv
pd.options.display.max_colwidth=500
from tqdm import tqdm

# Create Movie Reveiws Dataset

In [2]:
import pickle
df = pickle.load(open('data/imdb/imdb_reviews_1000film.df','rb'))

In [3]:
import datetime

movies_dic = {}
with open('data/imdb/imdb_movielens.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    for row in csv_reader:
        if row[12].replace('_',' ') not in movies_dic: 
            movies_dic[row[12].replace('_',' ')] = []
            
        temp = [r.replace('_',' ') for r in row[0:10]]
        
        month,year = '',''
        if len(row[10]) > 0:        
            month = datetime.date(1900, int(row[10][4::]), 1).strftime('%B')
            year = row[10][0:4]
        
        temp.append(month.lower() + ' ' + year)
        temp.append(int(float(row[14])))
        
        movies_dic[row[12].replace('_',' ')].append(temp)
        

# Create Graph

In [4]:
import utils
import graphUtils
import networkx as nx

In [5]:
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize
import nltk
ps = nltk.stem.PorterStemmer()

def return_n_grams(text,k):
    tokens = word_tokenize(text)
    n_grams = set()
    for i in range(0,len(tokens)-(k-1)):
        n_grams.add( ' '.join( ( [tk for tk in tokens[i:i+k]]) ))
        
    return n_grams


def find_all_n_grams (text,n):
    n_grams = []
    for k in range(1,n+1):
        k_grams = return_n_grams(text,k)
        for g in k_grams: n_grams.append(g)
    return n_grams


In [6]:
import nltk
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [7]:
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from itertools import chain
from nltk.corpus import wordnet


G=nx.Graph()
K = 3

i = 0
row_ids = {}
id_rows = {}

for movie in tqdm(movies_dic):
    i+=1
    row_name = str('RW'+str(i))
    G.add_node(row_name , label= row_name, type='Row')
    row_ids[row_name] = movie
    id_rows[movie] = row_name
    j=0
    
    cols = [movie]
    for c in movies_dic[movie]: cols.append(c)
    for cl in cols:
    
#    for cl in movies_dic[movie]:
        j+=1
        col_name = str('CL'+str(i))
        if cl == '': continue
        if not G.has_node(col_name):     G.add_node(col_name , label= col_name, type='Column')
        n_grams = [gr.replace(' ','_') for gr in find_all_n_grams(str(cl),K)]
        
        
        for tg in n_grams:
            
            if not G.has_node(tg):
                G.add_node(tg,label=tg, type='Token')
            G.add_edge(row_name,tg)
            G.add_edge(col_name,token_name)

100%|██████████| 42206/42206 [00:38<00:00, 1104.09it/s]


In [8]:
i = 0
review_ids = {}
id_review = {}

for row in tqdm(df.itertuples()):
    if row.movie.lower() not in movies_dic: continue
    i += 1
    text = remove_stopwords((row.user_review.lower()))
    review_name = str('Review'+str(i))
    G.add_node(review_name , label= review_name, type='Review')
    review_ids[review_name] = row.user_review
    id_review[text] = review_name
    
    n_grams = [gr.replace(' ','_') for gr in find_all_n_grams(text,K)]

    for tg in n_grams:
        
        
        if not G.has_node(tg):
            continue
            
        if not G.has_edge(review_name,tg):            G.add_edge(review_name,tg)
    

2000it [00:09, 213.95it/s]


In [9]:
ground_truth = {}
for row in tqdm(df.itertuples()):
    if row.movie.lower() not in movies_dic:         continue
    movie_name = row.movie.lower() 
    if movie_name not in ground_truth: ground_truth[movie_name] = []
    #if remove_stopwords(row.user_review.lower()) not in id_review: continue
    ground_truth[movie_name]. append(id_review[remove_stopwords((row.user_review.lower()))])

2000it [00:00, 11880.90it/s]


# RandomWalk

In [37]:
import random
def random_walk(node,l):
    res = ''
    
    p = 0
    chosen = node
    
    res += str(chosen)

    while (p<l):
        chosen = random.sample([n for n in nx.neighbors(G,chosen)],1)[0]
        if G.nodes[chosen]['type'] in ['Review','Row', 'Column', 'Token']:
            res += ' ' + str(chosen)
        p+=1
        
    return res



def generate_random_walks(k,l):
    rws = []
    
    for i in tqdm(range(0,k),position=0):
        for node in G.nodes():
            if len([n for n in nx.neighbors(G,node)]) == 0:
                continue
            if G.nodes[node]['type'] in ['Review','Row']:#,'Column']:
                rws.append(random_walk(node,l))
    return rws


In [75]:
docs = []
random_paths = generate_random_walks(200,l=30)
for p in random_paths:
    docs.append(p)

100%|██████████| 20/20 [3:22:54<00:00, 608.75s/it]  


# Word Embeddings

In [10]:
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from nltk.tokenize import word_tokenize

In [76]:
from gensim.parsing.preprocessing import remove_stopwords
from tqdm import tqdm 
tagged_data = []
for d in tqdm(docs,position=0):
    tagged_data.append(word_tokenize(d))


100%|██████████| 875600/875600 [04:33<00:00, 3204.88it/s]


In [77]:
%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 300

model = Word2Vec(size=vec_size, min_count=10, window=3, sg=1, seed=0, workers = 4)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

print("Model is Ready")

env: PYTHONHASHSEED=0
Model is Ready


In [30]:
from gensim.models.fasttext import FastText

%env PYTHONHASHSEED=0
max_epochs = 10
vec_size = 100

FTmodel = FastText(size=vec_size, min_count=10, window=3, sg=1, seed=0, workers = 4)
FTmodel.build_vocab(tagged_data)
FTmodel.train(tagged_data, total_examples=FTmodel.corpus_count, epochs=FTmodel.epochs)

print("Model is Ready")

env: PYTHONHASHSEED=0
Model is Ready


In [42]:
movie_reviews = {}
for movie in tqdm(ground_truth):
    m_id = id_rows[movie]
    
    movie_reviews[movie] = utils.distance_w2v (model,m_id,review_ids,50000)

100%|██████████| 785/785 [00:27<00:00, 28.76it/s]


In [54]:
for KK in [1,2,3,5,10,20,50,50000]: 
    i = 0
    precision,recall,fs = 0,0,0
    MAP, MRR, hasP = 0,0,0

    for movie in movie_reviews:
        if movie not in ground_truth: continue
        i+=1
        preds = [f for (f,j) in movie_reviews[movie]][0:KK]
        golds = [f for f in ground_truth[movie]]

        MAP += utils.MAP_K(golds,preds)
        MRR += utils.MRR(golds,preds)
        hasP += utils.HAS_POSITIVE(golds,preds)
        
    print('\n#################### ' + str(KK) + ' ###########################\n')
    print('MRR:',MRR/i,'MAP:',MAP/i, 'HAS POSITIVE:', hasP/i)


#################### 1 ###########################

MRR: 0.5261146496815287 MAP: 0.2624203821656051 HAS POSITIVE: 0.5261146496815287

#################### 2 ###########################

MRR: 0.5738853503184713 MAP: 0.3668789808917197 HAS POSITIVE: 0.621656050955414

#################### 3 ###########################

MRR: 0.5840764331210189 MAP: 0.3922505307855624 HAS POSITIVE: 0.6522292993630573

#################### 5 ###########################

MRR: 0.6015286624203818 MAP: 0.4193842887473456 HAS POSITIVE: 0.7273885350318471

#################### 10 ###########################

MRR: 0.6096769790718832 MAP: 0.43710924072389 HAS POSITIVE: 0.7872611464968153

#################### 20 ###########################

MRR: 0.6139999515087263 MAP: 0.4450565207258278 HAS POSITIVE: 0.8509554140127389

#################### 50 ###########################

MRR: 0.6159639997987735 MAP: 0.44909400602079713 HAS POSITIVE: 0.910828025477707

#################### 50000 ###########################

MRR: 