In [26]:
import io
import json
import math
import pickle
import re
import warnings
from collections import Counter, defaultdict
from itertools import chain

# Silence warnings before the third-party imports below so their import-time warnings are hidden too.
warnings.filterwarnings('ignore')

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# DEFS ----------------------------------------------------------------------------------------------------
def load_data(file):
    """Load a JSON Lines file into a DataFrame, one row per JSON record.

    Args:
        file: Path to a text file holding one JSON document per line.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        ValueError: If a non-blank line is not valid JSON.
    """
    records = []
    # io.open decodes as UTF-8 explicitly instead of relying on the platform's
    # default encoding.
    with io.open(file, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. an empty line between or after records).
            if line.strip():
                records.append(json.loads(line))

    return pd.DataFrame(records)

def tokenize(text):
    """Lower-case ``text``, split it with NLTK and strip punctuation from each token.

    Args:
        text: String to tokenize.

    Returns:
        List of tokens from which every character that is neither a word
        character nor whitespace has been removed. Tokens left empty by that
        removal are dropped.
    """
    tokens = []
    for raw in nltk.word_tokenize(text.lower()):
        # Strip punctuation once and reuse the result for both the filter and the value.
        cleaned = re.sub(r'[^\w\s]', '', raw)
        if cleaned:
            tokens.append(cleaned)
    return tokens
   
def remove_stopwords(tokens):
    """Return ``tokens`` without the words found in NLTK's English stop-word list.

    The order and any duplicates of the remaining tokens are preserved.
    """
    blocked = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in blocked:
            continue
        kept.append(token)
    return kept

def stemmer(tokens):
    """Stem every token with NLTK's English Snowball stemmer.

    Returns a new list holding one stem per input token, in the same order.
    """
    snowball = SnowballStemmer("english")
    stems = []
    for token in tokens:
        stems.append(snowball.stem(token))
    return stems

def process_filmography(filmography_object):
    """Parse an actor's raw filmography HTML snippets into plain dicts.

    Args:
        filmography_object: Iterable of HTML strings, one per credit. The title
            link is read from the first anchor inside the first bold element
            (``soup.b.a``).

    Returns:
        List of dicts with the keys ``'url'`` (the link's href prefixed with
        ``https://www.imdb.com``), ``'title'`` (the link text) and ``'data'``
        (all text of the snippet with runs of whitespace collapsed to single
        spaces). Snippets without a usable title link are skipped.
    """
    filmography = []
    for item in filmography_object:
        soup = BeautifulSoup(item, 'html.parser')
        bold = soup.b
        link = bold.a if bold is not None else None
        href = link.get('href') if link is not None else None
        if href is None:
            # Nothing to anchor the entry on; skip it rather than raise
            # AttributeError/TypeError and abort the whole filmography.
            continue
        filmography.append({
            'url': 'https://www.imdb.com' + href,
            'title': link.get_text(),
            # split() without an argument collapses newlines and repeated
            # spaces; split(' ') followed by ' '.join() returns the text unchanged.
            'data': ' '.join(soup.get_text().split()),
        })

    return filmography
    
def retrieve_relative_news_collection(name, filmography, news_df=None):
    """Collect the news articles that mention an actor or one of their first three listed movies.

    Matching is a case-insensitive substring test of each search term against
    the article's title and content.

    Args:
        name: Actor name, used as the first search term.
        filmography: List of dicts with a ``'title'`` key; the first three
            titles are used as additional search terms.
        news_df: Frame with ``url``, ``title`` and ``content`` columns to search.
            Defaults to the notebook-level ``news`` frame.

    Returns:
        Dict with ``'urls'`` (distinct URLs of the matching articles, in
        first-seen order) and ``'text'`` (for every matching article,
        ``' ' + title + ' ' + content``, concatenated).
    """
    if news_df is None:
        news_df = news

    terms = [name] + [movie["title"] for movie in filmography][:3]
    # Lower-case each term once. Drop empty terms: '' is a substring of every
    # string and would otherwise match every article.
    terms = [term.lower() for term in terms if term]

    relative_urls = []
    seen_urls = set()
    text_parts = []

    for url, title, content in zip(news_df['url'], news_df['title'], news_df['content']):
        title_lc = title.lower()
        content_lc = content.lower()
        # any() stops at the first matching term, so an article that matches
        # several terms contributes its text only once.
        if any(term in title_lc or term in content_lc for term in terms):
            if url not in seen_urls:
                seen_urls.add(url)
                relative_urls.append(url)
            text_parts.append(' ' + title + ' ' + content)

    result = {}
    result['urls'] = relative_urls
    # A single join avoids re-copying the growing string for every match.
    result['text'] = ''.join(text_parts)
    return result

def create_inv_indexes(df):
    """Build frequency and positional inverted indexes over a tokenized corpus.

    Args:
        df: Frame with a ``url`` column (document id) and a ``tokenized``
            column (list of tokens per document).

    Returns:
        Tuple ``(invertedIndexFreq, invertedIndexPos, corpusInfo)``:
        * ``invertedIndexFreq[token][url]`` is the number of occurrences of
          ``token`` in the document ``url``.
        * ``invertedIndexPos[token][url]`` is the list of positions of
          ``token`` in the document's token list.
        * ``corpusInfo['num_docs']`` is the number of rows in ``df`` and
          ``corpusInfo['doc_lengths'][url]`` the token count per document.
    """
    invertedIndexFreq = defaultdict(Counter)
    invertedIndexPos = defaultdict(dict)
    corpusInfo = defaultdict(dict)

    corpusInfo['num_docs'] = df.shape[0]

    for url, tokens in zip(df['url'], df['tokenized']):
        corpusInfo['doc_lengths'][url] = len(tokens)

        # One pass over the document collects every token's positions, instead
        # of rescanning the whole token list for each occurrence.
        positions = defaultdict(list)
        order = []  # tokens in first-occurrence order
        for i, token in enumerate(tokens):
            if token not in positions:
                order.append(token)
            positions[token].append(i)

        for token in order:
            token_positions = positions[token]
            invertedIndexFreq[token][url] += len(token_positions)
            invertedIndexPos[token][url] = token_positions

    return invertedIndexFreq, invertedIndexPos, corpusInfo

In [2]:
# LOAD ----------------------------------------------------------------------------------------------------
# JSON-lines data files, given relative to this notebook.
actors_jl, movies_jl = '../data/actors.jl', '../data/movies.jl'
tmz_jl = '../data/news_tmz.jl'
hollywoodlife_jl = '../data/news_hollywoodlife.jl'
movieweb_jl = '../data/news_movieweb.jl'

# Read each file into its own DataFrame, in the order listed.
actors, movies, tmz, hollywoodlife, movieweb = map(
    load_data,
    (actors_jl, movies_jl, tmz_jl, hollywoodlife_jl, movieweb_jl),
)

In [3]:
# PREPROCESS -----------------------------------------------------------------------------------------------
# Replace each actor's raw filmography snippets with the parsed list of dicts.
# NOTE: the column is overwritten, so re-running this cell feeds the parsed
# dicts back into process_filmography.
actors['filmography'] = actors['filmography'].apply(process_filmography)

In [None]:
#actors['movie_urls'] = actors.apply(lambda row: ['https://www.imdb.com' + url for url in row.movie_urls], axis=1)

In [4]:
# Tokenize each movie from its title, year, genres and reviews joined into one text.
movies['tokenized'] = movies.apply(
    lambda row: stemmer(remove_stopwords(tokenize(
        ' '.join([row.title, row.year, ' '.join(row.genres), ' '.join(row.reviews)])
    ))),
    axis=1
)

In [5]:
# Tokenize, de-stopword and stem every TMZ article's title plus content.
tmz['tokenized'] = (tmz['title'] + ' ' + tmz['content']).apply(
    lambda text: stemmer(remove_stopwords(tokenize(text)))
)

In [6]:
# Tokenize, de-stopword and stem every Hollywood Life article's title plus content.
hollywoodlife['tokenized'] = (hollywoodlife['title'] + ' ' + hollywoodlife['content']).apply(
    lambda text: stemmer(remove_stopwords(tokenize(text)))
)

In [7]:
# Tokenize, de-stopword and stem every MovieWeb article's title plus content.
movieweb['tokenized'] = (movieweb['title'] + ' ' + movieweb['content']).apply(
    lambda text: stemmer(remove_stopwords(tokenize(text)))
)

In [17]:
# Show (rows, columns) for each of the three news frames.
print(tmz.shape, hollywoodlife.shape, movieweb.shape)

((9100, 5), (10484, 5), (11980, 5))


In [15]:
# Stack the three news sources into a single frame. pd.concat replaces the
# chained DataFrame.append calls (append was removed in pandas 2.0) and builds
# the result in one step. Row labels are kept as they were, as append did.
news = pd.concat([tmz, hollywoodlife, movieweb])
news.shape

(31564, 5)

In [63]:
# MATCH ------------------------------------------------------------------------------------------------------


In [70]:
# For every actor, gather the news articles that match their name or movie titles.
actors['news'] = list(map(
    retrieve_relative_news_collection,
    actors["name"],
    actors["filmography"]
))

KeyboardInterrupt: 

In [None]:
# NOTE(review): this stores the result of retrieve_relative_news_collection under
# 'movies', i.e. the same lookup that fills the 'news' column. Presumably a
# movie-specific lookup was intended here — confirm before relying on this column.
actors['movies'] = actors.apply(lambda row: retrieve_relative_news_collection(row["name"],row["filmography"]), axis=1)

In [None]:
def retrieve_relative_movie_collection(filmography, movies_df=None):
    """Collect URLs and text of the known movies among an actor's first ten credits.

    NOTE(review): the previous body selected from ``movies`` but discarded the
    result, then looped over ``news`` with an undefined ``list_terms``
    (NameError on first use). This version assumes the intent was to read from
    the movies frame — confirm.

    Args:
        filmography: List of dicts with a ``'title'`` key; only the first ten
            titles are used.
        movies_df: Frame with ``title``, ``year``, ``genres``, ``reviews`` and
            ``url_imdb`` columns. Defaults to the notebook-level ``movies`` frame.

    Returns:
        Tuple ``(urls, text)``: the distinct ``url_imdb`` values of the movies
        whose title is among the credits, in first-seen order, and one string
        holding, for every such movie, a space followed by its title, year,
        genres and reviews joined with spaces.
    """
    if movies_df is None:
        movies_df = movies

    list_movies = [movie["title"] for movie in filmography][:10]
    # Exact title match against the movies frame.
    rel_movies = movies_df[movies_df["title"].isin(list_movies)]

    relative_urls = []
    seen_urls = set()
    text_parts = []
    columns = zip(rel_movies['url_imdb'], rel_movies['title'], rel_movies['year'],
                  rel_movies['genres'], rel_movies['reviews'])
    for url, title, year, genres, reviews in columns:
        if url not in seen_urls:
            seen_urls.add(url)
            relative_urls.append(url)
        # Same text recipe as the one used to tokenize the movies frame.
        text_parts.append(' ' + ' '.join([title, year, ' '.join(genres), ' '.join(reviews)]))

    return relative_urls, ''.join(text_parts)

In [71]:
movies.head()

Unnamed: 0,genres,reviews,title,url_imdb,url_img,url_metacritic,url_rottom,year,tokenized
0,"[Animation, Adventure, Adventure, Drama, Family]","[No consensus yet., \n ...",Call of the Wild,https://www.imdb.com/title/tt7504726/?ref_=nm_...,,https://www.metacritic.com/movie/call-of-the-wild,https://www.rottentomatoes.com/m/call_of_the_wild,2019,"[call, wild, 2019, anim, adventur, adventur, d..."
1,"[Drama, Fantasy, Romance, Drama, Romance]",[The Age of Adaline ruminates on mortality le...,The Age of Adaline,https://www.imdb.com/title/tt1655441/?ref_=nm_...,https://m.media-amazon.com/images/M/MV5BMTAzMT...,https://www.metacritic.com/movie/the-age-of-ad...,https://www.rottentomatoes.com/m/the_age_of_ad...,2015,"[age, adalin, 2015, drama, fantasi, romanc, dr..."
2,"[Crime, Drama, Drama]",[Crossing Over is flagrant and heavy-handed a...,Crossing Over,https://www.imdb.com/title/tt0924129/?ref_=nm_...,https://m.media-amazon.com/images/M/MV5BMjAyMD...,https://www.metacritic.com/movie/crossing-over,https://www.rottentomatoes.com/m/crossing_over,2009,"[cross, 2009, crime, drama, drama, cross, flag..."
3,"[Action, Comedy, Crime, Action, Thriller, Come...",[Hollywood Homicide suffers from too many subp...,Hollywood Homicide,https://www.imdb.com/title/tt0329717/?ref_=nm_...,https://m.media-amazon.com/images/M/MV5BMTU5Mj...,https://www.metacritic.com/movie/hollywood-hom...,https://www.rottentomatoes.com/m/hollywood_hom...,2003,"[hollywood, homicid, 2003, action, comedi, cri..."
4,"[Drama, Mystery, Sci-Fi, Sci-Fi, Drama, Myster...","[Visually stunning and narratively satisfying,...",Blade Runner 2049,https://www.imdb.com/title/tt1856101/?ref_=nm_...,https://m.media-amazon.com/images/M/MV5BNzA1Nj...,https://www.metacritic.com/movie/blade-runner-...,https://www.rottentomatoes.com/m/blade_runner_...,2017,"[blade, runner, 2049, 2017, drama, mysteri, sc..."


In [None]:
# INDEX ------------------------------------------------------------------------------------------------------

# TODO: CREATE INVERTED INDEX - FREQ ACTOR
# TODO: CREATE INVERTED INDEX - POS ACTOR
# TODO: CREATE INVERTED INDEX - WEIGHT ACTOR

In [None]:
# EXPORT ------------------------------------------------------------------------------------------------------

# TODO: SAVE INV INDEX - FREQ AS PICKLE
# TODO: SAVE INV INDEX - POS AS PICKLE
# TODO: SAVE INV INDEX - WEIGHT AS PICKLE
# TODO: SAVE ACTOR DF AS JSON
# TODO: SAVE MOVIE DF AS JSON
# TODO: SAVE NEWS DF AS JSON

In [None]:
actors.head()

In [None]:
movies.head()

In [None]:
# Prefix site-relative movie links with the IMDb host. URLs that already carry
# the host are left untouched, so re-running this cell does not prepend it twice.
actors['movie_urls'] = actors['movie_urls'].apply(
    lambda urls: [
        url if url.startswith('https://www.imdb.com') else 'https://www.imdb.com' + url
        for url in urls
    ]
)

In [None]:
def tokenize_actor_item(name, bio):
    """Build an actor's token list: tokens of their related news, then tokens of ``bio``.

    NOTE(review): ``find_relative_news`` is not defined in the visible part of
    this notebook — confirm where it comes from. Its result is used as the
    collection of news URLs to include.
    """
    related_urls = find_relative_news(name)
    related_news = news[news.url.isin(related_urls)]

    # Flatten the per-article token lists into one list.
    tokens = list(chain.from_iterable(related_news.tokenized.values))

    # Placeholder kept from the original (`rel_movies = `): related-movie
    # tokens are not included yet.

    tokens.extend(stemmer(remove_stopwords(tokenize(bio))))

    return tokens

# Tokenize every actor from their related news plus both bios, birthday and birthplace.
actors['tokenized'] = actors.apply(
    lambda row: tokenize_actor_item(
        row["name"],
        ' '.join([row.bio_imdb, row.bio_rottom, row.birthday, row.birthplace])
    ),
    axis=1
)


In [None]:
process_filmography(actors.head(1).filmography.values[0])

In [None]:
# NOTE(review): `source` and `invertedIndexFreq` are not defined in the visible
# cells — confirm they exist before running. The last five characters of
# `source` are dropped to form the output file name.
freq_index_path = source[:-5] + '-invertedIndexFreq.pickle'
with open(freq_index_path, 'wb') as handle:
    pickle.dump(invertedIndexFreq, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import ast

# The first line of the file is parsed as a Python literal holding the site-relative URLs.
with open('../data/movie_urls.txt') as f:
    first_line = f.readlines()[0]
urls = ast.literal_eval(first_line)

# Turn each relative URL into an absolute IMDb URL.
start_urls = ['https://www.imdb.com' + relative.strip() for relative in urls]

In [None]:
start_urls

In [None]:
soup = BeautifulSoup(actors.head(1).filmography.values[0][0], 'html.parser')
soup.b.a.get_text()