In [137]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter,defaultdict
import pickle
from itertools import chain
from bs4 import BeautifulSoup

# DEFS ----------------------------------------------------------------------------------------------------
def load_data(file):
    
    with open(file) as f:
        json_block = []
        for line in f:
            json_block.append(json.loads(line))
            
    return pd.DataFrame(json_block)

def tokenize(text):
    return [re.sub(r'[^\w\s]','',w) for w in nltk.word_tokenize(text.lower()) if re.sub(r'[^\w\s]','',w) != '']
   
def remove_stopwords(tokens):
    en_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in en_stopwords]

def stemmer(tokens):
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in tokens]

def find_relative_news(query):
    relative = []
    for index, row in news.iterrows():
        if query.lower() in row.title.lower():
            relative.append(row.url)
        elif query.lower() in row.content.lower():
            relative.append(row.url)
    return relative

def process_filmography(filmography_object):
    filmography = []
    for item in filmography_object:
        movie_instance = {}
        soup = BeautifulSoup(item, 'html.parser')
        movie_instance['url'] = 'https://www.imdb.com' + soup.b.a.get('href')
        movie_instance['data'] = ' '.join(soup.get_text().strip().replace('\n',' ').split(' '))
        filmography.append(movie_instance)
    
    return filmography

def create_inv_indexes(df):
    invertedIndexFreq = defaultdict(Counter)
    invertedIndexPos = defaultdict(dict)
    corpusInfo = defaultdict(dict)
    
    corpusInfo['num_docs'] = df.shape[0]
    
    for index, row in df.iterrows():
        corpusInfo['doc_lengths'][row['url']] = len(row['tokenized'])
        
        for w in row['tokenized']:
            invertedIndexFreq[w][row['url']]+=1
            invertedIndexPos[w][row['url']] = [i for i, j in enumerate(row['tokenized']) if w == j]
            
    return invertedIndexFreq, invertedIndexPos, corpusInfo

In [139]:
# LOAD ----------------------------------------------------------------------------------------------------
actors_jl = '../data/actors.jl'
movies_jl = '../data/movies.jl'
tmz_jl = '../data/news_tmz.jl'
hollywoodlife_jl = '../data/news_hollywoodlife.jl'
movieweb_jl = '../data/news_movieweb.jl'

#actors = load_data(actors_jl)
#movies = load_data(movies_jl)
tmz = load_data(tmz_jl)
hollywoodlife = load_data(hollywoodlife_jl)
movieweb = load_data(movieweb_jl)

In [140]:
# PREPROCESS -----------------------------------------------------------------------------------------------
#actors['filmography'] = actors.apply(lambda row: process_filmography(row.filmography), axis=1)
#actors['movie_urls'] = actors.apply(lambda row: ['https://www.imdb.com' + url for url in row.movie_urls], axis=1)

#movies['tokenized'] = movies.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.year + ' ' + ' '.join(row.genres) + ' ' + ' '.join(row.reviews)))), axis=1)

tmz['tokenized'] = tmz.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)
hollywoodlife['tokenized'] = hollywoodlife.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)
movieweb['tokenized'] = movieweb.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)

In [None]:
# MATCH ------------------------------------------------------------------------------------------------------

# TODO: MATCH ACTOR TO NEWS (BOTH ON ACTOR NAME AS WELL AS ON FILMOGRAPHY)
# TODO: MATCH ACTOR TO MOVIE CONTENT

# TODO: TOKENIZE ALL ACTOR CONTENT

In [None]:
# INDEX ------------------------------------------------------------------------------------------------------

# TODO: CREATE INVERTED INDEX - FREQ ACTOR
# TODO: CREATE INVERTED INDEX - POS ACTOR
# TODO: CREATE INVERTED INDEX - WEIGHT ACTOR

In [None]:
# EXPORT ------------------------------------------------------------------------------------------------------

# TODO: SAVE INV INDEX - FREQ AS PICKLE
# TODO: SAVE INV INDEX - POS AS PICKLE
# TODO: SAVE INV INDEX - WEIGHT AS PICKLE
# TODO: SAVE ACTOR DF AS JSON
# TODO: SAVE MOVIE DF AS JSON
# TODO: SAVE NEWS DF AS JSON

In [111]:
rel_news = [item for l in news[news.url.isin(find_relative_news('naomi scott'))].tokenized.values for item in l]

In [119]:
actors['movie_urls'] = actors.apply(lambda row: ['https://www.imdb.com' + url for url in row.movie_urls], axis=1)

In [135]:
def tokenize_actor_item(name, bio):
    tokenized = []
    
    rel_news = news[news.url.isin(find_relative_news(name))]
    tokenized += [item for l in rel_news.tokenized.values for item in l]
    
    #rel_movies = 
    
    tokenized += stemmer(remove_stopwords(tokenize(bio)))
    
    return tokenized

actors['tokenized'] = actors.apply(lambda row: tokenize_actor_item(row["name"], row.bio_imdb+' '+row.bio_rottom+' '+row.birthday+' '+row.birthplace), axis=1)


In [146]:
process_filmography(actors.head(1).filmography.values[0])

[{'url': 'https://www.imdb.com/title/tt5033998/?ref_=nm_flmg_act_1',
  'data': "2019  Charlie's Angels (post-production)"},
 {'url': 'https://www.imdb.com/title/tt6139732/?ref_=nm_flmg_act_2',
  'data': '2019  Aladdin (post-production)  Jasmine'},
 {'url': 'https://www.imdb.com/title/tt7224848/?ref_=nm_flmg_act_3',
  'data': "2017  Britain's Most Evil Killers (TV Series documentary)  Reconstruction Actor  - Dennis Nilsen (2017) ... Reconstruction Actor"},
 {'url': 'https://www.imdb.com/title/tt3717490/?ref_=nm_flmg_act_4',
  'data': '2017  Power Rangers  Kimberly (Pink Ranger)'},
 {'url': 'https://www.imdb.com/title/tt8311258/?ref_=nm_flmg_act_5',
  'data': "2017  Naomi Scott: Lover's Lies (Short)"},
 {'url': 'https://www.imdb.com/title/tt0874608/?ref_=nm_flmg_act_6',
  'data': '2015  Inspector Lewis (TV Series)  Sahira Desai  - One for Sorrow: Part 2 (2015) ... Sahira Desai   - One for Sorrow: Part 1 (2015) ... Sahira Desai'},
 {'url': 'https://www.imdb.com/title/tt2006295/?ref_=nm_fl

In [None]:
with open(source[:-5]+'-invertedIndexFreq.pickle', 'wb') as handle:
    pickle.dump(invertedIndexFreq, handle, protocol=pickle.HIGHEST_PROTOCOL)