In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import json
import re
import nltk
import math
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter,defaultdict
import pickle
from itertools import chain
from bs4 import BeautifulSoup

# DEFS ----------------------------------------------------------------------------------------------------
def load_data(file):
    
    with open(file) as f:
        json_block = []
        for line in f:
            json_block.append(json.loads(line))
            
    return pd.DataFrame(json_block)

def tokenize(text):
    return [re.sub(r'[^\w\s]','',w) for w in nltk.word_tokenize(text.lower()) if re.sub(r'[^\w\s]','',w) != '']
   
def remove_stopwords(tokens):
    en_stopwords = set(stopwords.words('english'))
    return [word for word in tokens if word not in en_stopwords]

def stemmer(tokens):
    stemmer = SnowballStemmer("english")
    return [stemmer.stem(token) for token in tokens]

def process_filmography(filmography_object):
    filmography = []
    for item in filmography_object:
        movie_instance = {}
        soup = BeautifulSoup(item, 'html.parser')
        movie_instance['url'] = 'https://www.imdb.com' + soup.b.a.get('href')
        movie_instance['title'] = soup.b.a.get_text()
        movie_instance['data'] = ' '.join(soup.get_text().strip().replace('\n',' ').split(' '))
        filmography.append(movie_instance)
    
    return filmography
    
def retrieve_relative_news_collection(name):
    selection = news[news['content'].str.contains(name)]
    
    #result = {}
    #result['urls'] = selection.url.values
    
    tokens = []
    for l in selection.tokenized:
        tokens += l
    
    #result['text'] = tokens
    return tokens

def retrieve_relative_movies_collection(filmography):
    list_movies = [movie["title"] for movie in filmography][:3]
    
    selection = movies[movies["title"].isin(list_movies)]
    
    #result = {}
    #result['urls'] = selection.url_imdb.values
    #result['genres'] = [genre for genres in selection.genres.values for genre in genres]
    
    tokens = []
    for l in selection.tokenized:
        tokens += l
    
    #result['text'] = tokens
    return tokens

def create_inv_indexes(df):
    invertedIndexFreq = defaultdict(Counter)
    invertedIndexPos = defaultdict(dict)
    corpusInfo = defaultdict(dict)
    
    corpusInfo['num_docs'] = df.shape[0]
    
    for index, row in df.iterrows():
        corpusInfo['doc_lengths'][row['name']] = len(row['tokenized'])
        
        for w in row['tokenized']:
            invertedIndexFreq[w][row['name']]+=1
            invertedIndexPos[w][row['name']] = [i for i, j in enumerate(row['tokenized']) if w == j]
            
    return invertedIndexFreq, invertedIndexPos, corpusInfo

def tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing):
    return (doc_freq / doc_length) * math.log(n_docs_total / (1 + n_docs_containing))

In [2]:
# LOAD ----------------------------------------------------------------------------------------------------
actors_jl = '../data/actors.jl'
movies_jl = '../data/movies.jl'
tmz_jl = '../data/news_tmz.jl'
hollywoodlife_jl = '../data/news_hollywoodlife.jl'
movieweb_jl = '../data/news_movieweb.jl'

actors = load_data(actors_jl)
movies = load_data(movies_jl)
tmz = load_data(tmz_jl)
hollywoodlife = load_data(hollywoodlife_jl)
movieweb = load_data(movieweb_jl)

In [3]:
# PREPROCESS -----------------------------------------------------------------------------------------------
actors['filmography'] = actors.apply(lambda row: process_filmography(row.filmography), axis=1)

In [None]:
#actors['movie_urls'] = actors.apply(lambda row: ['https://www.imdb.com' + url for url in row.movie_urls], axis=1)

In [4]:
movies['tokenized'] = movies.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.year + ' ' + ' '.join(row.genres) + ' ' + ' '.join(row.reviews)))), axis=1)

In [5]:
tmz['tokenized'] = tmz.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)

In [6]:
hollywoodlife['tokenized'] = hollywoodlife.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)

In [7]:
movieweb['tokenized'] = movieweb.apply(lambda row: stemmer(remove_stopwords(tokenize(row.title + ' ' + row.content))), axis=1)

In [8]:
news = tmz.append(hollywoodlife)
news = news.append(movieweb)

In [13]:
actors.head()

Unnamed: 0,bio_imdb,bio_rottom,birthday,birthplace,filmography,movie_urls,name,quotes_brainyquotes,quotes_imdb,quotes_rottom,url_imdb,url_img,url_rottom
0,"Harrison Ford was born on July 13, 1942 in Chi...",If Harrison Ford had listened to the advice of...,"Jul 13, 1942","Chicago, Illinois",[{u'url': u'https://www.imdb.com/title/tt14627...,"[/title/tt1462764/?ref_=nm_flmg_act_1, /title/...",Harrison Ford,[Nature doesn't need people - people need natu...,"[It's a little-known fact, but I wanted Han So...",[Jack Stanfield Leave my family alone! Get off...,https://www.imdb.com/name/nm0000148/,https://m.media-amazon.com/images/M/MV5BMTY4Mj...,https://www.rottentomatoes.com/celebrity/harri...
1,Eric Bana was born Eric Banadinovic on August ...,A popular and easygoing Australian comedian wh...,"Aug 9, 1968","Melbourne, Victoria, Australia",[{u'url': u'https://www.imdb.com/title/tt51441...,"[/title/tt5144174/?ref_=nm_flmg_act_1, /title/...",Eric Bana,[I look my best when I take my helmet off afte...,"[I wanted to be a mechanic. When I was 14, I w...","[Dr. David Banner Sleep now, Bruce, and fo...",https://www.imdb.com/name/nm0051509/,https://m.media-amazon.com/images/M/MV5BMjcxMz...,https://www.rottentomatoes.com/celebrity/eric_...
2,"At a consistently lean 6' 2"", green-eyed Timot...",British actor Timothy Dalton has excelled in r...,"Mar 21, 1944","Colwyn Bay, Wales, UK",[{u'url': u'https://www.imdb.com/title/tt19793...,"[/title/tt1979376/?ref_=nm_flmg_act_1, /title/...",Timothy Dalton,"[You can't relate to a superhero, to a superma...",[On playing a character: You can't relate to a...,[Jenny Blake Oh my God. Neville Sinclair's a.....,https://www.imdb.com/name/nm0001096/,https://m.media-amazon.com/images/M/MV5BNDE5ND...,https://www.rottentomatoes.com/celebrity/timot...
3,"Catherine Zeta-Jones was born September 25, 19...","Both exotic and classic, Wales-born actress Ca...","Sep 25, 1969","Swansea, West Glamorgan, Wales",[{u'url': u'https://www.imdb.com/title/tt83968...,"[/title/tt8396890/?ref_=nm_flmg_act_1, /title/...",Catherine Zeta-Jones,[Being glamorous is about strength and confide...,[I used to go around looking as frumpy as poss...,[Chloe Are you telling a woman with pan full o...,https://www.imdb.com/name/nm0001876/,https://m.media-amazon.com/images/M/MV5BZGE4Mz...,https://www.rottentomatoes.com/celebrity/cathe...
4,Kit Harington was born Christopher Catesby Har...,Descended from a viscount who served as Britai...,Not Available,Not Available,[{u'url': u'https://www.imdb.com/title/tt23864...,"[/title/tt2386490/?ref_=nm_flmg_act_1, /title/...",Kit Harington,"[I'm very lucky, I've got two very loving pare...",[I wear quite fitted clothing. I don't like we...,[Will Holloway If there's something else I hav...,https://www.imdb.com/name/nm3229685/,https://m.media-amazon.com/images/M/MV5BMTA2NT...,https://www.rottentomatoes.com/celebrity/kit_h...


In [18]:
# MATCH ------------------------------------------------------------------------------------------------------
actors_movies = actors[['name','filmography']]
actors_movies['tokenized'] = actors.apply(lambda row: retrieve_relative_movies_collection(row["filmography"]), axis=1)
actors_movies = actors_movies.drop(columns=['filmography'])

In [21]:
actors_news = actors[['name']]
actors_news['tokenized'] = actors.apply(lambda row: retrieve_relative_news_collection(row["name"]), axis=1)


In [22]:
# TOKENIZE ACTOR -----------------------------------------------------------------------------------------------
actors['tokenized'] = actors.apply(lambda row: stemmer(remove_stopwords(tokenize(row["bio_imdb"] 
                                            + ' ' + row["bio_rottom"] + ' ' + row["birthday"] 
                                            + ' ' + row["birthplace"] + ' ' + row["name"]
                                            + ' ' + ' '.join(row["quotes_brainyquotes"])
                                            + ' ' + ' '.join(row["quotes_imdb"])
                                            + ' ' + ' '.join(row["quotes_rottom"])))), axis=1)


In [25]:
# INDEX ------------------------------------------------------------------------------------------------------
# MAIN
main_indexes = create_inv_indexes(actors)
main_ii_freq = main_indexes[0]
main_ii_pos = main_indexes[1]
main_corpus_info = main_indexes[2]

In [26]:
# MOVIES
movies_indexes = create_inv_indexes(actors_movies)
movies_ii_freq = movies_indexes[0]
movies_ii_pos = movies_indexes[1]
movies_corpus_info = movies_indexes[2]

In [27]:
# NEWS
news_indexes = create_inv_indexes(actors_news)
news_ii_freq = news_indexes[0]
news_ii_pos = news_indexes[1]
news_corpus_info = news_indexes[2]

In [28]:
# MERGING
# Merging frequencies
merged_ii_freq = defaultdict(Counter)
for k,v in chain(main_ii_freq.items(), movies_ii_freq.items(), 
                 news_ii_freq.items()):
    
    merged_ii_freq[k].update(v)

In [30]:
with open('../data/ii_freq.pickle', 'wb') as handle:
    pickle.dump(merged_ii_freq, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [31]:
# Merging positional
merged_ii_pos = main_ii_pos

# Merge indexes
for w, dic in movies_ii_pos.items():
    for doc, lis in dic.items():
        merged_ii_pos[w][doc] = lis
        
for w, dic in news_ii_pos.items():
    for doc, lis in dic.items():
        merged_ii_pos[w][doc] = lis

In [33]:
with open('../data/ii_pos.pickle', 'wb') as handle:
    pickle.dump(merged_ii_pos, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [34]:
# Merging corpus infos
merged_ci = main_corpus_info

merged_ci['num_docs'] += movies_corpus_info['num_docs'] 
for doc, length in movies_corpus_info['doc_lengths'].items():
    merged_ci['doc_lengths'][doc] = length
      
merged_ci['num_docs'] += news_corpus_info['num_docs'] 
for doc, length in news_corpus_info['doc_lengths'].items():
    merged_ci['doc_lengths'][doc] = length

In [36]:
# WEIGHTS
merged_ii_weights = defaultdict(Counter)

for w,wv in merged_ii_freq.items():
    n_docs_total = merged_ci['num_docs']
    n_docs_containing = len(wv)
    
    for d_id,dv in wv.items():
        doc_freq = dv
        doc_length = merged_ci['doc_lengths'][d_id]
        
        merged_ii_weights[w][d_id] = tfidf(doc_freq, doc_length, n_docs_total, n_docs_containing)

In [37]:
with open('../data/ii_weights.pickle', 'wb') as handle:
    pickle.dump(merged_ii_weights, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# EXPORT ------------------------------------------------------------------------------------------------------

In [24]:
# DATAFRAMES
with open('../data/actors.pickle', 'wb') as handle:
    pickle.dump(actors, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
with open('../data/movies.pickle', 'wb') as handle:
    pickle.dump(movies, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
with open('../data/news.pickle', 'wb') as handle:
    pickle.dump(news, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
# TEST ------------------------------------------------------------------------------------------------------
merged_ii_weights['action']


Counter({u'50 Cent': 0.0,
         u'Aaliyah': 0.0,
         u'Aamir Khan': 2.5649493574615367,
         u'Aaron Eckhart': 2.5649493574615367,
         u'Aaron Sorkin': 2.5649493574615367,
         u'Abbie Cornish': 2.5649493574615367,
         u'Adam Baldwin': 2.5649493574615367,
         u'Adam Beach': 0.0,
         u'Adam Brody': 2.5649493574615367,
         u'Adam Lambert': 0.0,
         u'Adam Scott': 2.5649493574615367,
         u'Adewale Akinnuoye-Agbaje': 0.0,
         u'Adrianne Palicki': 0.0,
         u'Agnes Bruckner': 0.0,
         u'Aidan Gillen': 2.5649493574615367,
         u'Aishwarya Rai Bachchan': 0.0,
         u'Akira Kurosawa': 5.1298987149230735,
         u'Alan Arkin': 2.5649493574615367,
         u'Alan Cumming': 2.5649493574615367,
         u'Alan Tudyk': 0.0,
         u'Albert Brooks': 0.0,
         u'Alec Baldwin': 5.1298987149230735,
         u'Alex Pettyfer': 0.0,
         u'Alexandra Paul': 0.0,
         u'Alexis Bledel': 0.0,
         u'Alexis Denisof': 0.