In [None]:
import json
import re
import string
import spacy
from spacy.matcher import Matcher

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords

# One-time setup: download the NLTK resources this notebook needs.
# Only has to be run the first time the notebook is used on a machine.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Award year to analyse. run_nominees prints each category's nominees as it
# goes and returns them as a dict keyed by category name.
# NOTE: run_nominees is defined in a later cell of this notebook, so on a fresh
# kernel the function-definition cell has to be executed before this one.
year = 2015
nominees = run_nominees(year)

In [None]:
# Show only the categories whose name contains 'act' (actor / actress awards).
for award_name, award_nominees in nominees.items():
    if 'act' not in award_name:
        continue
    print(award_name,':', award_nominees)

In [None]:
# Single award category inspected by the exploratory cells below.
category = 'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television'
# Cleaned tweet texts for the chosen year (get_tweet_data is defined in a later cell).
tweets = get_tweet_data(year)

In [None]:
# Keep only the tweets that match the pattern built from the category name.
category_tweets = tweets_contain(category, tweets)

# Count candidate names: person names for acting awards, NNP-tagged tokens otherwise.
pronouns_dictionary = (
    get_person_names(list_of_tweets=category_tweets)
    if 'actor' in category or 'actress' in category
    else get_NNP(category_tweets)
)

# Zero out candidates that are merely words of the category name itself.
# remove_category_tokens edits pronouns_dictionary in place and returns that
# same object, so both names below refer to one dictionary.
filtered_category_tweets = remove_category_tokens(category, pronouns_dictionary)

# Keep the candidates whose count exceeds 80% of the highest count.
category_nominees = get_top_percent(filtered_category_tweets, percentile=0.8)

In [None]:
# Candidate counts (category words already zeroed by remove_category_tokens)
# followed by the shortlisted nominees.
print(pronouns_dictionary)
print(category_nominees)

In [None]:
# Number of tweets that matched the category, then the first ten of them.
print(len(category_tweets))
category_tweets[0:10]

In [None]:
# Award categories used for the 2013 and 2015 ceremonies.
# (To debug a single category, temporarily shrink this list to one entry.)
OFFICIAL_AWARDS_1315 = [
    'cecil b. demille award',
    'best motion picture - drama',
    'best performance by an actress in a motion picture - drama',
    'best performance by an actor in a motion picture - drama',
    'best motion picture - comedy or musical',
    'best performance by an actress in a motion picture - comedy or musical',
    'best performance by an actor in a motion picture - comedy or musical',
    'best animated feature film',
    'best foreign language film',
    'best performance by an actress in a supporting role in a motion picture',
    'best performance by an actor in a supporting role in a motion picture',
    'best director - motion picture',
    'best screenplay - motion picture',
    'best original score - motion picture',
    'best original song - motion picture',
    'best television series - drama',
    'best performance by an actress in a television series - drama',
    'best performance by an actor in a television series - drama',
    'best television series - comedy or musical',
    'best performance by an actress in a television series - comedy or musical',
    'best performance by an actor in a television series - comedy or musical',
    'best mini-series or motion picture made for television',
    'best performance by an actress in a mini-series or motion picture made for television',
    'best performance by an actor in a mini-series or motion picture made for television',
    'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
    'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
]

# Count how many tweets match each category.
# NOTE: this loop rebinds the notebook-level names `category` and
# `category_tweets`; after it finishes they hold the values for the last
# category in the list.
d = {}
for category in OFFICIAL_AWARDS_1315:
    category_tweets = tweets_contain(category, tweets)
    d[category] = len(category_tweets)

for award_name, tweet_count in d.items():
    print(award_name,':', tweet_count)

In [None]:
# NOTE(review): displays the whole list. If the per-category counting loop has
# been run, this is the tweet list of the last category it processed.
category_tweets

In [None]:
# Narrow the tweets down to those containing every search word.
# The check is a case-insensitive substring test, so 'ed' also matches inside
# longer words.
# Alternative word list tried earlier: ['actor', 'supporting', 'series', 'motion picture']
old_list = tweets
for word in ['ed', 'harris']:
    new_list = [tweet for tweet in old_list if word in tweet.lower()]
    old_list = new_list

for matching_tweet in old_list:
    print(matching_tweet)
print(len(old_list))

In [None]:
def get_tweet_data(year):
    '''
    Load the tweets of one award year and return their cleaned text.

    Input: the award year; the data is read from the file 'gg<year>.json'
           (relative path)
    Returns: a list of strings, one per tweet, each passed through remove_symbols
    Raises: whatever open() / json.load() raise for a missing or malformed file
    *Assumes the file holds a list of objects that each have a 'text' field
    '''
    file_string = 'gg' + str(year) + '.json'

    # Be explicit about the encoding so that reading the file does not depend
    # on the platform's default text encoding.
    with open(file_string, 'r', encoding='utf-8') as f:
        raw_tweets = json.load(f)

    # keep only the 'text' field of every tweet, with rt/@/#/parenthesis tokens removed
    return [remove_symbols(tweet['text']) for tweet in raw_tweets]

def remove_symbols(a_tweet):
    '''
    Input: a tweet as a string
    Returns: the tweet re-joined with single spaces after dropping every
             whitespace-separated token that is 'rt' (in any case) or that
             starts with '@', '#', '(' or ')'
    '''
    skipped_first_chars = ('@', '#', '(', ')')
    kept_tokens = [
        token
        for token in a_tweet.split()
        if token.lower() != 'rt' and not token.startswith(skipped_first_chars)
    ]
    return ' '.join(kept_tokens)

def tweets_contain(category_name, tweets):
    '''
    Input: an award category name and a list of tweet strings
    Returns: a list of the tweets in which the non-stop-word tokens of the
             category name appear in order (case-insensitive), with any text
             allowed in between
    '''
    # English stop words plus a few words that are dropped from category names
    ignored_words = set(stopwords.words('english')) | {'-', 'performance', 'comedy', 'television'}

    keywords = [
        token
        for token in nltk.word_tokenize(category_name)
        if token not in ignored_words
    ]

    # Every keyword is followed by a lazy "anything" gap.
    # NOTE(review): keywords are inserted unescaped, so regex metacharacters in a
    # category name (e.g. '.') act as pattern syntax - confirm this is intended.
    # NOTE: when no keyword is left the pattern is empty and matches every tweet.
    pattern = ''.join(keyword + '.*?' for keyword in keywords)

    category_regex = re.compile(pattern, re.IGNORECASE)
    return [tweet for tweet in tweets if category_regex.search(tweet)]

def get_NNP(tweets_list):
    '''
    Input: a list of strings
    Returns: a dictionary where keys = lower-cased tokens that the NLTK tagger
             labelled 'NNP', values = number of occurrences over all the strings
    '''
    counts = {}
    for tweet in tweets_list:
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(tweet))
        proper_nouns = (token.lower() for token, tag in tagged_tokens if tag == 'NNP')
        for name in proper_nouns:
            counts[name] = counts.get(name, 0) + 1
    return counts

def get_top_percent(dictionary, percentile):
    '''
    Input: a dictionary of key -> count, and a fraction such as 0.8
    Returns: a list of the keys whose count is strictly greater than
             percentile * (largest count), in the dictionary's iteration order;
             an empty list when the dictionary is empty
    '''
    if dictionary == {}:
        return []
    cutoff = max(dictionary.values()) * percentile
    return [key for key, count in dictionary.items() if count > cutoff]

def remove_category_tokens(category, pronouns_dict):
    '''
    Input: a category name and a dictionary of candidate -> count
    Returns: the same dictionary object, modified in place: the count of every
             key that is a token of the category name, or one of a fixed list
             of award-related words, is set to 0 (keys are not deleted)
    '''
    extra_tokens = ['rt', 'motion picture', 'golden', 'globes', 'television series', 'tv series', 'mini-']
    for blocked in nltk.word_tokenize(category) + extra_tokens:
        if blocked not in pronouns_dict:
            continue
        pronouns_dict[blocked] = 0
    return pronouns_dict

def get_category_nominees(category, tweets, percentile=0.9):
    '''
    Input: an award category name, a list of tweet strings, and optionally the
           fraction of the highest count a candidate must exceed (default 0.9)
    Returns: a list of candidate nominee strings for the category
    '''
    # keep only the tweets that match the category-name pattern
    category_tweets = tweets_contain(category, tweets)

    # count candidates: person names for acting awards, NNP-tagged tokens otherwise
    if 'actor' in category or 'actress' in category:
        candidate_counts = get_person_names(list_of_tweets=category_tweets)
    else:
        candidate_counts = get_NNP(category_tweets)

    # zero out candidates that are just words of the category name
    # (remove_category_tokens modifies the dictionary in place and returns it)
    candidate_counts = remove_category_tokens(category, candidate_counts)

    # keep the candidates whose count exceeds `percentile` times the highest count
    return get_top_percent(candidate_counts, percentile=percentile)

def run_nominees(year):
    '''
    Input: the award year
    Returns: a dictionary where keys = award category names, values = the
             nominees found for that category joined into one space-separated string
    *Prints every category with its nominee string as it is computed
    *Years 2013 and 2015 use the OFFICIAL_AWARDS_1315 list, any other year OFFICIAL_AWARDS_1819
    NOTE(review): names that themselves contain spaces are joined with the same
    single space, so individual nominees cannot be split back out of the string.
    '''
    OFFICIAL_AWARDS_1315 = [
        'cecil b. demille award',
        'best motion picture - drama',
        'best performance by an actress in a motion picture - drama',
        'best performance by an actor in a motion picture - drama',
        'best motion picture - comedy or musical',
        'best performance by an actress in a motion picture - comedy or musical',
        'best performance by an actor in a motion picture - comedy or musical',
        'best animated feature film',
        'best foreign language film',
        'best performance by an actress in a supporting role in a motion picture',
        'best performance by an actor in a supporting role in a motion picture',
        'best director - motion picture',
        'best screenplay - motion picture',
        'best original score - motion picture',
        'best original song - motion picture',
        'best television series - drama',
        'best performance by an actress in a television series - drama',
        'best performance by an actor in a television series - drama',
        'best television series - comedy or musical',
        'best performance by an actress in a television series - comedy or musical',
        'best performance by an actor in a television series - comedy or musical',
        'best mini-series or motion picture made for television',
        'best performance by an actress in a mini-series or motion picture made for television',
        'best performance by an actor in a mini-series or motion picture made for television',
        'best performance by an actress in a supporting role in a series, mini-series or motion picture made for television',
        'best performance by an actor in a supporting role in a series, mini-series or motion picture made for television',
    ]
    OFFICIAL_AWARDS_1819 = [
        'best motion picture - drama',
        'best motion picture - musical or comedy',
        'best performance by an actress in a motion picture - drama',
        'best performance by an actor in a motion picture - drama',
        'best performance by an actress in a motion picture - musical or comedy',
        'best performance by an actor in a motion picture - musical or comedy',
        'best performance by an actress in a supporting role in any motion picture',
        'best performance by an actor in a supporting role in any motion picture',
        'best director - motion picture',
        'best screenplay - motion picture',
        'best motion picture - animated',
        'best motion picture - foreign language',
        'best original score - motion picture',
        'best original song - motion picture',
        'best television series - drama',
        'best television series - musical or comedy',
        'best television limited series or motion picture made for television',
        'best performance by an actress in a limited series or a motion picture made for television',
        'best performance by an actor in a limited series or a motion picture made for television',
        'best performance by an actress in a television series - drama',
        'best performance by an actor in a television series - drama',
        'best performance by an actress in a television series - musical or comedy',
        'best performance by an actor in a television series - musical or comedy',
        'best performance by an actress in a supporting role in a series, limited series or motion picture made for television',
        'best performance by an actor in a supporting role in a series, limited series or motion picture made for television',
        'cecil b. demille award',
    ]

    categories = OFFICIAL_AWARDS_1315 if year in [2013, 2015] else OFFICIAL_AWARDS_1819

    tweets = get_tweet_data(year)
    nominees = {}
    for category_name in categories:
        found = get_category_nominees(category_name, tweets)
        nominees[category_name] = ' '.join(found)
        print(category_name, ':', nominees[category_name])
    return nominees




def get_film_titles(list_of_tweets):
    '''
    Input: a list of strings
    Returns: a dictionary where keys = lower-cased film titles, values = number of tweets in which that title was found
    *Title detection is delegated to extract_title, which is not defined in the visible part of this notebook
    '''
    titles_dictionary = {}
    for tweet in list_of_tweets:
        title = extract_title(tweet)
        # Check for a missing title before lower-casing it, in case
        # extract_title yields None or '' for a tweet without a title.
        if not title:
            continue
        title = title.lower()
        # Count under `title`; the previous code indexed with the undefined name `name`.
        titles_dictionary[title] = titles_dictionary.get(title, 0) + 1
    return titles_dictionary
    
def get_person_names(list_of_tweets):
    '''
    Input: a list of strings
    Returns: a dictionary where keys = lower-cased names returned by extract_full_name (at most one per string), values = number of strings that produced that name
    *Loads the spaCy model 'en_core_web_sm' on every call
    '''
    nlp = spacy.load('en_core_web_sm')
    names_dictionary = {}
    for tweet in list_of_tweets:
        found = extract_full_name(nlp(tweet), nlp)
        if not found:
            continue
        key = found.lower()
        names_dictionary[key] = names_dictionary.get(key, 0) + 1
    return names_dictionary
    
    
def extract_full_name(nlp_doc, nlp):
    '''
    Input: a spaCy Doc and the pipeline object whose vocab the matcher should use
    Returns: the text of the first match of two consecutive PROPN tokens reported by the matcher, or None when there is no match
    '''
    # NOTE(review): `add(key, None, pattern)` is the spaCy v2 call form; spaCy v3
    # expects `add(key, [pattern])` - confirm against the installed version.
    name_matcher = Matcher(nlp.vocab)
    name_matcher.add('FULL_NAME', None, [{'POS': 'PROPN'}, {'POS': 'PROPN'}])

    first_match = next(iter(name_matcher(nlp_doc)), None)
    if first_match is None:
        return None
    _, start, end = first_match
    return nlp_doc[start:end].text
    
def remove_names(string):
    '''
    Input: a string
    Returns: the string with the text of every entity found by the spaCy model removed (each replaced by '')
    *Loads the spaCy model 'en_core_web_sm' on every call
    '''
    nlp = spacy.load('en_core_web_sm')
    # the entities are taken from the original text before any replacement happens
    entities = nlp(string).ents
    for entity in entities:
        string = string.replace(str(entity), '')
    return string

In [None]:
from fuzzywuzzy import fuzz

def find_movie_title(string, imdb_movies=None):
    '''
    Input: a string, and an iterable of candidate titles to compare it with
    Returns: the candidate with the highest fuzz.token_sort_ratio score against
             the string (the first one on ties), or '' when there are no
             candidates or none scores above 0
    '''
    # The candidate titles are a parameter because the notebook never defined
    # where they come from; without candidates there is nothing to match.
    if imdb_movies is None:
        imdb_movies = []

    max_ratio = 0
    max_title = ''
    for title in imdb_movies:
        r = fuzz.token_sort_ratio(string, title)
        # strict comparison: the earliest best-scoring candidate wins
        if r > max_ratio:
            max_ratio = r
            max_title = title
    return max_title

# Compare three fuzzywuzzy scores on a title and a reordered, noisier variant.
Str1 = "life of pi"
Str2 = "pi life golden"

Ratio = fuzz.ratio(Str1.lower(), Str2.lower())
Partial_Ratio = fuzz.partial_ratio(Str1.lower(), Str2.lower())
Token_Sort_Ratio = fuzz.token_sort_ratio(Str1, Str2)

for score in (Ratio, Partial_Ratio, Token_Sort_Ratio):
    print(score)


In [None]:
from imdb import IMDb

# IMDb client, also used by the lookup cells further down
ia = IMDb()

# fetch one title by its IMDb id
movie = ia.get_movie('0133093')

# list the directors of that title
print('Directors:')
for credit in movie['directors']:
    print(credit['name'])

# list the genres of that title
print('Genres:')
for genre_name in movie['genres']:
    print(genre_name)

# look people up by name and show id + name of every hit
people = ia.search_person('Mel Gibson')
for hit in people:
    print(hit.personID, hit['name'])

In [None]:
# Two free-text queries of the kind the tweets produce (title words mixed with a person's name).
m1 = 'skyfall adele'
m2 = 'quentin tarantino django'

# Compare what the title search and the keyword search return for the same query.
# `ia` is the IMDb client created in the previous IMDb cell.
# movies = ia.search_movie('django ed')
print(ia.search_movie(m2))
print(ia.search_keyword(m2))

In [None]:
# Print the 'year' field of every title returned for the m2 query.
movies = ia.search_movie(m2)
for search_hit in movies:
    print(search_hit['year'])

In [None]:
# Results for a partial title, then the title of the first hit for the full title.
# NOTE: the [0] index raises IndexError if the second search returns no results.
print(ia.search_movie('django '))
print(ia.search_movie('django unchained')[0]['title'])


In [None]:
# Show the NLTK part-of-speech tags assigned to an all-lower-case query.
m2 = 'quentin tarantino django'
tokens = nltk.word_tokenize(m2)
tags = nltk.pos_tag(tokens)

print(tags)

In [None]:
# Rebinds m2 to a longer test string and displays what remove_names leaves
# after removing the entities found by the spaCy model.
m2 = 'quentin tarantino django the bot test me David'
remove_names(m2)
