In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval # using literal_eval we can make object from string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
import nltk
import string
from nltk.corpus import stopwords
from collections import Counter
# nltk.download('punkt')
# nltk.download('universal_tagset')
# nltk.download('stopwords')

In [None]:
# Load the raw inputs; every file is decoded as latin1.
# First database: a single CSV whose key column `imdbId` is renamed to
# `imdb_id` in the cleaning cell so it can be joined with movies_metadata.
movie_genre = pd.read_csv("../data/movieGenres/MovieGenre.csv", encoding="latin1")
# Second database: three CSVs; credits and keywords are later merged into
# movies_metadata on the `id` column.
credits = pd.read_csv("../data/movies/credits.csv", encoding="latin1")
keywords = pd.read_csv("../data/movies/keywords.csv", encoding="latin1")
movies_metadata = pd.read_csv('../data/movies/movies_metadata.csv', encoding="latin1")

In [3]:
# Rename imdbId so it matches the imdb_id column of movies_metadata.csv for the later join.
# keep=False discards *every* row whose key is duplicated instead of keeping one of them.
cleaned_movie_genre = (
    movie_genre
    .rename(columns={'imdbId': 'imdb_id'})
    .drop_duplicates(subset=['imdb_id'], keep=False)
)

cleaned_credits = credits.drop_duplicates(subset=['id'], keep=False)
cleaned_keywords = keywords.drop_duplicates(subset=['id'], keep=False)

# Take an explicit copy after each row filter: assigning a column on a filtered
# slice raises SettingWithCopyWarning and is not guaranteed to hit the intended frame.
cleaned_movies_metadata = movies_metadata[movies_metadata['imdb_id'] != '0'].copy()
# Strip the literal 'tt' prefix and convert to a number; values that cannot be
# parsed become NaN and are removed on the next line.
cleaned_movies_metadata['imdb_id'] = pd.to_numeric(
    cleaned_movies_metadata['imdb_id'].str.replace('tt', '', regex=False),
    errors='coerce',
)
cleaned_movies_metadata = cleaned_movies_metadata.dropna(subset=['imdb_id']).copy()
cleaned_movies_metadata['imdb_id'] = cleaned_movies_metadata['imdb_id'].astype(int)
cleaned_movies_metadata = cleaned_movies_metadata.drop_duplicates(subset=['imdb_id'], keep=False).copy()

# `id` must be an integer so that it compares equal to the `id` key of credits/keywords.
cleaned_movies_metadata['id'] = cleaned_movies_metadata['id'].astype(int)

# Inner joins: only films present in all four tables survive.
all_tables_merged = pd.merge(cleaned_movies_metadata, cleaned_credits, on='id')
all_tables_merged = pd.merge(all_tables_merged, cleaned_keywords, on='id')
all_tables_merged = pd.merge(all_tables_merged, cleaned_movie_genre, on='imdb_id')

print(all_tables_merged.shape)

all_tables_merged.to_csv('../data/preprocessed_data2.csv', index=False)

(37504, 32)


# Removing columns that are not required

In [None]:
# Columns that the later steps of this notebook do not use.
unused_columns = [
    'belongs_to_collection',
    'homepage',
    'poster_path',
    'video',
    'Poster',
    'original_title',
    'Title',
]
# Name the axis through `columns=`: the positional form `drop(label, 1)` was
# deprecated in pandas 1.3 and raises TypeError from pandas 2.0 onwards.
# A missing column still raises KeyError, exactly as the one-by-one drops did.
prepared_data = pd.read_csv('../data/preprocessed_data2.csv').drop(columns=unused_columns)

In [None]:
# Preview the first rows of the frame after the unused columns were dropped.
prepared_data.head()

We will keep only English-language movies released after 1970.

In [5]:
# Keep English-language titles. The explicit copy makes the column assignment
# below act on an independent frame rather than on a slice (SettingWithCopyWarning).
prepared_data = prepared_data[prepared_data.original_language == 'en'].copy()
# Parse release_date once and keep only the year. Rows whose date is missing get
# a NaN year, fail the `> 1970` comparison and are therefore dropped as well.
prepared_data['year'] = pd.DatetimeIndex(prepared_data['release_date']).year
prepared_data = prepared_data.loc[prepared_data.year > 1970].copy()
print(prepared_data.shape)

prepared_data.to_csv('../data/preprocessed_data2_2.csv', index=False)

(21567, 26)


# We will extract the top 6 actors, the top 2 companies, and the director

In [6]:
def get_director(crew):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    crew : list of dict
        Crew entries; the ones of interest carry 'job' and 'name' keys.

    Returns
    -------
    str or None
        The director's name, or None when `crew` is not a list or contains
        no entry whose 'job' equals 'Director'.
    """
    # Same guard as get_actors / get_production_companies: a missing value
    # (for example NaN) yields None instead of raising.
    if not isinstance(crew, list):
        return None
    for member in crew:
        # .get() skips entries without a 'job' key instead of raising KeyError.
        if member.get('job') == 'Director':
            return member.get('name')
    return None

def get_actors(cast, num_of_actors = 6):
    """Join the names of the first `num_of_actors` cast members with '|'.

    Parameters
    ----------
    cast : list of dict
        Cast entries, each with a 'name' key.
    num_of_actors : int
        Maximum number of names to keep, taken from the front of the list.

    Returns
    -------
    str or None
        The '|'-separated names ('' for an empty list), or None when `cast`
        is not a list.
    """
    if not isinstance(cast, list):
        return None
    # Slicing already copes with casts shorter than the limit.
    leading_names = [member['name'] for member in cast][:num_of_actors]
    return '|'.join(leading_names)

def get_actors_list(cast, num_of_actors = 6):
    """Return the names of the first `num_of_actors` cast members as a list.

    Parameters
    ----------
    cast : list of dict
        Cast entries, each with a 'name' key.
    num_of_actors : int
        Maximum number of names to keep, taken from the front of the list.

    Returns
    -------
    list of str or None
        The leading names (possibly fewer than the limit), or None when
        `cast` is not a list.
    """
    if not isinstance(cast, list):
        return None
    every_name = [member['name'] for member in cast]
    # Slicing already copes with casts shorter than the limit.
    return every_name[:num_of_actors]

def get_production_companies(companies, num_of_companies = 2):
    """Join the names of the first `num_of_companies` companies with '|'.

    Parameters
    ----------
    companies : list of dict
        Production-company entries, each with a 'name' key.
    num_of_companies : int
        Maximum number of names to keep, taken from the front of the list.

    Returns
    -------
    str or None
        The '|'-separated names ('' for an empty list), or None when
        `companies` is not a list.
    """
    if not isinstance(companies, list):
        return None
    # The former `prod_companies == 0` branch compared a list with an int and
    # could never run; joining an empty list already produces ''.
    prod_companies = [company['name'] for company in companies]
    return '|'.join(prod_companies[:num_of_companies])

In [7]:
# Rows without production companies cannot be parsed below. Rebinding to a copy
# (instead of dropping in place) avoids mutating a slice of an earlier frame.
prepared_data = prepared_data.dropna(subset=['production_companies']).copy()

# The list-valued columns are stored as strings; parse each of them once.
# 'cast' feeds two derived columns, so its parsed form is reused.
parsed_crew = prepared_data['crew'].apply(literal_eval)
parsed_cast = prepared_data['cast'].apply(literal_eval)
parsed_companies = prepared_data['production_companies'].apply(literal_eval)

directors = parsed_crew.apply(get_director)
actors = parsed_cast.apply(get_actors)
top_actors = parsed_cast.apply(get_actors_list)
top_2_companies = parsed_companies.apply(get_production_companies)

prepared_data['director'] = directors
prepared_data['top_actors'] = top_actors
prepared_data['actors'] = actors
prepared_data['companies'] = top_2_companies
prepared_data.to_csv('../data/preprocessed_data2_3.csv', index=False)

In [289]:
# Preview the frame with the derived director / top_actors / actors / companies columns.
prepared_data.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,popularity,production_companies,production_countries,...,crew,keywords,Imdb Link,IMDB Score,Genre,year,director,top_actors,actors,companies
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",http://www.imdb.com/title/tt114709,8.3,Animation|Adventure|Comedy,1995.0,John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,Pixar Animation Studios
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,113497,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",http://www.imdb.com/title/tt113497,6.9,Action|Adventure|Family,1995.0,Joe Johnston,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,TriStar Pictures|Teitler Film
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,113228,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",http://www.imdb.com/title/tt113228,6.6,Comedy|Romance,1995.0,Howard Deutch,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Walter Matthau|Jack Lemmon|Ann-Margret|Sophia ...,Warner Bros.|Lancaster Gate
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",http://www.imdb.com/title/tt114885,5.7,Comedy|Drama|Romance,1995.0,Forest Whitaker,"[Whitney Houston, Angela Bassett, Loretta Devi...",Whitney Houston|Angela Bassett|Loretta Devine|...,Twentieth Century Fox Film Corporation
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,113041,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,"[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",http://www.imdb.com/title/tt113041,5.9,Comedy|Family|Romance,1995.0,Charles Shyer,"[Steve Martin, Diane Keaton, Martin Short, Kim...",Steve Martin|Diane Keaton|Martin Short|Kimberl...,Sandollar Productions|Touchstone Pictures


In [8]:
# Flatten the per-film actor lists into a single list of names (repeats are kept).
actors_flat_list = []
for movie_actors in prepared_data['top_actors']:
    actors_flat_list.extend(movie_actors)

We will take the 200 most frequent actors (the ones that appear in the largest number of films).

In [9]:
# Count how often each name occurs in the flattened list.
main_actors = list(actors_flat_list)
main_actors_dict = dict(Counter(main_actors))
# The sort is stable, so names with equal counts keep their first-seen order.
actors_by_frequency = sorted(main_actors_dict.items(), key=lambda entry: entry[1], reverse=True)
# Keep only the names of the 200 most frequent actors.
main_actors_sorted = [name for name, _ in actors_by_frequency[:200]]

In [292]:
# Show the selected actor names, most frequent first.
print(main_actors_sorted)

['Robert De Niro', 'Samuel L. Jackson', 'Michael Caine', 'Christopher Walken', 'Bruce Willis', 'Harvey Keitel', 'Nicolas Cage', 'Steve Buscemi', 'Susan Sarandon', 'Morgan Freeman', 'Donald Sutherland', 'John Goodman', 'Danny Glover', 'Gene Hackman', 'Robert Duvall', 'Alec Baldwin', 'Jeff Bridges', 'Ed Harris', 'Liam Neeson', 'John Hurt', 'Helen Mirren', 'Nick Nolte', 'Eric Roberts', 'James Franco', 'Malcolm McDowell', 'Willem Dafoe', 'Julianne Moore', 'John Cusack', 'Ben Kingsley', 'Robin Williams', 'Anthony Hopkins', 'Woody Harrelson', 'William Hurt', 'Tom Hanks', 'Val Kilmer', 'Christopher Plummer', 'Burt Reynolds', 'Ray Liotta', 'Meryl Streep', 'John Leguizamo', 'Dennis Hopper', 'Christopher Lee', 'Robert Downey Jr.', 'Sam Neill', 'Brian Cox', 'Ned Beatty', 'Christopher Lloyd', 'Whoopi Goldberg', 'Dennis Quaid', 'Dan Aykroyd', 'James Caan', 'Kevin Bacon', 'William H. Macy', 'Johnny Depp', 'Forest Whitaker', 'Martin Sheen', 'John Travolta', 'Lance Henriksen', 'Helena Bonham Carter', 

In [10]:
# Columns carried forward: the target ('IMDB Score') plus the raw feature columns.
included_columns = ['IMDB Score', 'overview', 'Genre', 'actors', 'runtime', 'director', 'budget', 'companies', 'top_actors']
# Reload from disk; after the CSV round trip the 'top_actors' lists are strings
# again, which is why a later cell applies literal_eval to that column.
preprocessed_data2_3 = pd.read_csv('../data/preprocessed_data2_3.csv')
data_to_use = preprocessed_data2_3[included_columns]
# Shape before and after removing rows that have a missing value in any selected column.
print(data_to_use.shape)
data_to_use = data_to_use.dropna()
print(data_to_use.shape)
data_to_use.head()

(21567, 9)
(15041, 9)


Unnamed: 0,IMDB Score,overview,Genre,actors,runtime,director,budget,companies,top_actors
0,8.3,"Led by Woody, Andy's toys live happily in his ...",Animation|Adventure|Comedy,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,81.0,John Lasseter,30000000,Pixar Animation Studios,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim..."
1,6.9,When siblings Judy and Peter discover an encha...,Action|Adventure|Family,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,104.0,Joe Johnston,65000000,TriStar Pictures|Teitler Film,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D..."
2,6.6,A family wedding reignites the ancient feud be...,Comedy|Romance,Walter Matthau|Jack Lemmon|Ann-Margret|Sophia ...,101.0,Howard Deutch,0,Warner Bros.|Lancaster Gate,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret..."
3,5.7,"Cheated on, mistreated and stepped on, the wom...",Comedy|Drama|Romance,Whitney Houston|Angela Bassett|Loretta Devine|...,127.0,Forest Whitaker,16000000,Twentieth Century Fox Film Corporation,"['Whitney Houston', 'Angela Bassett', 'Loretta..."
4,5.9,Just when George Banks has recovered from his ...,Comedy|Family|Romance,Steve Martin|Diane Keaton|Martin Short|Kimberl...,106.0,Charles Shyer,0,Sandollar Productions|Touchstone Pictures,"['Steve Martin', 'Diane Keaton', 'Martin Short..."


In [11]:
def check_if_contains(s, main_actors=None):
    """Tell whether a film's cast shares at least one name with the main actors.

    Parameters
    ----------
    s : iterable of str
        Actor names of one film (already parsed into a list).
    main_actors : iterable of str, optional
        Names to match against. Defaults to the notebook-level
        `main_actors_sorted` list.

    Returns
    -------
    bool
        True when at least one name of `s` is among `main_actors`.
    """
    if main_actors is None:
        main_actors = main_actors_sorted
    # Compare the names verbatim. The earlier strip('[]').split(',')[0] step cut
    # every name at its first comma, so a name such as 'Sammy Davis, Jr.' could
    # never match its own entry in the main-actor list.
    return not set(s).isdisjoint(main_actors)

Drop films that do not feature any of these 200 most frequent actors.

In [12]:
# Flag the films whose stored cast list contains at least one main actor.
keep_actors = data_to_use['top_actors'].apply(literal_eval).apply(check_if_contains)
# Filter with the boolean mask directly. Adding a helper column to the frame and
# dropping it in place afterwards mutated a filtered slice and triggered
# SettingWithCopyWarning; the resulting rows and columns are the same.
data_to_use = data_to_use[keep_actors.astype(bool)].copy()
data_to_use.shape

(5612, 9)

In [13]:
def get_words(message, tokenizer = None):
    """Split a message into tokens.

    Without a tokenizer the message is lower-cased and split on '|'.
    With a tokenizer, `tokenizer(message)` is iterated and the `.text`
    attribute of every item is collected.
    """
    if not tokenizer:
        return message.lower().split('|')
    return [token.text for token in tokenizer(message)]

def create_dictionary(messages, min_count = 5, tokenizer = None):
    """Build a word -> 1-based index mapping from a collection of messages.

    A word is counted once per message, however often it repeats inside that
    message. Words found in fewer than `min_count` messages are discarded; the
    remaining words are numbered 1, 2, ... in order of first appearance.
    """
    messages_per_word = {}
    for message in messages:
        # dict.fromkeys removes repeats within a message while keeping word order.
        for word in dict.fromkeys(get_words(message, tokenizer)):
            messages_per_word[word] = messages_per_word.get(word, 0) + 1

    frequent_words = [
        word for word, seen_in in messages_per_word.items() if seen_in >= min_count
    ]
    return {word: position for position, word in enumerate(frequent_words, start=1)}


def transform_text(messages, word_dictionary, tokenizer = None):
    """Encode messages as a matrix of word counts.

    Parameters
    ----------
    messages : sequence of str
        Messages to encode; one output row per message, in iteration order.
    word_dictionary : dict
        Word -> 1-based column index.
    tokenizer : callable, optional
        Forwarded to get_words.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(messages), len(word_dictionary)) in which
        entry [i, k-1] counts the occurrences in message i of the word that
        is mapped to index k. Words missing from the dictionary are ignored.
    """
    words_arr = np.zeros((len(messages), len(word_dictionary)))
    # Iterate positionally: `messages[i]` is a label lookup on a pandas Series,
    # which fails or picks the wrong row when the index is not 0..n-1.
    for row, message in enumerate(messages):
        for word in get_words(message, tokenizer):
            column = word_dictionary.get(word)
            if column is not None:
                # Dictionary indices start at 1, array columns at 0.
                words_arr[row, column - 1] += 1
    return words_arr

Split into train and test set

In [14]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
train_dataset, test_dataset = train_test_split(data_to_use, random_state=0, test_size=0.3)

# Renumber the rows of each split from 0, discarding the old index.
train_data = train_dataset.reset_index(drop=True)
test_data = test_dataset.reset_index(drop=True)

for split in (train_data, test_data):
    print(split.shape)

for split, path in ((train_data, '../data/train2.csv'), (test_data, '../data/test2.csv')):
    split.to_csv(path, index=False)

(3928, 9)
(1684, 9)


Now we can take this data and binarize it (encode the categorical and text columns as numeric matrices); afterwards we will apply regression algorithms.

In [15]:
train_data = pd.read_csv('../data/train2.csv')
test_data = pd.read_csv('../data/test2.csv')


def _encode_column(column, min_count=5):
    """Fit a dictionary on the training column and encode both splits with it.

    Returns the dictionary, the training matrix and the test matrix, in that order.
    """
    train_values = train_data[column].astype(str).values
    test_values = test_data[column].astype(str).values
    # The dictionary is built from the training split only and reused for the test split.
    dictionary = create_dictionary(train_values, min_count=min_count)
    train_matrix = transform_text(train_values, dictionary)
    test_matrix = transform_text(test_values, dictionary)
    return dictionary, train_matrix, test_matrix


genre_dictionary, train_genre_matrix, test_genre_matrix = _encode_column('Genre')
director_dictionary, train_director_matrix, test_director_matrix = _encode_column('director')
actors_dictionary, train_actors_matrix, test_actors_matrix = _encode_column('actors')
companies_dictionary, train_companies_matrix, test_companies_matrix = _encode_column('companies')

In [16]:
def get_words_nltk(message, tokenizer = None):
    """Tokenize the first sentence of `message`, dropping punctuation and stopwords.

    Parameters
    ----------
    message : str
        Text to tokenize.
    tokenizer : unused
        Accepted only so the signature matches get_words.

    Returns
    -------
    list of str
        Tokens of the first sentence that are neither punctuation nor English
        stopwords; an empty list when no sentence is found.
    """
    sentences = nltk.tokenize.sent_tokenize(message)
    # Guard against messages without any sentence (e.g. an empty string):
    # indexing [0] on an empty result would raise IndexError.
    if not sentences:
        return []
    # NOTE(review): only the first sentence is used and tokens are not
    # lower-cased before the stopword test, so capitalised stopwords such as
    # 'The' presumably survive - confirm that both are intended.
    tokens = nltk.tokenize.word_tokenize(sentences[0])
    # A set gives constant-time membership tests instead of scanning the
    # stopword list once per token.
    stopword_set = set(stopwords.words('english'))
    # `in string.punctuation` is a substring test, exactly as before.
    return [
        token for token in tokens
        if token not in string.punctuation and token not in stopword_set
    ]

def create_dictionary_nltk(messages, min_count = 5, tokenizer = None):
    """Build a word -> 1-based index mapping using the NLTK-based tokenization.

    A word is counted once per message, however often it repeats inside that
    message. Words found in fewer than `min_count` messages are discarded; the
    remaining words are numbered 1, 2, ... in order of first appearance.
    """
    messages_per_word = {}
    for message in messages:
        # dict.fromkeys removes repeats within a message while keeping word order.
        for word in dict.fromkeys(get_words_nltk(message, tokenizer)):
            messages_per_word[word] = messages_per_word.get(word, 0) + 1

    frequent_words = [
        word for word, seen_in in messages_per_word.items() if seen_in >= min_count
    ]
    return {word: position for position, word in enumerate(frequent_words, start=1)}

def transform_text_nltk(messages, word_dictionary, tokenizer = None):
    """Encode messages as a matrix of word counts using the NLTK-based tokenization.

    Parameters
    ----------
    messages : sequence of str
        Messages to encode; one output row per message, in iteration order.
    word_dictionary : dict
        Word -> 1-based column index.
    tokenizer : optional
        Forwarded to get_words_nltk.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(messages), len(word_dictionary)) in which
        entry [i, k-1] counts the occurrences in message i of the word that
        is mapped to index k. Words missing from the dictionary are ignored.
    """
    words_arr = np.zeros((len(messages), len(word_dictionary)))
    # Iterate positionally: `messages[i]` is a label lookup on a pandas Series,
    # which fails or picks the wrong row when the index is not 0..n-1.
    for row, message in enumerate(messages):
        for word in get_words_nltk(message, tokenizer):
            column = word_dictionary.get(word)
            if column is not None:
                # Dictionary indices start at 1, array columns at 0.
                words_arr[row, column - 1] += 1
    return words_arr

In [18]:
# Convert both overview columns the same way as every other encoded column
# (.astype(str).values), so the helpers always receive a plain array of strings;
# previously the dictionary was built from the raw Series instead.
train_overviews = train_data['overview'].astype(str).values
test_overviews = test_data['overview'].astype(str).values

# The dictionary is fitted on the training split only; a word must occur in at
# least 20 training overviews to get a column.
overview_dictionary2 = create_dictionary_nltk(train_overviews, min_count = 20)
train_overview_matrix2 = transform_text_nltk(train_overviews, overview_dictionary2)
test_overview_matrix2 = transform_text_nltk(test_overviews, overview_dictionary2)

In [19]:
# Rows = training films, columns = words kept in overview_dictionary2.
print(train_overview_matrix2.shape)

(3928, 429)


In [20]:
# Encoded blocks to append to the right of the training frame; each block is
# labelled with the keys of the dictionary that produced it.
# NOTE(review): this cell rebinds train_data, so running it twice appends the
# blocks twice. A word present in more than one dictionary would also produce
# duplicate column names - confirm this cannot happen with the data at hand.
train_feature_blocks = [
    (train_genre_matrix, genre_dictionary),
    (train_director_matrix, director_dictionary),
    (train_actors_matrix, actors_dictionary),
    (train_companies_matrix, companies_dictionary),
    (train_overview_matrix2, overview_dictionary2),
]
train_data = pd.concat(
    [train_data]
    + [pd.DataFrame(matrix, columns=dictionary.keys()) for matrix, dictionary in train_feature_blocks],
    axis=1,
)

train_data.to_csv('../data/train_final2.csv', index = False)

In [21]:
# Shape of the training frame after the encoded columns were appended.
train_data.shape

(3928, 1959)

In [22]:
# Encoded blocks to append to the right of the test frame; the column labels come
# from the dictionaries fitted on the training split, so both frames share one layout.
# NOTE(review): this cell rebinds test_data, so running it twice appends the blocks twice.
test_feature_blocks = [
    (test_genre_matrix, genre_dictionary),
    (test_director_matrix, director_dictionary),
    (test_actors_matrix, actors_dictionary),
    (test_companies_matrix, companies_dictionary),
    (test_overview_matrix2, overview_dictionary2),
]
test_data = pd.concat(
    [test_data]
    + [pd.DataFrame(matrix, columns=dictionary.keys()) for matrix, dictionary in test_feature_blocks],
    axis=1,
)

test_data.to_csv('../data/test_final2.csv', index = False)

In [23]:
# Shape of the test frame after the encoded columns were appended.
test_data.shape

(1684, 1959)

Print columns to file

In [24]:
import sys  # no longer needed by this cell; kept in case other cells rely on it

# Write one column name per line. Passing file= to print leaves sys.stdout
# untouched, and the with-block closes the file even when a write fails -
# the earlier stdout swap left both the file and stdout in a bad state on error.
with open('columns2.txt', 'w') as f:
    for col in train_data.columns:
        print(col, file=f)
