In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval # using literal_eval we can make object from string
from sklearn.model_selection import train_test_split
import os
import spacy
import en_core_web_sm


In [3]:
# First data source: the MovieGenre listing.
movie_genre = pd.read_csv("../data/movieGenres/MovieGenre.csv", encoding="latin1")

# Second data source: the credits, keywords and metadata tables, read in that order.
credits, keywords, movies_metadata = (
    pd.read_csv(csv_path, encoding="latin1")
    for csv_path in (
        "../data/movies/credits.csv",
        "../data/movies/keywords.csv",
        '../data/movies/movies_metadata.csv',
    )
)

# Report (rows, columns) of every table in load order.
for loaded_table in (movie_genre, credits, keywords, movies_metadata):
    print(loaded_table.shape)

# movies_metadata.head()

(40108, 6)
(45476, 3)
(46419, 2)
(45466, 24)


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# movie_genre names its key 'imdbId'; rename it to 'imdb_id' so it matches
# movies_metadata.csv for the final join.
# keep=False removes every row whose key occurs more than once, so no copy of a
# duplicated key survives in any of the tables.
cleaned_movie_genre = (
    movie_genre
    .rename(columns={'imdbId': 'imdb_id'})
    .drop_duplicates(subset=['imdb_id'], keep=False)
)

cleaned_credits = credits.drop_duplicates(subset=['id'], keep=False)
cleaned_keywords = keywords.drop_duplicates(subset=['id'], keep=False)

# Turn movies_metadata.imdb_id into an integer key:
#   1. discard rows whose imdb_id is the string '0',
#   2. strip the 'tt' prefix and parse as a number (unparseable values become NaN),
#   3. discard the NaN rows, cast to int and drop keys that occur more than once,
#   4. cast 'id' to int so it can be joined with credits/keywords.
cleaned_movies_metadata = (
    movies_metadata
    .loc[movies_metadata['imdb_id'] != '0']
    .assign(imdb_id=lambda frame: pd.to_numeric(frame['imdb_id'].str.replace('tt', ''), errors='coerce'))
    .loc[lambda frame: frame['imdb_id'].notna()]
    .assign(imdb_id=lambda frame: frame['imdb_id'].astype(int))
    .drop_duplicates(subset=['imdb_id'], keep=False)
    .assign(id=lambda frame: frame['id'].astype(int))
)

# Default (inner) joins: metadata + credits + keywords on 'id', then genres on 'imdb_id'.
all_tables_merged = (
    cleaned_movies_metadata
    .merge(cleaned_credits, on='id')
    .merge(cleaned_keywords, on='id')
    .merge(cleaned_movie_genre, on='imdb_id')
)

print(all_tables_merged.shape)

all_tables_merged.to_csv('../data/preprocessed_data.csv', index=False)
# all_tables_merged.head()

(37504, 32)


# Removing columns that are not required

In [None]:
# prepared_data = all_tables_merged.copy()
# Start from the merged table saved to disk by the merge cell.
prepared_data = pd.read_csv('../data/preprocessed_data.csv')

# Columns listed under the "not required" heading above.
columns_to_drop = [
    'belongs_to_collection',
    'homepage',
    'poster_path',
    'video',
    'Poster',
    'original_title',
    'Title',
]

# Use the `columns=` keyword: passing the axis positionally (`drop(label, 1)`)
# was deprecated in pandas 1.3 and raises TypeError from pandas 2.0 on.
prepared_data = prepared_data.drop(columns=columns_to_drop)

print(prepared_data.shape)
prepared_data.to_csv('../data/preprocessed_data_2.csv', index=False)
# data_preparation.head()

# We will extract the top 2 actors, the top 2 production companies, and the director

In [17]:
def get_director(crew):
    """Return the name of the first crew entry whose job is 'Director'.

    Args:
        crew: list (or tuple) of dicts, each expected to carry 'job' and
            'name' keys.

    Returns:
        The 'name' of the first entry with ``job == 'Director'``, or None when
        ``crew`` is not a list/tuple, is empty, or holds no director entry.
    """
    # Same defensive type check as get_actors / get_production_companies:
    # anything that is not a sequence of entries (None, NaN, ...) yields None.
    if not isinstance(crew, (list, tuple)):
        return None
    for member in crew:
        # Skip malformed entries instead of raising on a missing 'job' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name')
    return None

def get_actors(cast):
    """Join the names of the first two cast entries with '|'.

    Args:
        cast: list of dicts, each with a 'name' key.

    Returns:
        'first|second' for two or more entries, the single name for one entry,
        '' for an empty list, and None when ``cast`` is not a list.
    """
    if not isinstance(cast, list):
        return None
    # Collect every name first (each entry must have 'name'), then keep two.
    names = []
    for member in cast:
        names.append(member['name'])
    return '|'.join(names[:2])

def get_production_companies(companies):
    """Join the names of the first two production companies with '|'.

    Args:
        companies: list of dicts, each with a 'name' key.

    Returns:
        'first|second' for two or more entries, the single name for one entry,
        '' for an empty list, and None when ``companies`` is not a list.
    """
    if not isinstance(companies, list):
        return None
    prod_companies = [company['name'] for company in companies]
    # Slicing covers every length, including the empty list ('' after join).
    # The former `elif prod_companies == 0` branch compared a list with an
    # int and could never be taken, so it is gone.
    return '|'.join(prod_companies[:2])


# Rows without a production_companies value are removed before parsing.
prepared_data.dropna(subset=['production_companies'], inplace=True)

# crew / cast / production_companies hold Python-literal strings; each value is
# parsed with literal_eval and then reduced by the matching extractor.
directors = prepared_data['crew'].apply(
    lambda raw_crew: get_director(literal_eval(raw_crew))
)
top_2_actors = prepared_data['cast'].apply(
    lambda raw_cast: get_actors(literal_eval(raw_cast))
)
top_2_companies = prepared_data['production_companies'].apply(
    lambda raw_companies: get_production_companies(literal_eval(raw_companies))
)

# print(directors)
# print(top_2_actors)
# print(top_2_companies)
for new_column, extracted_values in (
    ('director', directors),
    ('actors', top_2_actors),
    ('companies', top_2_companies),
):
    prepared_data[new_column] = extracted_values

prepared_data.to_csv('../data/preprocessed_data_3.csv', index=False)
# prepared_data.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,overview,popularity,production_companies,production_countries,...,vote_count,cast,crew,keywords,Imdb Link,IMDB Score,Genre,director,actors,companies
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,114709,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",...,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",http://www.imdb.com/title/tt114709,8.3,Animation|Adventure|Comedy,John Lasseter,Tom Hanks|Tim Allen,Pixar Animation Studios
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,113497,en,When siblings Judy and Peter discover an encha...,17.015539,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",http://www.imdb.com/title/tt113497,6.9,Action|Adventure|Family,Joe Johnston,Robin Williams|Jonathan Hyde,TriStar Pictures|Teitler Film
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,113228,en,A family wedding reignites the ancient feud be...,11.7129,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",http://www.imdb.com/title/tt113228,6.6,Comedy|Romance,Howard Deutch,Walter Matthau|Jack Lemmon,Warner Bros.|Lancaster Gate
3,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,114885,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",...,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",http://www.imdb.com/title/tt114885,5.7,Comedy|Drama|Romance,Forest Whitaker,Whitney Houston|Angela Bassett,Twentieth Century Fox Film Corporation
4,False,0,"[{'id': 35, 'name': 'Comedy'}]",11862,113041,en,Just when George Banks has recovered from his ...,8.387519,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",...,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",http://www.imdb.com/title/tt113041,5.9,Comedy|Family|Romance,Charles Shyer,Steve Martin|Diane Keaton,Sandollar Productions|Touchstone Pictures


In [3]:
# Columns kept for modelling.
included_columns = ['IMDB Score', 'overview', 'Genre', 'actors', 'runtime', 'director', 'budget', 'companies']
preprocessed_data_3 = pd.read_csv('../data/preprocessed_data_3.csv')

# Print the shape before and after removing rows that have any missing value.
data_to_use = preprocessed_data_3.loc[:, included_columns]
print(data_to_use.shape)
data_to_use = data_to_use.dropna(axis=0, how='any')
print(data_to_use.shape)
data_to_use.head()


(37502, 8)
(27382, 8)


Unnamed: 0,IMDB Score,overview,Genre,actors,runtime,director,budget,companies
0,8.3,"Led by Woody, Andy's toys live happily in his ...",Animation|Adventure|Comedy,Tom Hanks|Tim Allen,81.0,John Lasseter,30000000,Pixar Animation Studios
1,6.9,When siblings Judy and Peter discover an encha...,Action|Adventure|Family,Robin Williams|Jonathan Hyde,104.0,Joe Johnston,65000000,TriStar Pictures|Teitler Film
2,6.6,A family wedding reignites the ancient feud be...,Comedy|Romance,Walter Matthau|Jack Lemmon,101.0,Howard Deutch,0,Warner Bros.|Lancaster Gate
3,5.7,"Cheated on, mistreated and stepped on, the wom...",Comedy|Drama|Romance,Whitney Houston|Angela Bassett,127.0,Forest Whitaker,16000000,Twentieth Century Fox Film Corporation
4,5.9,Just when George Banks has recovered from his ...,Comedy|Family|Romance,Steve Martin|Diane Keaton,106.0,Charles Shyer,0,Sandollar Productions|Touchstone Pictures


In [4]:
def get_words(message, tokenizer = None):
    """Split ``message`` into tokens.

    Args:
        message: text to split.
        tokenizer: optional callable; when truthy it is called with
            ``message`` and the ``.text`` of each returned item is collected.

    Returns:
        A list of strings. Without a tokenizer the message is lower-cased and
        split on '|'; with a tokenizer the token texts are returned as-is.
    """
    if not tokenizer:
        return message.lower().split('|')
    return [token.text for token in tokenizer(message)]

def create_dictionary(messages, min_count = 5, tokenizer = None):
    """Map each sufficiently frequent token to a 1-based index.

    A token is counted at most once per message, so the count is the number
    of messages that contain it.

    Args:
        messages: iterable of texts.
        min_count: minimum number of messages a token must appear in to be kept.
        tokenizer: forwarded to ``get_words``.

    Returns:
        dict of token -> index, numbered 1, 2, ... in order of first appearance.
    """
    message_counts = {}
    for message in messages:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        for token in dict.fromkeys(get_words(message, tokenizer)):
            message_counts[token] = message_counts.get(token, 0) + 1

    frequent_tokens = [
        token for token, seen_in in message_counts.items() if seen_in >= min_count
    ]
    return {token: position for position, token in enumerate(frequent_tokens, start=1)}


def transform_text(messages, word_dictionary, tokenizer = None):
    """Encode ``messages`` as a matrix of token counts.

    Args:
        messages: sized iterable of texts (list, numpy array, pandas Series, ...).
        word_dictionary: dict of token -> 1-based column index, as produced by
            ``create_dictionary``.
        tokenizer: forwarded to ``get_words``.

    Returns:
        numpy float array of shape (len(messages), len(word_dictionary)) where
        entry [i, k] is how often the token with index k + 1 occurs in the
        i-th message. Tokens missing from ``word_dictionary`` are ignored.
    """
    words_arr = np.zeros((len(messages), len(word_dictionary)))
    # Iterate by position rather than `messages[i]`: on a pandas Series the
    # latter is a label lookup and fails when the index is not 0..n-1.
    for row, message in enumerate(messages):
        for word in get_words(message, tokenizer):
            column = word_dictionary.get(word)
            if column is not None:
                # Dictionary indices start at 1; matrix columns start at 0.
                words_arr[row, column - 1] += 1
    return words_arr


Split the data into train, validation and test sets

In [5]:
# Hold out 30% of the rows, then cut that hold-out in half for validation and test.
train_dataset, holdout_dataset = train_test_split(data_to_use, test_size=0.3, random_state = 0)
valid_dataset, test_dataset = train_test_split(holdout_dataset, test_size=0.5, random_state = 0)

# Give every split a fresh 0..n-1 index.
train_data, valid_data, test_data = (
    split.reset_index(drop=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

for split_frame in (train_data, valid_data, test_data):
    print(split_frame.shape)

for split_frame, csv_path in (
    (train_data, '../data/train.csv'),
    (valid_data, '../data/valid.csv'),
    (test_data, '../data/test.csv'),
):
    split_frame.to_csv(csv_path, index = False)

(19167, 8)
(4107, 8)
(4108, 8)


Now we can take the training data and transform it so that we can apply learning algorithms

In [6]:
train_data = pd.read_csv('../data/train.csv')


def _encode_column(column, min_count, tokenizer=None):
    """Fit a vocabulary on the training split of ``column`` and encode all splits.

    The vocabulary is built from ``train_data`` only; ``train_data``,
    ``valid_data`` and ``test_data`` are then encoded with it. The same
    ``tokenizer`` is used for building the vocabulary and for encoding, so
    the tokens looked up always match the tokens that were counted.

    Returns:
        (dictionary, train_matrix, valid_matrix, test_matrix)
    """
    dictionary = create_dictionary(
        train_data[column].astype(str).values, min_count=min_count, tokenizer=tokenizer
    )
    train_matrix, valid_matrix, test_matrix = (
        transform_text(split[column].astype(str).values, dictionary, tokenizer=tokenizer)
        for split in (train_data, valid_data, test_data)
    )
    return dictionary, train_matrix, valid_matrix, test_matrix


# Columns encoded without a tokenizer (get_words lower-cases and splits on '|').
(genre_dictionary,
 train_genre_matrix, valid_genre_matrix, test_genre_matrix) = _encode_column('Genre', min_count=5)

(director_dictionary,
 train_director_matrix, valid_director_matrix, test_director_matrix) = _encode_column('director', min_count=5)

(actors_dictionary,
 train_actors_matrix, valid_actors_matrix, test_actors_matrix) = _encode_column('actors', min_count=5)

(companies_dictionary,
 train_companies_matrix, valid_companies_matrix, test_companies_matrix) = _encode_column('companies', min_count=5)

# The overview column is tokenized with spaCy. The tokenizer must also be passed
# when encoding: previously only the vocabulary was built with it, while
# transform_text fell back to the '|' split, so overview tokens never lined up
# with the vocabulary.
tokenizer = en_core_web_sm.load(disable=["tagger", "parser","ner"])
(overview_dictionary,
 train_overview_matrix, valid_overview_matrix, test_overview_matrix) = _encode_column(
    'overview', min_count=20, tokenizer=tokenizer
)
print('Size of dictionary: ', len(overview_dictionary))



Size of dictionary:  4953


In [None]:
# Append one block of count columns per encoded feature; each block is labelled
# with the tokens of its vocabulary.
# NOTE(review): a token present in two vocabularies, or equal to an existing
# column name, would produce duplicate column labels - confirm this is acceptable.
train_feature_blocks = (
    (train_genre_matrix, genre_dictionary),
    (train_director_matrix, director_dictionary),
    (train_actors_matrix, actors_dictionary),
    (train_companies_matrix, companies_dictionary),
    (train_overview_matrix, overview_dictionary),
)
train_data = pd.concat(
    [train_data] + [
        pd.DataFrame(count_matrix, columns=vocabulary.keys())
        for count_matrix, vocabulary in train_feature_blocks
    ],
    axis=1,
)

train_data.to_csv('../data/train_final.csv', index = False)
# train_data.head()

In [7]:
# Append one block of count columns per encoded feature; each block is labelled
# with the tokens of its vocabulary.
# NOTE(review): a token present in two vocabularies, or equal to an existing
# column name, would produce duplicate column labels - confirm this is acceptable.
valid_feature_blocks = (
    (valid_genre_matrix, genre_dictionary),
    (valid_director_matrix, director_dictionary),
    (valid_actors_matrix, actors_dictionary),
    (valid_companies_matrix, companies_dictionary),
    (valid_overview_matrix, overview_dictionary),
)
valid_data = pd.concat(
    [valid_data] + [
        pd.DataFrame(count_matrix, columns=vocabulary.keys())
        for count_matrix, vocabulary in valid_feature_blocks
    ],
    axis=1,
)

valid_data.to_csv('../data/valid_final.csv', index = False)
# valid_data.head()

In [8]:
# Append one block of count columns per encoded feature; each block is labelled
# with the tokens of its vocabulary.
# NOTE(review): a token present in two vocabularies, or equal to an existing
# column name, would produce duplicate column labels - confirm this is acceptable.
test_feature_blocks = (
    (test_genre_matrix, genre_dictionary),
    (test_director_matrix, director_dictionary),
    (test_actors_matrix, actors_dictionary),
    (test_companies_matrix, companies_dictionary),
    (test_overview_matrix, overview_dictionary),
)
test_data = pd.concat(
    [test_data] + [
        pd.DataFrame(count_matrix, columns=vocabulary.keys())
        for count_matrix, vocabulary in test_feature_blocks
    ],
    axis=1,
)

test_data.to_csv('../data/test_final.csv', index = False)
# test_data.head()

We load the final data and continue working from there

In [3]:
# Reload the encoded training set from the CSV written by the train concat cell.
train_data = pd.read_csv('../data/train_final.csv')
# The validation and test sets are not loaded here; uncomment when needed.
# valid_data = pd.read_csv('../data/valid_final.csv')
# test_data = pd.read_csv('../data/test_final.csv')

In [4]:
# import sys

# orig_stdout = sys.stdout
# f = open('out.txt', 'w')
# sys.stdout = f

# for col in train_data.columns:
#     print(col)

# sys.stdout = orig_stdout
# f.close()
