##### Libraries

In [1]:
# Standard library
import json
import os
import string
from string import digits

# For text preprocessing
import nltk
from nltk.corpus import stopwords

# Generics
import numpy as np
import pandas as pd

##### Utils

In [2]:
# Shared NLTK helpers, created once here and looked up as globals by the
# preprocessing functions defined in the next cell.
# NOTE(review): WordNetLemmatizer and stopwords.words presumably require the
# 'wordnet' and 'stopwords' NLTK data packages to be installed — confirm.
lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()  # not referenced by any visible cell
stop_words = stopwords.words('english')  # English stop words used by preprocess_doc

##### Functions

In [3]:
def preprocess_word(word):
    """
    INPUT : String (a single token)
    OUTPUT: String, the stem of the lemmatized token

    Normalizes one word in two chained steps:
    - Lemmatization, through the module-level `lemmatizer`
    - Stemming of the resulting lemma, through the module-level `stemmer`
    """
    return stemmer.stem(lemmatizer.lemmatize(word))

def preprocess_doc(document):
    """
    INPUT : String
    OUTPUT: List of strings

    Lower-case the document, split it on whitespace, remove ASCII
    punctuation from every word, discard stop words and return the
    remaining words after `preprocess_word` has been applied to each.

    A word is treated as a stop word when either its punctuation-free
    form or its form with only the surrounding punctuation trimmed is in
    `stop_words`. The second check is what removes contractions such as
    "don't": the stop list stores them with the apostrophe, so comparing
    only the punctuation-free "dont" would never match.
    Only the ASCII characters of string.punctuation are handled;
    typographic quotes are left untouched.
    """
    # Translation table that deletes punctuation and maps nothing else.
    strip_punct = str.maketrans('', '', string.punctuation)
    # Set membership is O(1); the module-level stop_words is a list.
    blocked = set(stop_words)

    cleaned_words = []
    for raw in document.lower().split():
        bare = raw.translate(strip_punct)
        if not bare:
            # The token consisted of punctuation only (e.g. "--").
            continue
        if bare in blocked or raw.strip(string.punctuation) in blocked:
            continue
        cleaned_words.append(preprocess_word(bare))
    return cleaned_words

def get_first_genre(s):
    """
    INPUT : List of strings (the genres of one movie)
    OUTPUT: String

    Returns the first element of the list, or 'unknown' when the list
    is empty, instead of raising IndexError.
    """
    if len(s) == 0:
        return 'unknown'
    return s[0]


def parse_string_to_json(s):
    """
    INPUT : A string that resembles a JSON doc (or a dictionary)
    OUTPUT: A list of the values of the dictionary

    Parse a string into a JSON object and return its values as a list.
    Missing input (None, NaN or a blank string) and an empty JSON object
    all yield ['unknown'], so the returned list is never empty.
    Malformed JSON still raises json.JSONDecodeError.
    """
    # A missing cell may arrive as float NaN; NaN is the only float that
    # is not equal to itself.
    is_missing = s is None or (isinstance(s, float) and s != s)
    if is_missing or (isinstance(s, str) and not s.strip()):
        return ['unknown']

    values = list(json.loads(s).values())
    return values if values else ['unknown']


##### Read the dataframes

In [4]:
# Input files (paths are relative to the working directory)
filename         = './data/movies/plot_summaries.txt'
movie_names_file = './data/movies/movie.metadata.tsv'

# Both files are read with header=None, so the column names are supplied here
summary_headers = ['Wiki_ID', 'Plot']
movie_headers   = [
    'Wiki_ID', 'Freebase_ID', 'Name', 'Release_Date', 'box_office',
    'runtime', 'languages', 'countries', 'genres',
]

# Load the two tab-separated files
df_movie    = pd.read_csv(movie_names_file, sep='\t', header=None, names=movie_headers)
df_filename = pd.read_csv(filename, sep='\t', header=None, names=summary_headers)

##### Merge the 2 dataframes on the Wiki_ID attribute

In [5]:
# Join plots and metadata on Wiki_ID (default inner join) and keep only
# Wiki_ID, Plot, Name and genres
cols_to_drop = ['Freebase_ID', 'Release_Date', 'box_office', 'runtime', 'languages', 'countries']
df = df_filename.merge(df_movie, on='Wiki_ID').drop(columns=cols_to_drop)

# Overwrite the original ids with consecutive integers starting at 0
df['Wiki_ID'] = list(range(len(df)))

df.head()

Unnamed: 0,Wiki_ID,Plot,Name,genres
0,0,"Shlykov, a hard-working taxi driver and Lyosha...",Taxi Blues,"{""/m/07s9rl0"": ""Drama"", ""/m/03q4nz"": ""World ci..."
1,1,The nation of Panem consists of a wealthy Capi...,The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":..."
2,2,Poovalli Induchoodan is sentenced for six yea...,Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""..."
3,3,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""..."
4,4,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."


##### Remove movies whose plot is too short

In [6]:
# Plot length, measured in characters
df['len'] = df['Plot'].map(len)

# Cut-off: the median plot length (0.5 quantile), truncated to an integer
lower_length_bound = int(df['len'].quantile(.5))

# Keep the plots that are at least as long as the cut-off and renumber the rows
df = df[df['len'] >= lower_length_bound].reset_index(drop=True)

df.head()

Unnamed: 0,Wiki_ID,Plot,Name,genres,len
0,1,The nation of Panem consists of a wealthy Capi...,The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",4559
1,2,Poovalli Induchoodan is sentenced for six yea...,Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""...",3099
2,3,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",4917
3,4,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",2425
4,5,The president is on his way to give a speech. ...,End Game,"{""/m/01jfsb"": ""Thriller"", ""/m/03btsm8"": ""Actio...",1937


In [7]:
# Renumber Wiki_ID with consecutive integers starting at 0
df['Wiki_ID'] = np.arange(len(df))

##### Apply preprocessing to the movie plots to obtain a clean corpus

In [8]:
# One list of cleaned words per plot
df['Corpus'] = df['Plot'].map(preprocess_doc)
df.head()

Unnamed: 0,Wiki_ID,Plot,Name,genres,len,Corpus
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,"{""/m/03btsm8"": ""Action/Adventure"", ""/m/06n90"":...",4559,"[nation, panem, consist, wealthi, capitol, twe..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,"{""/m/04t36"": ""Musical"", ""/m/02kdv5l"": ""Action""...",3099,"[pooval, induchoodan, sentenc, six, year, pris..."
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,"{""/m/06qm3"": ""Screwball comedy"", ""/m/01z4y"": ""...",4917,"[lemon, drop, kid, new, york, citi, swindler, ..."
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...",2425,"[seventhday, adventist, church, pastor, michae..."
4,4,The president is on his way to give a speech. ...,End Game,"{""/m/01jfsb"": ""Thriller"", ""/m/03btsm8"": ""Actio...",1937,"[presid, way, give, speech, travel, man, show,..."


##### Fix the genres attribute by parsing it as JSON and extracting the individual genres

In [9]:
# Keep the complete genre list in genres_all and overwrite genres with its first entry
df['genres_all'] = df['genres'].map(parse_string_to_json)
df['genres']     = df['genres_all'].map(get_first_genre)
df.head()

Unnamed: 0,Wiki_ID,Plot,Name,genres,len,Corpus,genres_all
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,Action/Adventure,4559,"[nation, panem, consist, wealthi, capitol, twe...","[Action/Adventure, Science Fiction, Action, Dr..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,Musical,3099,"[pooval, induchoodan, sentenc, six, year, pris...","[Musical, Action, Drama, Bollywood]"
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,Screwball comedy,4917,"[lemon, drop, kid, new, york, citi, swindler, ...","[Screwball comedy, Comedy]"
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,Crime Fiction,2425,"[seventhday, adventist, church, pastor, michae...","[Crime Fiction, Drama, Docudrama, World cinema..."
4,4,The president is on his way to give a speech. ...,End Game,Thriller,1937,"[presid, way, give, speech, travel, man, show,...","[Thriller, Action/Adventure, Action, Drama]"


In [10]:
# Preview the first five rows of the frame
df.head()

Unnamed: 0,Wiki_ID,Plot,Name,genres,len,Corpus,genres_all
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,Action/Adventure,4559,"[nation, panem, consist, wealthi, capitol, twe...","[Action/Adventure, Science Fiction, Action, Dr..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,Musical,3099,"[pooval, induchoodan, sentenc, six, year, pris...","[Musical, Action, Drama, Bollywood]"
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,Screwball comedy,4917,"[lemon, drop, kid, new, york, citi, swindler, ...","[Screwball comedy, Comedy]"
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,Crime Fiction,2425,"[seventhday, adventist, church, pastor, michae...","[Crime Fiction, Drama, Docudrama, World cinema..."
4,4,The president is on his way to give a speech. ...,End Game,Thriller,1937,"[presid, way, give, speech, travel, man, show,...","[Thriller, Action/Adventure, Action, Drama]"


In [11]:
# DataFrame.drop returns a new frame, so the result has to be assigned back;
# without the assignment the helper 'len' column is still in df when it is pickled.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df = df.drop(columns=['len'], errors='ignore')
df

Unnamed: 0,Wiki_ID,Plot,Name,genres,Corpus,genres_all
0,0,The nation of Panem consists of a wealthy Capi...,The Hunger Games,Action/Adventure,"[nation, panem, consist, wealthi, capitol, twe...","[Action/Adventure, Science Fiction, Action, Dr..."
1,1,Poovalli Induchoodan is sentenced for six yea...,Narasimham,Musical,"[pooval, induchoodan, sentenc, six, year, pris...","[Musical, Action, Drama, Bollywood]"
2,2,"The Lemon Drop Kid , a New York City swindler,...",The Lemon Drop Kid,Screwball comedy,"[lemon, drop, kid, new, york, citi, swindler, ...","[Screwball comedy, Comedy]"
3,3,Seventh-day Adventist Church pastor Michael Ch...,A Cry in the Dark,Crime Fiction,"[seventhday, adventist, church, pastor, michae...","[Crime Fiction, Drama, Docudrama, World cinema..."
4,4,The president is on his way to give a speech. ...,End Game,Thriller,"[presid, way, give, speech, travel, man, show,...","[Thriller, Action/Adventure, Action, Drama]"
...,...,...,...,...,...,...
21101,21101,Lucy is working as a dancer in a sleazy strip...,I Don't Want to Be Born,Horror,"[luci, work, dancer, sleazi, strip, joint, sta...",[Horror]
21102,21102,Twenty-something Eun-mo listens to a taxi driv...,Paju,Romantic drama,"[twentysometh, eunmo, listen, taxi, driver, dr...","[Romantic drama, Romance Film, Drama, World ci..."
21103,21103,"In 1928 Hollywood, director Leo Andreyev look...",The Last Command,Silent film,"[1928, hollywood, director, leo, andreyev, loo...","[Silent film, Indie, Black-and-white, Period p..."
21104,21104,"Abdur Rehman Khan , a middle-aged dry fruit se...",Kabuliwala,Drama,"[abdur, rehman, khan, middleag, dri, fruit, se...",[Drama]


##### Serialize the dataframe to a pickle file

In [12]:
path = "./preprocessed/"

# to_pickle does not create missing directories, so make sure the target exists first
os.makedirs(path, exist_ok=True)
df.to_pickle(os.path.join(path, "df_preprocessed.pkl"))