In [None]:
pip install tmdbsimple

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tmdbsimple
  Downloading tmdbsimple-2.9.1-py3-none-any.whl (38 kB)
Installing collected packages: tmdbsimple
Successfully installed tmdbsimple-2.9.1


In [None]:
import tmdbsimple as tmdb

In [None]:
# requires api key that needs project url and personal info etc.
# not sure if i want to continue
# sticking with existing dataset for now...

# **Building corpus**

## Reading data

The dataset is taken from Kaggle: "Wikipedia Movie Plots" 
https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots


In [None]:
# if working in colab
# The Colab upload helper is only importable inside Google Colab. Guarding the
# import lets this cell run unchanged on a local kernel, where the CSV is
# expected to already sit next to the notebook.
import os

import pandas as pd

try:
    from google.colab import files
except ImportError:
    files = None

csv_path = "wiki_movie_plots_deduped.csv"

# Only prompt for an upload when we are in Colab and the file is not there yet,
# so re-running the cell does not ask for the dataset again.
if files is not None and not os.path.exists(csv_path):
    file = files.upload()
else:
    file = {}

df = pd.read_csv(csv_path)
df.head()

In [None]:
# If working in local:
# The CSV is read from the current working directory.
import pandas as pd

local_csv = "wiki_movie_plots_deduped.csv"
df = pd.read_csv(local_csv)
df.head()

In [None]:
df.columns

Index(['Release Year', 'Title', 'Origin/Ethnicity', 'Director', 'Cast',
       'Genre', 'Wiki Page', 'Plot'],
      dtype='object')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


# **Preprocessing**

Check the file structure. Note that there are null values in "Cast", and some other columns contain the placeholder value "unknown" (in varying capitalization: unknown, Unknown, ...). For now, null values are replaced with an empty string; the unknown values will be dealt with later.

In [None]:
# Text fields that make up a searchable document.
data_cols = ['Title', 'Origin/Ethnicity', 'Director', 'Cast', 'Genre', 'Plot']

# Lower-case every selected column, then turn missing values into empty strings.
# not sure if "unknown values" should be replaced with empty string...
collection = (
    df[data_cols]
    .apply(lambda column: column.str.lower())
    .fillna('')
)

In [None]:
# Count, per column, how many rows hold exactly the placeholder 'unknown'.
unknown = {}
for col in collection.columns:
    column_values = collection[col]
    placeholder_rows = column_values[column_values == 'unknown']
    unknown[col] = placeholder_rows.count()
print(unknown) # relatively small proportion.. probably won't affect the result too much

{'Title': 2, 'Origin/Ethnicity': 0, 'Director': 1124, 'Cast': 1, 'Genre': 6083, 'Plot': 0}


## Tokenization


In [None]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords

import re
import string # for punctuations


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Initialize a tokenizer
tokenizer = WhitespaceTokenizer()

In [None]:
# Tokenize columns

# Translation table that maps every ASCII punctuation character to a space.
# It is built once here instead of once per row inside the lambda.
punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, " "))


def _clean_text(text):
    """Replace newlines, possessive "'s" and punctuation with spaces."""
    return text.replace("\n", " ").replace("'s", " ").translate(punct_to_space)


tokenized_df = pd.DataFrame()
for col in collection.columns:
    # One '<column>_Tokenized' column per source field, holding a token list per row.
    tokenized_df[col + '_Tokenized'] = collection[col].apply(
        lambda text: tokenizer.tokenize(_clean_text(text)))

In [None]:
# Combining tokens from different fields
field_token_columns = [
    'Title_Tokenized',
    'Origin/Ethnicity_Tokenized',
    'Director_Tokenized',
    'Cast_Tokenized',
    'Genre_Tokenized',
    'Plot_Tokenized',
]
combined_tokens = tokenized_df[field_token_columns[0]]
for column_name in field_token_columns[1:]:
    # Adding two Series of lists concatenates the lists row by row.
    combined_tokens = combined_tokens + tokenized_df[column_name]
tokenized_df['Tokens'] = combined_tokens

In [None]:
tokenized_df.columns

Index(['Title_Tokenized', 'Origin/Ethnicity_Tokenized', 'Director_Tokenized',
       'Cast_Tokenized', 'Genre_Tokenized', 'Plot_Tokenized', 'Tokens'],
      dtype='object')

In [None]:
tokenized_df['Tokens'].apply(len)

0         90
1         96
2         83
3        162
4        152
        ... 
34881    618
34882     22
34883     79
34884    221
34885     63
Name: Tokens, Length: 34886, dtype: int64

In [None]:
# Removing stopwords
stop = stopwords.words('english')
# Testing membership against a list is linear in its length; a set makes each
# per-token lookup constant time without changing which tokens are dropped.
stop_lookup = set(stop)
tokenized_df['Tokens'] = tokenized_df['Tokens'].apply(
    lambda tokens: [tk for tk in tokens if tk not in stop_lookup])

In [None]:
tokenized_df['Tokens'].apply(len)

0         55
1         50
2         47
3         93
4         79
        ... 
34881    366
34882     18
34883     50
34884    137
34885     43
Name: Tokens, Length: 34886, dtype: int64

In [None]:
# len(set(stopwords.words('english')))

In [None]:
# Document - tokens dictionary (or df?)
# doc_tokens = dict()
#for i in range(len(tokenized_df)):
#    doc_tokens[i] = token 

In [None]:
# v2 using regular expressions
# save for later
# tokenized_df_v2 = pd.DataFrame()
# for col in collection.columns:
#     pre_tokenize = collection[col].apply(lambda row : row.replace("\n"," ").replace(
#         "'s"," ").translate(str.maketrans(dict.fromkeys(string.punctuation," "))))
#     new_col_name = col+'_Tokenized'
#     pre_tokenize = pre_tokenize.apply(lambda x: re.sub(r'[\W_]', ' ', x))
#     pre_tokenize = pre_tokenize.apply(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
#     pre_tokenize = pre_tokenize.apply(lambda x: re.sub(r'\^[a-zA-Z]\s+', ' ', x))
#     pre_tokenize = pre_tokenize.apply(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
#     pre_tokenize = pre_tokenize.apply(lambda x: re.sub(r'^b\s+', '', x))
#     tokenized = pre_tokenize.apply(lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x))
#     tokenized_df_v2[new_col_name] = tokenized

# re tokenizes strings so needs a different method to combine all the tokens together

## Stemming

In [None]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [None]:
def _stem_tokens(tokens):
    """Return the stemmed form of every token in a document's token list."""
    return [stemmer.stem(token) for token in tokens]


# Same length and order as 'Tokens'; each entry replaced by its stem.
tokenized_df['Stemmed'] = tokenized_df['Tokens'].apply(_stem_tokens)

In [None]:
tokenized_df['Stemmed'].apply(len)

0         55
1         50
2         47
3         93
4         79
        ... 
34881    366
34882     18
34883     50
34884    137
34885     43
Name: Stemmed, Length: 34886, dtype: int64

In [None]:
tokenized_df['Stemmed'][:2]

0    [kansa, saloon, smasher, american, unknown, un...
1    [love, light, moon, american, unknown, unknown...
Name: Stemmed, dtype: object

In [None]:
tokenized_df

Unnamed: 0,Title_Tokenized,Origin/Ethnicity_Tokenized,Director_Tokenized,Cast_Tokenized,Genre_Tokenized,Plot_Tokenized,Tokens,Stemmed
0,"[kansas, saloon, smashers]",[american],[unknown],[],[unknown],"[a, bartender, is, working, at, a, saloon, ser...","[kansas, saloon, smashers, american, unknown, ...","[kansa, saloon, smasher, american, unknown, un..."
1,"[love, by, the, light, of, the, moon]",[american],[unknown],[],[unknown],"[the, moon, painted, with, a, smiling, face, h...","[love, light, moon, american, unknown, unknown...","[love, light, moon, american, unknown, unknown..."
2,"[the, martyred, presidents]",[american],[unknown],[],[unknown],"[the, film, just, over, a, minute, long, is, c...","[martyred, presidents, american, unknown, unkn...","[martyr, presid, american, unknown, unknown, f..."
3,"[terrible, teddy, the, grizzly, king]",[american],[unknown],[],[unknown],"[lasting, just, 61, seconds, and, consisting, ...","[terrible, teddy, grizzly, king, american, unk...","[terribl, teddi, grizzli, king, american, unkn..."
4,"[jack, and, the, beanstalk]",[american],"[george, s, fleming, edwin, s, porter]",[],[unknown],"[the, earliest, known, adaptation, of, the, cl...","[jack, beanstalk, american, george, fleming, e...","[jack, beanstalk, american, georg, fleme, edwi..."
...,...,...,...,...,...,...,...,...
34881,"[the, water, diviner]",[turkish],"[director, russell, crowe]","[director, russell, crowe, cast, russell, crow...",[unknown],"[the, film, begins, in, 1919, just, after, wor...","[water, diviner, turkish, director, russell, c...","[water, divin, turkish, director, russel, crow..."
34882,"[çalgı, çengi, i̇kimiz]",[turkish],"[selçuk, aydemir]","[ahmet, kural, murat, cemcir]",[comedy],"[two, musicians, salih, and, gürkan, described...","[çalgı, çengi, i̇kimiz, turkish, selçuk, aydem...","[çalgı, çengi, i̇kimiz, turkish, selçuk, aydem..."
34883,"[olanlar, oldu]",[turkish],"[hakan, algül]","[ata, demirer, tuvana, türkay, ülkü, duru]",[comedy],"[zafer, a, sailor, living, with, his, mother, ...","[olanlar, oldu, turkish, hakan, algül, ata, de...","[olanlar, oldu, turkish, hakan, algül, ata, de..."
34884,"[non, transferable]",[turkish],"[brendan, bradley]","[youtubers, shanna, malcolm, shira, lazar, sar...","[romantic, comedy]","[the, film, centres, around, a, young, woman, ...","[non, transferable, turkish, brendan, bradley,...","[non, transfer, turkish, brendan, bradley, you..."


# **Building index & tf-idf weights**

## Building inverted index & frequency counters

Tokens from all fields in the dataframe are used at the moment. We might consider separating out specific fields later.

In [None]:
from collections import Counter

In [None]:
# Building word-count dictionary for each document - "bag of words"
# Maps each row label of tokenized_df to a Counter of its stemmed tokens.
doc_tokens_dict = {
    doc_id: Counter(stems)
    for doc_id, stems in zip(tokenized_df.index, tokenized_df['Stemmed'])
}

In [None]:
# Collection-wide term counts: the sum of all per-document counters.
vocab = Counter()
for ctr in doc_tokens_dict.values():
    # update() adds the counts in place in time proportional to len(ctr).
    # `vocab += ctr` gives the same totals here (all counts are positive) but,
    # in CPython, rescans the whole vocabulary on every iteration to drop
    # non-positive entries, which makes the loop needlessly slow.
    vocab.update(ctr)

In [None]:
# number of distinct words in the collection
len(vocab)

125652

In [None]:
sorted(vocab.items(), key=lambda x: x[1],reverse=True)[0:10]

[('kill', 33940),
 ('get', 33557),
 ('find', 32768),
 ('take', 30952),
 ('one', 30675),
 ('tell', 26790),
 ('love', 25181),
 ('leav', 25077),
 ('father', 23811),
 ('back', 22968)]

In [None]:
# Storing the term-freq in the dataframe too
# doc_tokens_dict was built in tokenized_df row order, so attach the row labels
# explicitly instead of relying on a default 0..N-1 index lining up on assignment.
tokenized_df['Term_Freq'] = pd.Series(
    list(doc_tokens_dict.values()), index=tokenized_df.index)

In [None]:
# Creating inverted index
# Storing BOTH num of occurred document + the document id for now

# term -> number of documents containing the term (document frequency)
inverted_index = dict()

# storing the document id in case of future use
# term -> list of document positions containing the term
ii_with_ids = dict()

for doc_pos, term_counts in enumerate(doc_tokens_dict.values()):
    for term in term_counts:
        inverted_index[term] = inverted_index.get(term, 0) + 1
        ii_with_ids.setdefault(term, []).append(doc_pos)
            
    

In [None]:
# top 10 (stemmed) words that appeared in the most documents
sorted(inverted_index.items(), key=lambda x: x[1],reverse=True)[0:10]

[('american', 18014),
 ('one', 16434),
 ('take', 16430),
 ('find', 16086),
 ('get', 15724),
 ('leav', 13229),
 ('love', 13087),
 ('tri', 12814),
 ('back', 12606),
 ('make', 12569)]

## Calculating tf-idf weights

As mentioned in HW1: let the `tfidf` of term _t_ in document _d_ be:
```
tfidf(t, d) = log(count(t, d) + 1) * log(N / df(t))
```

In [None]:
import numpy as np

In [None]:
N_doc = len(tokenized_df)

In [None]:
# document position -> {term: log(count + 1) * log(N / df(term))}
tf_idf_dict = {
    doc_pos: {
        term: np.log(count + 1) * np.log(N_doc / inverted_index[term])
        for term, count in term_counts.items()
    }
    for doc_pos, term_counts in enumerate(doc_tokens_dict.values())
}

In [None]:
# Storing the tf-idf values in the dataframe too
# tf_idf_dict is keyed by document position in tokenized_df row order, so attach
# the row labels explicitly instead of relying on a default 0..N-1 index.
tokenized_df['tfidf'] = pd.Series(
    list(tf_idf_dict.values()), index=tokenized_df.index)

In [None]:
tokenized_df.head(1)

# **Create query functions**

Creating BM25 scores for each document in each query.

For each term i in each query, compute: <br>
$log\frac{(r_i+0.5)/(R-r_i+0.5)}{(n_i-r_i+0.5)/(N-n_i-R+r_i+0.5)}\cdot \frac{(k_1+1)f_i}{K+f_i}\cdot\frac{(k_2+1)qf_i}{(k_2+qf_i)}$ <br>
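where $K = k_1\left((1-b)+b\cdot\frac{\text{doc length}}{\text{avg doc length}}\right)$ and $k_1=1.2$, $k_2=1000$, $b=0.75$. <br>
Since no relevance judgments are available at query time, $R = r_i = 0$, so the first factor reduces to $\log\frac{N-n_i+0.5}{n_i+0.5}$. <br>
Adding 1 inside the log avoids negative values for terms with very high document frequency.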

In [None]:
# Computing length of each document and the average length
# 'Length' is the number of entries in the 'Tokens' list of each row, i.e. the
# combined field tokens that remain after stop-word removal.
tokenized_df['Length'] = tokenized_df['Tokens'].apply(len)
# Mean of 'Length' over all rows; start_query divides each document's length by
# this value when computing the BM25 normalizer K.
avg_dl = tokenized_df['Length'].mean()

In [None]:
# BM25 parameters for documents scoring
# k1: used in the document term-frequency factor ((k1 + 1) * f) / (K + f) and as
#     the scale of the length normalizer K in start_query.
k1 = 1.2
# b: weight of the (doc_length / avg_dl) ratio inside K; (1 - b) is the
#    length-independent share.
b = 0.75
# k2: used in the query term-frequency factor ((k2 + 1) * qf) / (k2 + qf).
k2 = 1000

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import numpy as np

stop_words = set(stopwords.words('english'))

def _normalize_query(text):
  """Turn one raw query string into a list of stemmed, stop-word-free terms.

  The steps mirror the corpus preprocessing so that query terms can match
  indexed terms: lower-case, replace newlines, possessive "'s" and ASCII
  punctuation with spaces, tokenize, drop English stop words, then stem.
  """
  cleaned = text.lower().replace("\n", " ").replace("'s", " ")
  # Map punctuation to spaces (not to nothing) so "spider-man" yields two
  # tokens, the same way the document fields were split.
  cleaned = cleaned.translate(str.maketrans(dict.fromkeys(string.punctuation, " ")))
  kept = [w for w in word_tokenize(cleaned) if w not in stop_words]
  return [stemmer.stem(token) for token in kept]


def _bm25_term_score(n_i, fi, qfi, doc_length):
  """BM25 contribution of one query term to one document.

  n_i is the term's document frequency, fi its count in the document, qfi its
  count in the query and doc_length the document's token count.
  """
  K = k1 * ((1 - b) + b * (doc_length / avg_dl))
  # Only the IDF factor goes inside the log; the +1 keeps it positive even for
  # terms that occur in more than half of the documents.
  idf = np.log(1 + (N_doc - n_i + 0.5) / (n_i + 0.5))
  tf_part = ((k1 + 1) * fi) / (K + fi)
  qf_part = ((k2 + 1) * qfi) / (k2 + qfi)
  return idf * tf_part * qf_part


def start_query(query, top_k=10):
  """Rank all documents with BM25 for each query and print the best matches.

  Parameters:
    query: iterable of raw query strings; a single string is treated as one query.
    top_k: number of results printed per query (default 10).

  For every query, the title, score and plot of the top_k highest scoring
  documents are printed. Documents that share no term with the query score 0.
  Nothing is returned.
  """
  if isinstance(query, str):
    # Iterating a bare string would treat every character as its own query.
    query = [query]

  # Look the columns up once instead of once per document and term.
  term_freqs = tokenized_df['Term_Freq']
  doc_lengths = tokenized_df['Length']

  for ele, raw_query in enumerate(query):
    wordcount_q = Counter(_normalize_query(raw_query))

    result = {}
    for i in range(N_doc):
      doc_tf = term_freqs[i]
      score = 0
      for term, qfi in wordcount_q.items():
        if term in doc_tf:
          score = score + _bm25_term_score(
              inverted_index[term], doc_tf[term], qfi, doc_lengths[i])
      result[i] = score

    # sorted() is stable, so ties keep document order.
    ranked = sorted(result.items(), key=lambda x: x[1], reverse=True)[:top_k]

    print('------------------------' + 'query ' + str(ele + 1) + '------------------------')
    for doc_id, doc_score in ranked:
      title_name = "《" + str(df['Title'][doc_id]) + "》 The score: " + str(doc_score)
      print(title_name)
      print(df['Plot'][doc_id])
      print('\n')
    print("\n")

# **Start query**

In [None]:
# You can run an internal query in this notebook, or upload a txt file with query statements separated by blank lines.

In [None]:
import pandas as pd

# The upload helper only exists inside Google Colab. Outside Colab, mode 2
# falls back to reading a query.txt that is already in the working directory.
try:
  from google.colab import files
except ImportError:
  files = None

query = []
while True:
  # Compare the answer as text: int() would raise on input such as "a".
  mode = input("1:internal query || 2:uploading txt file    ").strip()
  if mode == '1':
    while True:
      single_query = input("Please input your query:    ")
      # Skip blank input so no empty query gets scored.
      if single_query.strip():
        query.append(single_query)
      finish_or_not = input("Continue? Y/N    ")
      if finish_or_not != 'Y':
        break
    break
  elif mode == '2':
    query_path = 'query.txt'
    if files is not None:
      txt_file = files.upload()
      # files.upload() returns a dict keyed by the uploaded file names; use the
      # uploaded name when the file was not called query.txt.
      if txt_file and query_path not in txt_file:
        query_path = next(iter(txt_file))
    with open(query_path, encoding='utf-8') as fp:
      # Queries are separated by blank lines; drop empty chunks.
      query = [p.strip() for p in fp.read().split('\n\n') if p.strip()]
    break
  else:
    print("Warning: you haven't entered any queries! Do you need to go back and re-enter? Y/N    ")
    warning = input()
    if warning != 'Y':
      break

print("\n")
start_query(query)

# **Evaluation**

In [None]:
# Calculate MAP@10

from pandas.core.common import count_not_none
# Manual relevance judgments: query number -> relevance (1/0) of each of the
# top-10 returned results, in rank order.
qrels = {
        1:[1, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        2:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        3:[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        4:[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        5:[0, 0, 1, 1, 1, 1, 0, 0, 1, 0],
        6:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        7:[0, 1, 1, 1, 1, 1, 1, 1, 0, 1],
        8:[1, 0, 1, 0, 1, 1, 1, 0, 0, 0]
}


def average_precision(relevance):
  """Average precision of one ranked result list.

  relevance is a sequence of 0/1 judgments in rank order. The precision at
  each relevant rank is averaged over the number of relevant results in the
  list. A list with no relevant result scores 0.
  """
  hits = 0
  precision_sum = 0
  for rank, rel in enumerate(relevance, start=1):
    if rel != 0:
      hits = hits + 1
      precision_sum = precision_sum + hits / rank
  if hits == 0:
    return 0
  return precision_sum / hits


# Mean of the per-query average precision. The running total no longer uses the
# name `all`, which shadowed the builtin for the rest of the kernel session.
ap_total = sum(average_precision(v) for v in qrels.values())
final = ap_total / len(qrels) if qrels else 0.0
print('The accuracy: ' + str(final))

The accuracy: 0.8276128472222222
