In [None]:
!pip install --upgrade --no-cache-dir gdown

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!gdown --id 1agvSmbqNHbrWZ64dxInod04AV6phmamH

Downloading...
From: https://drive.google.com/uc?id=1agvSmbqNHbrWZ64dxInod04AV6phmamH
To: /content/wiki_movie_plots_deduped.csv
100% 81.2M/81.2M [00:00<00:00, 103MB/s]


In [None]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!nvidia-smi 

Thu Feb 23 03:12:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    29W /  70W |   2510MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pandas as pd
import time
from tqdm import tqdm
import seaborn as sns
import numpy as np
from textblob import TextBlob
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('msmarco-distilbert-base-dot-prod-v3')

## Load our cross-encoder. Use fast tokenizer to speed up the tokenization
from sentence_transformers import CrossEncoder
cross_model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6', max_length=512)

In [None]:
data = pd.read_csv('/content/wiki_movie_plots_deduped.csv',memory_map=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34886 entries, 0 to 34885
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Release Year      34886 non-null  int64 
 1   Title             34886 non-null  object
 2   Origin/Ethnicity  34886 non-null  object
 3   Director          34886 non-null  object
 4   Cast              33464 non-null  object
 5   Genre             34886 non-null  object
 6   Wiki Page         34886 non-null  object
 7   Plot              34886 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.1+ MB


In [None]:
import nltk
from nltk import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

True

In [None]:
# this function returns a list of tokenized and stemmed words of any text
def get_tokenized_list(doc_text):
    tokens = nltk.word_tokenize(doc_text)
    return tokens

# This function will performing stemming on tokenized words
def word_stemmer(token_list):
  ps = nltk.stem.PorterStemmer()
  stemmed = []
  for words in token_list:
    stemmed.append(ps.stem(words))
  return stemmed

# Function to remove stopwords from tokenized word list
stop_words = set(stopwords.words('english'))

def remove_stopwords(doc_text):
  cleaned_text = []
  for words in doc_text:
    if words not in stop_words:
      cleaned_text.append(words)
  return cleaned_text

def lemmatizer(lem_text):
    wordnet_lemmatizer = WordNetLemmatizer() 
    lematizer = []
    for word in lem_text:
        lematizer.append(wordnet_lemmatizer.lemmatize(word))
    return lematizer

In [None]:
cleaned_corpus = []
token_list = []

for doc in data['Plot']:
  tokens = get_tokenized_list(doc)
  token_list.append(tokens)

  doc_text = remove_stopwords(tokens)
  doc_text  = word_stemmer(doc_text)
  doc_text = lemmatizer(doc_text)
  doc_text = ' '.join(doc_text)
  cleaned_corpus.append(doc_text)

In [None]:
data['Tokenized'] =  cleaned_corpus

In [None]:
data.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,Tokenized
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...","a bartend work saloon , serv drink custom . af..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...","the moon , paint smile face hang park night . ..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...","the film , minut long , compos two shot . in f..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,"last 61 second consist two shot , first shot s..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,"the earliest known adapt classic fairytal , fi..."


In [None]:
import gc

df_token = data[['Title','Tokenized']]

gc.collect()

33

In [None]:
df_token.dropna(inplace=True)
df_token.drop_duplicates(subset=['Tokenized'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [None]:
!pip install faiss-gpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import faiss
encoded_data = model.encode(df_token.Tokenized.tolist())
encoded_data = np.asarray(encoded_data.astype('float32'))
index = faiss.IndexIDMap(faiss.IndexFlatIP(768))
index.add_with_ids(encoded_data, np.array(range(0, len(df_token))))
faiss.write_index(index, 'movie_plot.index')

In [None]:
encoded_data.shape

(33869, 768)

In [None]:
def fetch_movie_info(dataframe_idx, score):
    info = df_token.iloc[dataframe_idx]
    meta_dict = {}
    meta_dict['Title'] = info['Title']
    meta_dict['Tokenized'] = info['Tokenized']
    meta_dict['Score'] = score
    return meta_dict

def search(query, top_k, index, model):
    t=time.time()
    query_vector = model.encode([query])
    # top_score = util.cos_sim(query_vector)
    top_k = index.search(query_vector, top_k)
    print('>>>> Results in Total Time: {}'.format(time.time()-t))
    top_k_ids = top_k[1].tolist()[0]
    top_k_ids = list(np.unique(top_k_ids))
    top_k_score = top_k[0].tolist()[0]
    top_k_score = list(np.unique(top_k_score))
    results =  [fetch_movie_info(idx,score) for idx,score in zip(top_k_ids,top_k_score)]

    return results

In [None]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

def cross_score(model_inputs):
    scores = cross_model.predict(model_inputs)
    return scores

def cross_ranked(query, results):
  model_inputs = [[query,item['Tokenized']] for item in results]
  scores = cross_score(model_inputs)
  #Sort the scores in decreasing order
  ranked_results = [{'Title': inp['Title'], 'Score': score} for inp, score in zip(results, scores)]
  ranked_results = sorted(ranked_results, key=lambda x: x['Score'], reverse=True)
  
  return ranked_results

In [None]:
!pip install bert-score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from bert_score import score

def bert_precision(query, results):

  ref=[query]

  ranked_results_bert = []

  for cand in results:
      P, R, F1 = score([cand['Tokenized']], ref, lang='en')
      ranked_results_bert.append({'Title': cand['Title'], 'Score': P.numpy()[0]})

  #Sort the scores in decreasing order
  ranked_results_bert = sorted(ranked_results_bert, key=lambda x: x['Score'], reverse=True)
  
  return ranked_results_bert

In [None]:
import faiss

In [None]:
from pprint import pprint

query="Artificial Intelligence based action movie"
results=search(query, top_k=15, index=index, model=model)
ranked_results = cross_ranked(query, results)
ranked_results_bert_P = bert_precision(query, results)

>>>> Results in Total Time: 0.047153472900390625


Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaM

In [None]:
final_results = pd.DataFrame()
final_results['cross_encoder'] = [[item['Title'], item['Score']] for item in ranked_results]
final_results['Precision'] = [[item['Title'], item['Score']] for item in ranked_results_bert_P]
final_results

Unnamed: 0,cross_encoder,Precision
0,"[The Kentucky Fried Movie, 0.00040342272]","[Parasuram, 0.7586428]"
1,"[The Ugly Swans, 0.00031254787]","[The Ugly Swans, 0.73996365]"
2,"[ G-Force, 0.00027860518]","[Indian, 0.7306156]"
3,"[Twist, 0.00027031262]","[Twist, 0.7286649]"
4,"[Indian, 0.00018003744]","[Killers from Space, 0.71564215]"
5,"[Robot Overlords, 0.00017212014]","[The Kentucky Fried Movie, 0.7144465]"
6,"[Starship Invasions, 0.00017109059]","[Black Sunday, 0.71343756]"
7,"[The Day the Earth Stood Still, 0.00016997707]","[Ra.One, 0.7011993]"
8,"[Parasuram, 0.00016953824]","[Nemesis, 0.699366]"
9,"[Species II, 0.00016822152]","[Starship Invasions, 0.69888073]"


In [None]:
from pprint import pprint

query="movie about romance and pain of separation"
results=search(query, top_k=15, index=index, model=model)
ranked_results = cross_ranked(query, results)
ranked_results_bert_P = bert_precision(query, results)


>>>> Results in Total Time: 0.054932594299316406


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaM

In [None]:
final_results = pd.DataFrame()
final_results['cross_encoder'] = [[item['Title'], item['Score']] for item in ranked_results]
final_results['Precision'] = [[item['Title'], item['Score']] for item in ranked_results_bert_P]
final_results

Unnamed: 0,cross_encoder,Precision
0,"[Brothers, 0.00027461487]","[Brothers, 0.82244766]"
1,"[Aandhali Koshimbir, 0.000175142]","[Samooham, 0.796643]"
2,"[Strings of Passion, 0.00017335973]","[Strings of Passion, 0.7666638]"
3,"[Camel Safari, 0.00017219286]","[Cold Heaven, 0.75747937]"
4,"[Paanch Adhyay, 0.00016943918]","[Paanch Adhyay, 0.75440115]"
5,"[2046, 0.00016937974]","[Swapaanam, 0.7535254]"
6,"[Obaltan, 0.00016630845]","[You and I, 0.7501334]"
7,"[Samooham, 0.00016377438]","[Camel Safari, 0.7445241]"
8,"[U Me Aur Hum, 0.00016309899]","[Kannum Kannum, 0.7349776]"
9,"[Cold Heaven, 0.00016250908]","[Aandhali Koshimbir, 0.719308]"


In [None]:
from pprint import pprint

query="post apocalyptic movies"
results=search(query, top_k=15, index=index, model=model)
ranked_results = cross_ranked(query, results)
ranked_results_bert_P = bert_precision(query, results)

>>>> Results in Total Time: 0.04893994331359863


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaM

In [None]:
final_results = pd.DataFrame()
final_results['cross_encoder'] = [[item['Title'], item['Score']] for item in ranked_results]
final_results['Precision'] = [[item['Title'], item['Score']] for item in ranked_results_bert_P]
final_results

Unnamed: 0,cross_encoder,Precision
0,"[The Bible: In the Beginning, 0.0022279434]","[Rasta, 0.8213908]"
1,[Kamen Rider × Kamen Rider W & Decade: Movie W...,"[Who Done It?, 0.7741748]"
2,"[Rasta, 0.0006663714]","[The Shriek of Araby, 0.7636542]"
3,"[Waxwork II: Lost in Time, 0.00036699412]","[Hunt Angels, 0.75823027]"
4,"[Vikingdom, 0.00035181877]","[What a Night!, 0.7503693]"
5,"[Slaughterhouse-Five, 0.0003320172]","[Dangerous Parking, 0.7428129]"
6,"[Dangerous Parking, 0.0003142066]","[The Bible: In the Beginning, 0.7363828]"
7,"[Real Men, 0.00022772544]","[Alien Nation: Dark Horizon, 0.73367196]"
8,"[My Summer Story, 0.00018868317]","[My Summer Story, 0.7304483]"
9,"[Future War, 0.00018815967]","[Future War, 0.7282488]"


In [None]:
from pprint import pprint

query="World war 2 movies"
results=search(query, top_k=15, index=index, model=model)
ranked_results = cross_ranked(query, results)
ranked_results_bert_P = bert_precision(query, results)

>>>> Results in Total Time: 0.047971487045288086


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaM

In [None]:
final_results = pd.DataFrame()
final_results['cross_encoder'] = [[item['Title'], item['Score']] for item in ranked_results]
final_results['Precision'] = [[item['Title'], item['Score']] for item in ranked_results_bert_P]
final_results

Unnamed: 0,cross_encoder,Precision
0,"[Fantasy Mission Force, 0.89068276]","[The Beach Party at the Threshold of Hell, 0.7..."
1,"[Jayne Mansfield's Car, 0.13887179]","[Back at the Front, 0.756638]"
2,"[Back at the Front, 0.09029461]","[Desert Bloom, 0.72757775]"
3,"[White Christmas, 0.05118225]","[When Taekwondo Strikes, 0.7274536]"
4,"[Midway, 0.02662519]","[No Time to Die, 0.7273537]"
5,"[When Taekwondo Strikes, 0.0036365238]","[Midway, 0.71536064]"
6,"[Kings Go Forth, 0.0014568501]","[Jayne Mansfield's Car, 0.71193355]"
7,"[Frankenstein vs. Baragon, 0.0010568157]","[Fantasy Mission Force, 0.69983625]"
8,"[Men Must Fight, 0.0010209082]","[The Desert Fox, 0.69581574]"
9,"[The White Cliffs of Dover, 0.00095018256]","[Frankenstein vs. Baragon, 0.69374275]"


In [None]:
from pprint import pprint

query="Movie about treasure hunters"
results=search(query, top_k=15, index=index, model=model)
ranked_results = cross_ranked(query, results)
ranked_results_bert_P = bert_precision(query, results)

>>>> Results in Total Time: 0.04215431213378906


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaM

In [None]:
final_results = pd.DataFrame()
final_results['cross_encoder'] = [[item['Title'], item['Score']] for item in ranked_results]
final_results['Precision'] = [[item['Title'], item['Score']] for item in ranked_results_bert_P]
final_results

Unnamed: 0,cross_encoder,Precision
0,"[Wind Across the Everglades, 0.0002499526]","[Panther Girl of the Kongo, 0.79587257]"
1,"[Armour of God II: Operation Condor, 0.0002292...","[Sekigahara, 0.7773442]"
2,"[Boxcar Bertha, 0.00022597713]","[Boxcar Bertha, 0.77551615]"
3,"[Brotherhood of Blood, 0.0001933267]","[Getting Even, 0.76187]"
4,"['Gator Bait, 0.00018620535]","[Wind Across the Everglades, 0.758898]"
5,"[Headin' South, 0.00017327626]","[Headin' South, 0.75559574]"
6,"[Sekigahara, 0.00016874041]","[April Folly, 0.7540952]"
7,"[Catalina Caper, 0.00016619384]","[The Deceivers, 0.74791384]"
8,"[The Turkish Gambit, 0.00016225206]","['Gator Bait, 0.7478981]"
9,"[Panther Girl of the Kongo, 0.0001620832]","[Brotherhood of Blood, 0.7438413]"
