In [31]:
import re
import string
from difflib import SequenceMatcher
from itertools import chain
import google.generativeai as genai
import os
import time

import joblib
import pandas as pd
import wikipedia
import wikipediaapi
import hashlib

from google.api_core.exceptions import ResourceExhausted

from utils.scrape_utils import search_wikipedia, get_scrape_list
from utils.wiki_scrapper import GenericScrapper

In [2]:
# Autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Specify the delimiter (e.g., ',' for CSV, '\t' for tab, or '::' for MovieLens)
file_path = 'dataset/users.dat'
users_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
users_df.columns = ["UserID", "Gender", "Age", "Occupation","Zip-code"]
print(users_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB
None


In [3]:
file_path = 'dataset/ratings.dat'
ratings_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
ratings_df.columns = ["UserID","MovieID","Rating","Timestamp"]
print(ratings_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None


In [4]:
#  File being encoded in a different format, such as 'latin-1' or 'ISO-8859-1'.
file_path = 'dataset/movies.dat'
movies_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None, encoding='latin-1')
movies_df.columns = ["MovieID", "Title", "Genres"]
print(movies_df.info())
print(movies_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [19]:
display([elem for elem in movies_df.Title if "Mummy" in elem])

['Mummy, The (1999)',
 'Mummy, The (1932)',
 'Mummy, The (1959)',
 "Mummy's Curse, The (1944)",
 "Mummy's Ghost, The (1944)",
 "Mummy's Hand, The (1940)",
 "Mummy's Tomb, The (1942)"]

## Scraping data

Cases:
1. Movies with multiple series
2. Movies with multiple remakes

In [5]:
# Getting Items to scrape
web_scrape_list = movies_df.copy()
scrape_list = get_scrape_list(web_scrape_list)
len(scrape_list['title'].unique())

3840

In [22]:
scrape_list['title'].unique()

array(['$1,000,000 Duck', "'Night Mother", "'Til There Was You", ...,
       'Zeus and Roxanne', 'Zone 39', 'eXistenZ'], dtype=object)

In [30]:
web_scrape_list

Unnamed: 0,MovieID,Title,Genres,title,year
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000
3880,3950,Tigerland (2000),Drama,Tigerland,2000
3881,3951,Two Family House (2000),Drama,Two Family House,2000


In [6]:
## Initialise
scrapper = GenericScrapper()

#### Using Wikipedia API

In [27]:
# Example to get scraper
sections = scrapper.get_section_dict('King Kong (2005)')

In [None]:
sections["Plot"]

In [17]:
scrape_df = web_scrape_list

def check_valid_page(page):
    if not page.exists():
        return False
    if len(page.sections) <= 1:
        return False
    # for section in page.sections:
    #     if (section.title == 'See also') and ('(disambiguation)' in section.text):
    #         return False
    return True


def fix_malformed_title(title: str) -> str:
    title = title.strip()
    alias_title = re.match(r'(.*?) (\(.+\))', title)
    if alias_title is not None:
        title = alias_title.group(1)
    if title.endswith((', The', ', Les')):
        title = f'The {title[:-5]}'
    return title


def get_search_title(title: str):
    return f'{fix_malformed_title(title)} (film)'


def generate_candidate_titles(title: str) -> list[str]:
    fixed_title = fix_malformed_title(title)
    title_case_title = string.capwords(fixed_title)
    titles = [fixed_title, title_case_title]
    firstname_format = re.match((r'(.*?), (\w+)$'), fixed_title)
    if firstname_format is not None:
        title = f'{string.capwords(firstname_format.group(2))} {firstname_format.group(1)}'
        titles.append(title)
    candidate_titles = [
        [
            f'{modified_title} ({row.year} film)',
            f'{modified_title} (film)',
            f'{modified_title} ({row.year})',
            modified_title,
        ]
        for modified_title in titles
    ]
    candidate_titles = list(dict.fromkeys(chain.from_iterable(candidate_titles)))
    return candidate_titles

def get_best_search_matches(search_titles: list[str], candidate_titles: list[str], threshold: float = 0.85) -> list[str]:
    filtered_titles = []
    for search_title in search_titles:
        best_score = max(SequenceMatcher(None, search_title, candidate).ratio() for candidate in candidate_titles)
        if best_score > threshold:
            filtered_titles.append((best_score, search_title))
    filtered_titles = sorted(filtered_titles, reverse=True)
    return filtered_titles
            

plot_regex = re.compile('[Pp]lot|Synopsis|Summary.*')
scraped_content = dict()
scraped_signature = dict()
scraped_title = dict()
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (dhytra97@gmail.com)', 'en')
for row in scrape_df.itertuples():
    if row.Title in scraped_title:
        continue
    try:
        candidate_titles = generate_candidate_titles(get_search_title(row.title))
        for title in candidate_titles:
            page_py = wiki_wiki.page(title)
            if check_valid_page(page_py):
                break
        else:
            print(f'{row.Title} page not found; searching for matches')
            search_titles = wikipedia.search(row.title)
            print(f'{search_titles=}')
            candidate_search_titles = get_best_search_matches(search_titles, candidate_titles)
            print(f'{candidate_search_titles=}')
            if len(candidate_search_titles) == 0:
                print(f'{row.Title} page not found; skipping')
                print(f'{candidate_titles=}')
                continue
            for score, title in candidate_search_titles:
                page_py = wiki_wiki.page(title)
                if check_valid_page(page_py):
                    print(f'title found: {title}')
                    break
            else:
                print(f'{row.Title} page not found; skipping')
                print(f'{candidate_search_titles=}')
                continue
        plot_sections = [section.title for section in page_py.sections if plot_regex.match(section.title)]
        if len(plot_sections) != 1:
            print(f'Invalid sections for {title}: {plot_sections}')
            print(f'{candidate_titles=}')
            search_titles = wikipedia.search(get_search_title(row.title))
            print(f'{search_titles=}')
            candidate_search_titles = get_best_search_matches(search_titles, candidate_titles)
            print(f'{candidate_search_titles=}')
            if len(candidate_search_titles) == 0:
                print(f'{row.Title} page not found from search; skipping')
                print(f'{candidate_titles=}')
                continue
            for score, title in candidate_search_titles:
                page_py = wiki_wiki.page(title)
                if check_valid_page(page_py):
                    print(f'title found: {title}')
                    break
            else:
                print(f'{row.Title} page not found from search; skipping')
                continue
        plot_sections = [section.title for section in page_py.sections if plot_regex.match(section.title)]
        if len(plot_sections) != 1:
            print(f'Invalid sections for {title} after search: {plot_sections}')
            continue
        plot_text = page_py.section_by_title(plot_sections[0]).text
        scraped_content[title] = plot_text
        text_hash = hashlib.md5(plot_text.encode()).hexdigest()
        scraped_signature[title] = text_hash
        scraped_title[row.Title] = title
    except Exception as e:
        print(f'Error for {row.Title}: {e}')
        continue
    # break
print(len(scraped_signature))
scraped_signature

Invalid sections for How to Make an American Quilt: []
candidate_titles=['How to Make an American Quilt (1995 film)', 'How to Make an American Quilt (film)', 'How to Make an American Quilt (1995)', 'How to Make an American Quilt', 'How To Make An American Quilt (1995 film)', 'How To Make An American Quilt (film)', 'How To Make An American Quilt (1995)', 'How To Make An American Quilt']
search_titles=['How to Make an American Quilt', 'How to Make an American Quilt (novel)', 'Jocelyn Moorhouse', 'NAMES Project AIDS Memorial Quilt', 'Lecy Goranson', '1995 in film', 'Winona Ryder filmography', 'Melinda Dillon', 'Whitney Otto', 'List of awards and nominations received by Winona Ryder']
candidate_search_titles=[(1.0, 'How to Make an American Quilt'), (0.9041095890410958, 'How to Make an American Quilt (novel)')]
title found: How to Make an American Quilt
Invalid sections for How to Make an American Quilt after search: []
Invalid sections for Kicking and Screaming (1995 film): []
candidate_ti

{'Extreme Measures (1996 film)': '78434e64882bd30ec9e8cd529ed82796',
 'The Glimmer Man': '1b0f5a920add8adb31072b42fb749402',
 'D3: The Mighty Ducks': '78cbc3786d9d350491f8c2a12a906294',
 'The Chamber (1996 film)': 'a293d3e6b847bf47f5b8d83243004450',
 'The Apple Dumpling Gang (film)': '7e68dfab01a3d16e7d33518c11024ae4',
 'Davy Crockett, King of the Wild Frontier': 'd41d8cd98f00b204e9800998ecf8427e',
 'Escape to Witch Mountain (1975 film)': '0228684c2deb69fcd659ee503aa3e6f9',
 'The Love Bug (1969 film)': 'a250f4d90cdb1d0bde28d46f35a99f0e',
 'Herbie Rides Again': '222cee9fbda8eca068c315b76e4c13f0',
 'Old Yeller (1957 film)': '1acaeea9193dfbd49801ce3de1b3c618',
 'The Parent Trap (1961 film)': '947a982929053f019441cbe7cb3a2017',
 'Pollyanna (1960 film)': '6a40aafc925f24659c9fef30977b1780',
 'Homeward Bound: The Incredible Journey': '673bb935cca8352e3c80fc0dfe0565a2',
 'The Shaggy Dog (1959 film)': '0eb12861531e48c31dc53a7e78ca6fd3',
 'Swiss Family Robinson (1960 film)': '885865d64d54008dd43

In [18]:
scrape_df.loc[:, 'mapped_title'] = scrape_df['Title'].map(scraped_title)
scrape_df.loc[:, 'text_hash'] = scrape_df['mapped_title'].map(scraped_signature)
scrape_df.loc[:, 'text'] = scrape_df['mapped_title'].map(scraped_content)
scrape_df.to_parquet('scraped_2024-10-13.parquet')
scrape_df

Unnamed: 0,MovieID,Title,Genres,title,year,mapped_title,text_hash,text
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995,Toy Story (film),,
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995,Jumanji (1995 film),,
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,Grumpier Old Men,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995,Waiting to Exhale,,
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,Father of the Bride Part II,,
...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,Meet the Parents,0205542c91d125bfa618209ae7ba669d,"Greg Focker, a Jewish American nurse living in..."
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000,Requiem for a Dream (film),362c5f9c1e0d53ccb956afdfd62fbd97,"Sara Goldfarb, a widow living alone in a Brigh..."
3880,3950,Tigerland (2000),Drama,Tigerland,2000,Tigerland,6155db001be96ec0c978942893ccd9cb,"In September 1971, the United States is losing..."
3881,3951,Two Family House (2000),Drama,Two Family House,2000,Two Family House,112204579a83ed03153a65a35ce7bd4f,An unseen narrator looks back to the year 1956...


In [20]:
len(scraped_signature) / scrape_df.shape[0]

0.6824620139067731

Generate summary

In [7]:
scrape_df = pd.read_parquet('scraped_2024-10-13.parquet')

In [8]:
genai.configure(api_key="AIzaSyDl1akKL6KNjiYTvJoDXfJmEbRNR3i7rAQ")
genai_model = genai.GenerativeModel('gemini-1.5-flash')

In [26]:
for row in scrape_df.itertuples():
    if pd.isnull(row.text):
        continue
    print(row.mapped_title)
    prompt = f"Summarize concisely the plot for the movie {row.mapped_title}. The plot is as follows: {row.text} "
    response = genai_model.generate_content(prompt)
    print(response.text)
    break

Extreme Measures (1996 film)
Dr. Guy Luthan encounters a mysterious homeless patient with strange symptoms and a mysterious wristband.  His pursuit of answers leads to his own downfall as he's framed for drug possession, losing his job, license, and friends. He discovers a secret underground organization led by Dr. Myrick, who is conducting deadly spinal experiments on homeless people in an attempt to find a cure for paralysis. Myrick tries to convince Guy to join his cause, but Guy believes Myrick's methods are unethical. Myrick is killed, and Guy receives his research, ultimately deciding to continue the work. 



In [9]:
try:
    len(summarized_text)
except Exception:
    summarized_text = dict()

for row in scrape_df.itertuples():
    if pd.isnull(row.text):
        continue
    if row.mapped_title in summarized_text:
        continue
    print(row.mapped_title)
    prompt = f"Summarize concisely the plot for the movie {row.mapped_title}. The plot is as follows: {row.text} "
    for i in range(10):
        try:
            response = genai_model.generate_content(prompt)
            break
        except ResourceExhausted:
            print(f'ResourceExhausted: sleeping. Current progress: {row.Index}')
            time.sleep(20)
    else:
        print(f'Failed to retrieve tokens; aborting')
        break
    # print(response.text)
    summarized_text[row.mapped_title] = response
    time.sleep(0.1)

print(len(summarized_text))
print(len(summarized_text) / scrape_df.shape[0])


Extreme Measures (1996 film)
The Glimmer Man
D3: The Mighty Ducks
The Chamber (1996 film)
The Apple Dumpling Gang (film)
Davy Crockett, King of the Wild Frontier
Escape to Witch Mountain (1975 film)
The Love Bug (1969 film)
Herbie Rides Again
Old Yeller (1957 film)
The Parent Trap (1961 film)
Pollyanna (1960 film)
Homeward Bound: The Incredible Journey
The Shaggy Dog (1959 film)
Swiss Family Robinson (1960 film)
That Darn Cat! (1965 film)
20,000 Leagues Under the Sea (1954 film)
Cool Runnings (film)
Angels in the Outfield (1994 film)
Cinderella (1950 film)
Winnie the Pooh and the Blustery Day
The Three Caballeros
The Sword in the Stone (1963 film)
So Dear to My Heart
Robin Hood: Prince of Thieves (film)
Mary Poppins (1964 film)
Dumbo (1941 film)
Pete's Dragon (1977 film)
Bedknobs and Broomsticks
Alice in Wonderland (1951 film)
The Fox and the Hound (1981 film)
Freeway (1996 film)
The Sound of Music (1965 film)
Die Hard (film)
The Lawnmower Man (1992 film)
Unhook the Stars
The Secret Ag

In [32]:
# joblib.dump(summarized_text, 'summarized_text.joblib')

['summarized_text.joblib']

In [19]:
summarized_text['Meet the Parents'].to_dict()['candidates'][0]['content']['parts'][0]['text']

"Greg Focker, a nurse, travels to Long Island to meet his girlfriend Pam's overbearing father, Jack, a retired CIA agent who suspects Greg is not good enough for his daughter. Greg's attempts to impress Jack backfire, leading to a series of mishaps and lies, ultimately culminating in Jack believing Greg is a marijuana user and a liar. However, after Pam defends Greg and reveals his true identity, Jack realizes his daughter truly loves him and finally accepts him. In a heartwarming twist, Jack encourages Greg to propose to Pam, marking a turning point in their relationship.  \n"

In [24]:
summarized_text['Meet the Parents'].text

"Greg Focker, a nurse, travels to Long Island to meet his girlfriend Pam's overbearing father, Jack, a retired CIA agent who suspects Greg is not good enough for his daughter. Greg's attempts to impress Jack backfire, leading to a series of mishaps and lies, ultimately culminating in Jack believing Greg is a marijuana user and a liar. However, after Pam defends Greg and reveals his true identity, Jack realizes his daughter truly loves him and finally accepts him. In a heartwarming twist, Jack encourages Greg to propose to Pam, marking a turning point in their relationship.  \n"

In [26]:
def get_data_from_genai_response(title: str, response):
    try:
        text = response.text
    except Exception as e:
        print(f'Error getting text from {title}: {e}')
        return ''
    return text

In [28]:
scrape_df.loc[:, 'summarized_text'] = scrape_df['mapped_title'].map(lambda x: get_data_from_genai_response(x, summarized_text[x]) if x in summarized_text else '')
display(scrape_df)
# scrape_df.to_parquet('scraped_with_gemini_summary_2024-10-13.parquet')

Error getting text from The Love Bug (1969 film): ("Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 3. The candidate's safety_ratings are: [category: HARM_CATEGORY_SEXUALLY_EXPLICIT\nprobability: MEDIUM\n, category: HARM_CATEGORY_HATE_SPEECH\nprobability: NEGLIGIBLE\n, category: HARM_CATEGORY_HARASSMENT\nprobability: NEGLIGIBLE\n, category: HARM_CATEGORY_DANGEROUS_CONTENT\nprobability: NEGLIGIBLE\n].", [category: HARM_CATEGORY_SEXUALLY_EXPLICIT
probability: MEDIUM
, category: HARM_CATEGORY_HATE_SPEECH
probability: NEGLIGIBLE
, category: HARM_CATEGORY_HARASSMENT
probability: NEGLIGIBLE
, category: HARM_CATEGORY_DANGEROUS_CONTENT
probability: NEGLIGIBLE
])
Error getting text from Cinderella (1950 film): ("Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were

Unnamed: 0,MovieID,Title,Genres,title,year,mapped_title,text_hash,text,summarized_text
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995,Toy Story (film),,,
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995,Jumanji (1995 film),,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,Grumpier Old Men,,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995,Waiting to Exhale,,,
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,Father of the Bride Part II,,,
...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,Meet the Parents,0205542c91d125bfa618209ae7ba669d,"Greg Focker, a Jewish American nurse living in...","Greg Focker, a nurse, travels to Long Island t..."
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000,Requiem for a Dream (film),362c5f9c1e0d53ccb956afdfd62fbd97,"Sara Goldfarb, a widow living alone in a Brigh...",
3880,3950,Tigerland (2000),Drama,Tigerland,2000,Tigerland,6155db001be96ec0c978942893ccd9cb,"In September 1971, the United States is losing...","In 1971, at Fort Polk, rebellious draftee Rola..."
3881,3951,Two Family House (2000),Drama,Two Family House,2000,Two Family House,112204579a83ed03153a65a35ce7bd4f,An unseen narrator looks back to the year 1956...,


In [40]:
scrape_df[scrape_df['summarized_text'].str.len() > 0]

Unnamed: 0,MovieID,Title,Genres,title,year,mapped_title,text_hash,text,summarized_text
990,1003,Extreme Measures (1996),Drama|Thriller,Extreme Measures,1996,Extreme Measures (1996 film),78434e64882bd30ec9e8cd529ed82796,Dr. Guy Luthan is a New York emergency room do...,"Dr. Guy Luthan, an ER doctor, encounters a hom..."
991,1004,"Glimmer Man, The (1996)",Action|Thriller,"Glimmer Man, The",1996,The Glimmer Man,1b0f5a920add8adb31072b42fb749402,Jack Cole was once a Central Intelligence Agen...,"Former CIA operative Jack Cole, now a Los Ange..."
992,1005,D3: The Mighty Ducks (1996),Children's|Comedy,D3: The Mighty Ducks,1996,D3: The Mighty Ducks,78cbc3786d9d350491f8c2a12a906294,After their victory at the Junior Goodwill Gam...,"After winning the Junior Goodwill Games, the M..."
993,1006,"Chamber, The (1996)",Drama,"Chamber, The",1996,The Chamber (1996 film),a293d3e6b847bf47f5b8d83243004450,"In April 1967, the office of Marvin Kramer, a ...","""The Chamber"" tells the story of Adam Hall, a ..."
994,1007,"Apple Dumpling Gang, The (1975)",Children's|Comedy|Western,"Apple Dumpling Gang, The",1975,The Apple Dumpling Gang (film),7e68dfab01a3d16e7d33518c11024ae4,"Set in the Wild West in 1879, a slick gambler ...","In 1879, gambler Russell Donovan is tricked in..."
...,...,...,...,...,...,...,...,...,...
3872,3942,Sorority House Massacre II (1990),Horror,Sorority House Massacre II,1990,Sorority House Massacre II,7a160a1e6ea4e9d8d27daa370eb1c636,"Five women, Linda, Jessica, Kimberly, Suzanne ...","Five sorority sisters purchase a cheap house, ..."
3874,3944,Bootmen (2000),Comedy|Drama,Bootmen,2000,Bootmen,0b1e76bc5d4973f24eaa4cd221a0a76d,"Sean and Mitchell are young adult brothers, ha...","Two brothers, Sean (a dancer) and Mitchell (a ..."
3875,3945,Digimon: The Movie (2000),Adventure|Animation|Children's,Digimon: The Movie,2000,Digimon: The Movie,d41d8cd98f00b204e9800998ecf8427e,,Please provide me with the specific movie titl...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,Meet the Parents,0205542c91d125bfa618209ae7ba669d,"Greg Focker, a Jewish American nurse living in...","Greg Focker, a nurse, travels to Long Island t..."


In [42]:
# Let Gemini generate the plot summary if not found

generated_plot_summary = dict()
for row in scrape_df.itertuples():
    if len(row.summarized_text) > 0:
        continue
    title = row.mapped_title if pd.notnull(row.mapped_title) else row.Title
    prompt = f"Generate a concise summary of the plot for the film {title}, and return \"Unknown\" if the film is unknown."
    for i in range(3):
        try:
            response = genai_model.generate_content(prompt)
            break
        except ResourceExhausted:
            print(f'ResourceExhausted: sleeping. Current progress: {row.Index}')
            time.sleep(10)
    else:
        print(f'Failed to retrieve tokens; aborting')
        break
    try:
        generated_plot_summary[row.Title] = response.text
    except Exception as e:
        print(f'Error generating text for {title}')
        continue
print(len(generated_plot_summary))

Error generating text for Grumpier Old Men
Error generating text for Waiting to Exhale
Error generating text for Father of the Bride Part II
Error generating text for Sabrina (1995 film)
Error generating text for The American President (film)
Error generating text for Dracula: Dead and Loving It
Error generating text for Casino (1995 film)
Error generating text for Sense and Sensibility (1995 film)
Error generating text for Powder (1995 film)
Error generating text for Leaving Las Vegas
Error generating text for Othello (1995 film)
Error generating text for Persuasion (1995 film)
Error generating text for Shanghai Triad
Error generating text for Carrington (film)
Error generating text for Clueless (film)
Error generating text for Restoration (1995 film)
Error generating text for To Die For (film)
Error generating text for Pocahontas (1995 film)
Error generating text for Mighty Aphrodite
Error generating text for Eye for an Eye (1996 film)
Error generating text for Bed of Roses (1996 fil

In [43]:
scrape_df.loc[:, 'generated_summary'] = scrape_df['Title'].map(generated_plot_summary)
scrape_df.to_parquet('scraped_with_generated_summary)2024-10-13.parquet')

In [44]:
scrape_df

Unnamed: 0,MovieID,Title,Genres,title,year,mapped_title,text_hash,text,summarized_text,generated_summary
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995,Toy Story (film),,,,"Woody, a pull-string cowboy doll, is the belov..."
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995,Jumanji (1995 film),,,,"Two children, Judy and Peter, find a mysteriou..."
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,Grumpier Old Men,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995,Waiting to Exhale,,,,
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,Father of the Bride Part II,,,,
...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,Meet the Parents,0205542c91d125bfa618209ae7ba669d,"Greg Focker, a Jewish American nurse living in...","Greg Focker, a nurse, travels to Long Island t...",
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000,Requiem for a Dream (film),362c5f9c1e0d53ccb956afdfd62fbd97,"Sara Goldfarb, a widow living alone in a Brigh...",,Four individuals in New York City struggle wit...
3880,3950,Tigerland (2000),Drama,Tigerland,2000,Tigerland,6155db001be96ec0c978942893ccd9cb,"In September 1971, the United States is losing...","In 1971, at Fort Polk, rebellious draftee Rola...",
3881,3951,Two Family House (2000),Drama,Two Family House,2000,Two Family House,112204579a83ed03153a65a35ce7bd4f,An unseen narrator looks back to the year 1956...,,Two Family House tells the story of a young co...
