In [1]:
import re
import string
from difflib import SequenceMatcher
from itertools import chain
from typing import Sequence

import pandas as pd
import wikipedia
import wikipediaapi
import hashlib
from utils.scrape_utils import search_wikipedia, get_scrape_list
from utils.wiki_scrapper import GenericScrapper

In [2]:
# Autoreload: load the IPython autoreload extension in mode 2 so that edits to
# imported project modules (e.g. utils.*) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
# The .dat files separate fields with '::'. A multi-character separator is only
# handled by pandas' 'python' parser engine, hence engine='python'.
# The file has no header row, so the column names are supplied explicitly.
file_path = 'dataset/users.dat'
users_df = pd.read_csv(
    file_path,
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"],
)
# DataFrame.info() writes its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB
None


In [4]:
# Ratings use the same '::'-separated, header-less layout, so the column names
# are supplied explicitly and the 'python' parser engine is required.
file_path = 'dataset/ratings.dat'
ratings_df = pd.read_csv(
    file_path,
    sep='::',
    engine='python',
    header=None,
    names=["UserID", "MovieID", "Rating", "Timestamp"],
)
# DataFrame.info() prints its report and returns None; do not wrap it in
# print(), which would add a stray "None" line to the output.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None


In [5]:
# movies.dat is not UTF-8 encoded; it is read as 'latin-1' (a.k.a. ISO-8859-1).
# Same '::'-separated, header-less layout as the other .dat files.
file_path = 'dataset/movies.dat'
movies_df = pd.read_csv(
    file_path,
    sep='::',
    engine='python',
    header=None,
    encoding='latin-1',
    names=["MovieID", "Title", "Genres"],
)
# DataFrame.info() prints its report and returns None; do not wrap it in
# print(), which would add a stray "None" line to the output.
movies_df.info()
# A trailing expression gives the rich notebook rendering of the preview.
movies_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
dtypes: int64(1), object(2)
memory usage: 91.1+ KB
None
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [19]:
# Spot-check: list every raw title that contains the literal substring "Mummy".
display(movies_df.loc[movies_df['Title'].str.contains('Mummy', regex=False), 'Title'].tolist())

['Mummy, The (1999)',
 'Mummy, The (1932)',
 'Mummy, The (1959)',
 "Mummy's Curse, The (1944)",
 "Mummy's Ghost, The (1944)",
 "Mummy's Hand, The (1940)",
 "Mummy's Tomb, The (1942)"]

## Scraping data

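Cases the title matching has to handle:
1. Movies that belong to a series, i.e. several entries sharing part of their title (e.g. *The Mummy's Curse*, *The Mummy's Ghost*, *The Mummy's Hand*)
2. Movies with multiple remakes released under the same title (e.g. *The Mummy* from 1932, 1959 and 1999)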

In [8]:
# Build the list of items to scrape from a copy of the movies frame, then
# report how many distinct values its 'title' column holds (a missing value,
# if any, counts as one distinct entry).
web_scrape_list = movies_df.copy()
scrape_list = get_scrape_list(web_scrape_list)
scrape_list['title'].nunique(dropna=False)

3840

In [22]:
# Peek at the distinct values of the 'title' column (the array repr is truncated).
scrape_list['title'].unique()

array(['$1,000,000 Duck', "'Night Mother", "'Til There Was You", ...,
       'Zeus and Roxanne', 'Zone 39', 'eXistenZ'], dtype=object)

In [30]:
# Inspect the frame that was handed to get_scrape_list; the captured output
# shows 'title' and 'year' columns next to the original MovieID/Title/Genres.
web_scrape_list

Unnamed: 0,MovieID,Title,Genres,title,year
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000
3880,3950,Tigerland (2000),Drama,Tigerland,2000
3881,3951,Two Family House (2000),Drama,Two Family House,2000


In [6]:
## Initialise
# Create the project's scraper object (GenericScrapper from utils.wiki_scrapper).
scrapper = GenericScrapper()

#### Using Wikipedia API

In [27]:
# Example: ask the scraper for the sections of a single article.
# The result is indexed by section title in the next cell; see
# GenericScrapper.get_section_dict for the exact return type.
sections = scrapper.get_section_dict('King Kong (2005)')

In [None]:
# Show the entry stored under the "Plot" key of the example result.
sections["Plot"]

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy), so
# columns later assigned through scrape_df are also visible on web_scrape_list.
scrape_df = web_scrape_list

def check_valid_page(page):
    """Return True when ``page`` is usable as an article, False otherwise.

    A page is accepted only if ``page.exists()`` is truthy and
    ``page.sections`` holds more than one entry. ``page.sections`` is not
    accessed at all for a page that does not exist.

    A further rule that would reject pages whose "See also" section mentions
    "(disambiguation)" was present only as commented-out code and is not
    applied.
    """
    return bool(page.exists()) and len(page.sections) > 1


def fix_malformed_title(title: str, articles: Sequence[str] = ('The', 'Les')) -> str:
    """Normalise a catalogue-style title into natural word order.

    Steps:
      1. Strip surrounding whitespace.
      2. Drop a parenthesised suffix (alias, "(film)" marker, ...): everything
         from the first " (" that is followed by a closing ")" is removed,
         e.g. ``'Hate (Haine, La)'`` -> ``'Hate'``.
      3. Move a trailing ", <article>" to the front,
         e.g. ``'Mummy, The'`` -> ``'The Mummy'``.

    Args:
        title: Raw title, with or without a parenthesised suffix.
        articles: Trailing articles to move to the front. The default keeps
            the two articles the function has always recognised.

    Returns:
        The cleaned title.

    Fix: a title ending in ", Les" used to be rewritten with the prefix "The"
    (``'Miserables, Les'`` -> ``'The Miserables'``); the matched article itself
    is now used as the prefix.
    """
    title = title.strip()
    alias_title = re.match(r'(.*?) (\(.+\))', title)
    if alias_title is not None:
        title = alias_title.group(1)
    for article in articles:
        suffix = f', {article}'
        if title.endswith(suffix):
            # Re-attach the same article that was found at the end.
            return f'{article} {title[:-len(suffix)]}'
    return title


def get_search_title(title: str):
    """Return the cleaned-up form of ``title`` followed by the suffix " (film)"."""
    cleaned_title = fix_malformed_title(title)
    return cleaned_title + ' (film)'


def generate_candidate_titles(title: str, year=None) -> list[str]:
    """Build the ordered list of page titles to try for one movie.

    Up to three spellings of the title are considered: the cleaned title, its
    ``string.capwords`` form, and, when the cleaned title ends in
    ", <word>", the form with that word moved to the front
    (``'Postino, Il'`` -> ``'Il Postino'``). Each spelling is expanded, in this
    order of preference, to ``'<t> (<year> film)'``, ``'<t> (film)'``,
    ``'<t> (<year>)'`` and ``'<t>'``.

    Args:
        title: Movie title; it is passed through ``fix_malformed_title`` first.
        year: Release year used in the year-qualified candidates. When omitted,
            the function falls back to ``row.year`` of the notebook-global
            loop variable ``row``, which is what it silently depended on
            before this parameter existed. Prefer passing it explicitly.

    Returns:
        Candidate titles with duplicates removed, first occurrence kept.
    """
    if year is None:
        # Backward compatibility: existing call sites pass only the title and
        # rely on the scraping loop's global `row`.
        year = row.year
    base_title = fix_malformed_title(title)
    titles = [base_title, string.capwords(base_title)]
    firstname_format = re.match(r'(.*?), (\w+)$', base_title)
    if firstname_format is not None:
        titles.append(
            f'{string.capwords(firstname_format.group(2))} {firstname_format.group(1)}'
        )
    expanded_titles = chain.from_iterable(
        (
            f'{modified_title} ({year} film)',
            f'{modified_title} (film)',
            f'{modified_title} ({year})',
            modified_title,
        )
        for modified_title in titles
    )
    # dict.fromkeys de-duplicates while preserving the preference order.
    return list(dict.fromkeys(expanded_titles))

def get_best_search_matches(
    search_titles: list[str],
    candidate_titles: list[str],
    threshold: float = 0.85,
) -> list[tuple[float, str]]:
    """Keep the search results that closely resemble one of the candidates.

    Each search title is scored with its best ``SequenceMatcher`` ratio
    against any candidate title, and kept only if that score is strictly
    greater than ``threshold``.

    Args:
        search_titles: Titles returned by a search.
        candidate_titles: Titles the search results are compared against.
        threshold: Minimum similarity (exclusive) for a result to be kept.

    Returns:
        ``(score, search_title)`` tuples sorted from best to worst score.
        The list is empty when nothing clears the threshold or when either
        input is empty.

    Fixes: the return annotation previously claimed ``list[str]`` although
    tuples are returned, and an empty ``candidate_titles`` made ``max()``
    raise ``ValueError`` instead of yielding no matches.
    """
    if not candidate_titles:
        # Nothing to compare against, so no search result can match.
        return []
    scored_titles = []
    for search_title in search_titles:
        best_score = max(
            SequenceMatcher(None, search_title, candidate).ratio()
            for candidate in candidate_titles
        )
        if best_score > threshold:
            scored_titles.append((best_score, search_title))
    scored_titles.sort(reverse=True)
    return scored_titles
            

# Scrape one plot text per row of scrape_df.
#
# Section headings accepted as "the plot". The pattern is applied with
# .match(), i.e. anchored at the start of the heading only, so any heading
# that begins with "Plot", "plot", "Synopsis" or "Summary" qualifies.
plot_regex = re.compile('[Pp]lot|Synopsis|Summary.*')
# Result stores, filled at the end of each successful iteration:
#   scraped_content:   resolved page title -> plot text
#   scraped_signature: resolved page title -> MD5 hex digest of the plot text
#   scraped_title:     row.Title           -> resolved page title
# NOTE(review): all three are re-created on every run of this cell, so the
# "already scraped" check below only skips Titles repeated within one run.
scraped_content = dict()
scraped_signature = dict()
scraped_title = dict()
# NOTE(review): the first argument looks like the client's user-agent string
# and embeds a hard-coded personal e-mail address - confirm, and consider
# moving it to configuration.
wiki_wiki = wikipediaapi.Wikipedia('MyProjectName (dhytra97@gmail.com)', 'en')
for row in scrape_df.itertuples():
    # Skip a Title that has already been resolved earlier in this run.
    if row.Title in scraped_title:
        continue
    # generate_candidate_titles additionally reads row.year from this loop's
    # `row` through the notebook's global namespace.
    candidate_titles = generate_candidate_titles(get_search_title(row.title))
    # Stage 1: try the generated titles directly, in order of preference.
    for title in candidate_titles:
        page_py = wiki_wiki.page(title)
        if check_valid_page(page_py):
            break
    else:
        # for/else: reached only when no candidate produced a valid page.
        # Stage 2: run a search on the raw title and keep the results that
        # resemble one of the candidates.
        print(f'{row.Title} page not found; searching for matches')
        search_titles = wikipedia.search(row.title)
        print(f'{search_titles=}')
        candidate_search_titles = get_best_search_matches(search_titles, candidate_titles)
        print(f'{candidate_search_titles=}')
        if len(candidate_search_titles) == 0:
            print(f'{row.Title} page not found; skipping')
            print(f'{candidate_titles=}')
            continue
        # Results are sorted best score first; take the first valid page.
        for score, title in candidate_search_titles:
            page_py = wiki_wiki.page(title)
            if check_valid_page(page_py):
                print(f'title found: {title}')
                break
        else:
            print(f'{row.Title} page not found; skipping')
            print(f'{candidate_search_titles=}')
            continue
    # Here `title`/`page_py` refer to the page that passed validation.
    # Exactly one plot-like section is required.
    plot_sections = [section.title for section in page_py.sections if plot_regex.match(section.title)]
    if len(plot_sections) != 1:
        # Stage 3: the page has zero or several plot-like sections; search
        # again, this time with the " (film)"-suffixed title as the query.
        print(f'Invalid sections for {title}: {plot_sections}')
        print(f'{candidate_titles=}')
        search_titles = wikipedia.search(get_search_title(row.title))
        print(f'{search_titles=}')
        candidate_search_titles = get_best_search_matches(search_titles, candidate_titles)
        print(f'{candidate_search_titles=}')
        if len(candidate_search_titles) == 0:
            print(f'{row.Title} page not found from search; skipping')
            print(f'{candidate_titles=}')
            continue
        # NOTE(review): the first valid page is accepted without checking its
        # sections, so this may select the very page that was just rejected;
        # the re-check below then skips the row.
        for score, title in candidate_search_titles:
            page_py = wiki_wiki.page(title)
            if check_valid_page(page_py):
                print(f'title found: {title}')
                break
        else:
            print(f'{row.Title} page not found from search; skipping')
            continue
    # Re-evaluate the plot sections, since page_py may have been replaced.
    plot_sections = [section.title for section in page_py.sections if plot_regex.match(section.title)]
    if len(plot_sections) != 1:
        print(f'Invalid sections for {title} after search: {plot_sections}')
        continue
    plot_text = page_py.section_by_title(plot_sections[0]).text
    scraped_content[title] = plot_text
    # The MD5 digest serves as a fingerprint of the scraped text.
    text_hash = hashlib.md5(plot_text.encode()).hexdigest()
    scraped_signature[title] = text_hash
    scraped_title[row.Title] = title
    # break
# Report how many distinct pages were scraped, then display the full mapping.
print(len(scraped_signature))
scraped_signature

In [11]:
# Attach the scrape results to the frame, in place:
#   mapped_title <- looked up from the row's Title in scraped_title
#   text_hash    <- looked up from mapped_title in scraped_signature
#   text         <- looked up from mapped_title in scraped_content
# Rows without an entry in the corresponding dict are left as missing values.
# scrape_df was bound to web_scrape_list without a copy in the scraping cell,
# so, unless it has been rebound since, these columns appear on that frame too.
scrape_df.loc[:, 'mapped_title'] = scrape_df['Title'].map(scraped_title)
scrape_df.loc[:, 'text_hash'] = scrape_df['mapped_title'].map(scraped_signature)
scrape_df.loc[:, 'text'] = scrape_df['mapped_title'].map(scraped_content)
# NOTE(review): the output file name is hard-coded, date included, and is
# written relative to the current working directory.
scrape_df.to_parquet('scraped_2024-10-13.parquet')
scrape_df

Unnamed: 0,MovieID,Title,Genres,title,year,mapped_title,text_hash,text
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995,Toy Story (film),d68f6979e62e69e49965edd36cd0dd99,"A group of sentient toys, who pretend to be li..."
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995,Jumanji (1995 film),71cf9481d07ce1019d4bb533323a1e6d,"In 1969, Alan Parrish lives in Brantford, New ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,Grumpier Old Men,3ae338c16ce71400350e56555fe64615,The feud between Max and John has cooled and t...
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995,Waiting to Exhale,cbbb08fcba4ea8e9bbcb7baa7aaca1cc,"Four friends (Savannah, Robin, Bernadine, and ..."
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,Father of the Bride Part II,b1100a9962ec154efae8431defaae021,"Four years after the events of the first film,..."
...,...,...,...,...,...,...,...,...
95,97,"Hate (Haine, La) (1995)",Drama,"Hate (Haine, La)",1995,Hate (1995 film),fd35eae0d45655eae166b21a122d3622,La Haine opens with a montage of news footage ...
96,98,Shopping (1994),Action|Thriller,Shopping,1994,Shopping (1994 film),33d8d20be828ce2dd31cf39049d4dcc4,"Billy is released from prison, but the police ..."
97,99,Heidi Fleiss: Hollywood Madam (1995),Documentary,Heidi Fleiss: Hollywood Madam,1995,Heidi Fleiss: Hollywood Madam,9ae3f2918e5f11f51e65bcfa4ecfe6c3,The documentary begins with news footage from ...
98,100,City Hall (1996),Drama|Thriller,City Hall,1996,City Hall (1996 film),5e858a817598bcb05405fa55191d5e82,The film opens as New York City Mayor John Pap...


In [141]:
# Ad-hoc check: what the search returns for a title that contains a comma.
wikipedia.search('Next Stop, Wonderland')

['Next Stop Wonderland',
 'Brad Anderson (director)',
 'Wonderland Amusement Park (Massachusetts)',
 'Holland Taylor',
 'Sam Seder',
 'Cara Buono',
 'Lawrence Gilliard Jr.',
 'Hope Davis',
 'Self-Reliance',
 'Philip Seymour Hoffman']

In [161]:
# Ad-hoc check: similarity of an "&" vs "and" spelling of the same title.
# The result (0.9) is above the 0.85 default threshold of get_best_search_matches.
SequenceMatcher(None, 'Bedrooms & Hallways', 'Bedrooms and Hallways').ratio()

0.9