In [1]:
import pandas as pd

In [2]:
# Autoreload: re-import edited modules automatically before each cell executes,
# so changes to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the users table. The file has no header row and separates fields with '::';
# a multi-character separator is parsed as a regex, which requires engine='python'.
file_path = 'dataset/users.dat'
users_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
users_df.columns = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB
None


In [4]:
# Load the ratings table. The file has no header row and separates fields with '::';
# a multi-character separator is parsed as a regex, which requires engine='python'.
file_path = 'dataset/ratings.dat'
ratings_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
ratings_df.columns = ["UserID", "MovieID", "Rating", "Timestamp"]
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that appended a stray "None" line to the output).
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None


In [7]:
# Load the movies table. The file has no header row and separates fields with '::'.
# It is read with encoding='latin-1' rather than pandas' default UTF-8.
file_path = 'dataset/movies.dat'
movies_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None, encoding='latin-1')
movies_df.columns = ["MovieID", "Title", "Genres"]
# Use the notebook's rich display instead of print() so the preview renders as a table.
display(movies_df.head())

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [6]:
# Show every movie title containing the substring "Mummy".
display(list(filter(lambda movie_title: "Mummy" in movie_title, movies_df["Title"])))

['Mummy, The (1999)',
 'Mummy, The (1932)',
 'Mummy, The (1959)',
 "Mummy's Curse, The (1944)",
 "Mummy's Ghost, The (1944)",
 "Mummy's Hand, The (1940)",
 "Mummy's Tomb, The (1942)"]

## Scraping data

Cases to handle when matching titles:
1. Movies that belong to a series
2. Movies with multiple remakes

In [8]:
from utils.scrape_utils import *
from utils.wiki_scrapper import GenericScrapper

In [9]:
# Build the list of titles to scrape.
# A copy of movies_df is passed so the original frame itself is not handed to the helper.
# NOTE(review): the later merge of web_scrape_list with scrape_list on "title" implies
# get_scrape_list adds a "title" column to web_scrape_list in place — confirm in utils.scrape_utils.
web_scrape_list = movies_df.copy()
scrape_list = get_scrape_list(web_scrape_list)
display(scrape_list.head(15))

Unnamed: 0,title,unique_counts
0,"$1,000,000 Duck",1
1,'Night Mother,1
2,'Til There Was You,1
3,"'burbs, The",1
4,...And Justice for All,1
5,1-900,1
6,10 Things I Hate About You,1
7,101 Dalmatians,2
8,12 Angry Men,1
9,"13th Warrior, The",1


In [10]:
## Initialise the scraper object whose lookup methods are called in the cells below
scrapper = GenericScrapper()

#### Using Wikipedia API

In [11]:
# Example lookup: fetch the sections for one film title. The result is indexed by
# section heading (e.g. "Plot") in the next cell.
sections = scrapper.get_section_dict('King Kong (2005)')

In [12]:
# Show the text stored under the "Plot" heading of the current `sections` lookup.
sections["Plot"]

'In 1933, during the Great Depression, struggling New York City vaudeville performer Ann Darrow is hired by financially troubled filmmaker Carl Denham to star in a film with actor Bruce Baxter. Ann is hesitant to join the picture until she learns her favorite playwright, Jack Driscoll, is the screenwriter. Filming takes place on the SS Venture, under Captain Englehorn, and under Carl\'s pretense it will be sailing to Singapore. In truth, Carl intends to film the mysterious Skull Island. Captain Englehorn reconsiders the voyage, prompted by his crew\'s speculation of trouble ahead. During the voyage, Ann and Jack fall in love.\nThe Venture receives a radio message informing Englehorn there is a warrant for Carl\'s arrest due to his defiance of the studio\'s orders to cease production, and instructing Englehorn to divert to Rangoon, but the ship becomes lost in fog and runs aground on Skull Island. Carl and others, including his film crew consisting of cameraman Herb, assistant Preston a

In [47]:
## Generic way to scrape and add context
# For every movie, look up its page sections and store the plot text in
# search_df['Plot']. Titles that could not be resolved are collected in error_list.
search_df = web_scrape_list.merge(scrape_list, on="title")
cast = "Cast"  # section heading reserved for the (currently disabled) cast step below
error_list = []


def _plot_section_key(section_dict):
    """Return the first section heading containing "plot" (case-insensitive), or None."""
    return next((key for key in section_dict if "plot" in key.lower()), None)


for index, row in search_df.iterrows():
    try:
        # Explicit key access avoids confusing the two similarly named columns
        # ("Title" includes the year, "title" does not).
        if row["unique_counts"] > 1:
            # Search with the date
            sections = scrapper.get_section_dict(row["Title"])
        else:
            # Search without date
            sections = scrapper.get_section_dict(row["title"])

        # Check if a plot section exists; if not, search again with the
        # "(<year> film)" form of the title and re-check the new result.
        plot_key = _plot_section_key(sections)
        if plot_key is None:
            sections = scrapper.get_section_dict(row["Title"].replace(")", " film)"))
            plot_key = _plot_section_key(sections)

        if plot_key is not None:
            # Read the heading that was actually matched rather than a hard-coded
            # 'Plot', which raised KeyError for headings such as "Plot summary".
            search_df.at[index, 'Plot'] = sections[plot_key]
        else:
            # Record the miss instead of silently leaving the row empty.
            error_list.append(row["Title"])
            print(f"ERROR: no plot section found for {row['Title']}")

        ####################### To add in Cast #######################
        # TODO: re-enable once the plot step is stable.
        # # Animation vs Non-animation
        # voice_cast = [elem for elem in sections.keys() if "voice cast" in elem.lower()]
        # if voice_cast:
        #     cast = voice_cast[0]
        # else:
        #     cast = "Cast"

        # search_df.at[index, 'Cast'] = sections[cast]

    except Exception as e:
        # Best-effort scrape: keep going, but remember which title failed.
        error_list.append(row["Title"])
        print(f"ERROR: {e}")
        

In [46]:
# Debug: show the title held by `row`, the loop variable left over from the scraping loop.
# This only works after that loop has run (or been interrupted) in the same kernel.
display(row.Title)

'Cry, the Beloved Country (1995)'

In [43]:
# Manual check of the "(<year> film)" title form that the scraping loop uses as its retry.
scrapper.get_section_dict("Heat (1995 film)")

{'Plot': "Neil McCauley is a professional thief based in Los Angeles. He and his crew – right-hand man Chris Shiherlis, enforcer Michael Cheritto, driver Gilbert Trejo, and newly hired hand Waingro – rob $1.6 million in bearer bonds from an armored car. During the heist, Waingro kills a guard without provocation, forcing the crew to eliminate the other two guards. Later, McCauley prepares to kill Waingro in retaliation for the deaths of the guards, but he escapes.\nLos Angeles Police Department (LAPD) Lieutenant Vincent Hanna and his team investigate the robbery. Hanna, a dedicated lawman and former Marine, has a strained relationship with his third wife Justine, and struggles to connect with his stepdaughter, Lauren. McCauley, who lives a solitary life, begins a relationship with Eady, a graphic designer. They bond over their mutual isolation from society, and, claiming to be a metalworker, McCauley asks her to emigrate to New Zealand with him.\nMcCauley's fence, Nate, suggests he sel

In [34]:
# Debug: list the section headings held in `sections` (whatever lookup assigned it last),
# to see whether a plot-like heading is present.
display(sections.keys())

dict_keys(['Notation and units', 'History', 'Heat transfer', 'Latent and sensible heat', 'Heat capacity', '"Hotness"', 'Classical thermodynamics', 'See also', 'References', 'External links'])

In [30]:
# Show the merged frame including the scraped columns; the rendered output elides the middle rows.
display(search_df)

Unnamed: 0,MovieID,Title,Genres,title,year,unique_counts,Plot,Cast
0,1,Toy Story (1995),Animation|Children's|Comedy,Toy Story,1995,1,"A group of sentient toys, who pretend to be li...","Tom Hanks as Woody, a pullstring cowboy doll w..."
1,2,Jumanji (1995),Adventure|Children's|Fantasy,Jumanji,1995,1,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men,1995,1,,
3,4,Waiting to Exhale (1995),Comedy|Drama,Waiting to Exhale,1995,1,,
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II,1995,1,,
...,...,...,...,...,...,...,...,...
3877,3948,Meet the Parents (2000),Comedy,Meet the Parents,2000,1,,
3878,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream,2000,1,,
3879,3950,Tigerland (2000),Drama,Tigerland,2000,1,,
3880,3951,Two Family House (2000),Drama,Two Family House,2000,1,,


#### MediaWiki API (wptools)

In [259]:
# Example lookup that passes the release year alongside the title.
# NOTE(review): presumably the year is checked against the page's film date — confirm in GenericScrapper.
scrapper.get_page_summary_w_release_date('King Kong (2005)',"2005")

 King Kong (2005) 2005 :: {{Film date|2005|12|05|[[New York City]]|2005|12|13|New Zealand|2005|12|14|United States}} Found 


'King Kong is a 2005 epic adventure monster film co-written, produced, and directed by Peter Jackson. It is the ninth entry in the King Kong franchise and the second remake of the 1933 film of the same title, the first being the 1976 remake. The film stars Naomi Watts, Jack Black, and Adrien Brody. Set in 1933, it follows the story of an ambitious filmmaker who coerces his cast and hired ship crew to travel to mysterious Skull Island. There they encounter prehistoric creatures and a legendary giant gorilla known as Kong, whom they capture and take to New York City.Development began in early 1995, when Universal Pictures approached Jackson to direct the remake of the original 1933 film. The project stalled in early 1997, as several ape and giant monster-related films were under production at the time and Jackson planned to direct The Lord of the Rings film series. As the first two films in the Rings trilogy became commercially successful, Universal went back to Jackson in early 2003, ex