In [1]:
import pandas as pd

In [35]:
# Autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the user table. Fields are separated by '::' (MovieLens format; use ',' for
# CSV or '\t' for tab-separated files). A multi-character separator is treated as
# a regular expression by pandas, which is why engine='python' is requested.
file_path = 'dataset/users.dat'
users_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
users_df.columns = ["UserID", "Gender", "Age", "Occupation", "Zip-code"]
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
users_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040 entries, 0 to 6039
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   UserID      6040 non-null   int64 
 1   Gender      6040 non-null   object
 2   Age         6040 non-null   int64 
 3   Occupation  6040 non-null   int64 
 4   Zip-code    6040 non-null   object
dtypes: int64(3), object(2)
memory usage: 236.1+ KB
None


In [4]:
# Load the ratings table ('::'-separated, no header row in the file).
file_path = 'dataset/ratings.dat'
ratings_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None)
ratings_df.columns = ["UserID", "MovieID", "Rating", "Timestamp"]
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB
None


In [5]:
# Load the movie table ('::'-separated, no header row in the file).
# The file is decoded as 'latin-1' (ISO-8859-1) rather than the default encoding.
file_path = 'dataset/movies.dat'
movies_df = pd.read_csv(file_path, delimiter='::', engine='python', header=None, encoding='latin-1')
movies_df.columns = ["MovieID", "Title", "Genres"]
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of print()'s plain-text dump.
movies_df.head()

   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy


In [6]:
# Show every movie title that contains the substring "Mummy".
display([movie_title for movie_title in movies_df.Title if "Mummy" in movie_title])

['Mummy, The (1999)',
 'Mummy, The (1932)',
 'Mummy, The (1959)',
 "Mummy's Curse, The (1944)",
 "Mummy's Ghost, The (1944)",
 "Mummy's Hand, The (1940)",
 "Mummy's Tomb, The (1942)"]

## Scraping data

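Cases that need care when matching a title to the right page:
1. Movies that belong to a series (several related titles, e.g. the "Mummy's ..." films)
2. Movies with multiple remakes (same title, different release years)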

In [37]:
from utils.scrape_utils import *
from utils.wiki_scrapper import GenericScrapper

In [38]:
# Getting Items to scrape
web_scrape_list = movies_df.copy()
scrape_list = get_scrape_list(web_scrape_list)
len(scrape_list['title'].unique())

3840

In [39]:
## Initialise the scraper instance shared by the lookup cells below
scrapper = GenericScrapper()

#### Using Wikipedia API

In [40]:
# Example: fetch the sections of a single page; the result is indexed by
# section heading (e.g. "Plot") in the cells that follow.
sections = scrapper.get_section_dict('King Kong (2005)')

In [41]:
# Show the text stored under the "Plot" heading of the page fetched above.
sections["Plot"]

'In 1933, during the Great Depression, struggling New York City vaudeville performer Ann Darrow is hired by financially troubled filmmaker Carl Denham to star in a film with actor Bruce Baxter. Ann is hesitant to join the picture until she learns her favorite playwright, Jack Driscoll, is the screenwriter. Filming takes place on the SS Venture, under Captain Englehorn, and under Carl\'s pretense it will be sailing to Singapore. In truth, Carl intends to film the mysterious Skull Island. Captain Englehorn reconsiders the voyage, prompted by his crew\'s speculation of trouble ahead. During the voyage, Ann and Jack fall in love.\nThe Venture receives a radio message informing Englehorn there is a warrant for Carl\'s arrest due to his defiance of the studio\'s orders to cease production, and instructing Englehorn to divert to Rangoon, but the ship becomes lost in fog and runs aground on Skull Island. Carl and others, including his film crew consisting of cameraman Herb, assistant Preston a

In [78]:
## Generic way to scrape and add context
def _extract_plot(section_dict):
    """Return the text of the first section whose heading contains "plot".

    The match is case-insensitive. Returns None when ``section_dict`` is
    empty/None or has no such heading.
    """
    for heading, text in (section_dict or {}).items():
        if "plot" in heading.lower():
            return text
    return None


search_df = web_scrape_list.merge(scrape_list, on="title")
cast = "Cast"
error_list = []  # titles for which the search or the scrape raised
for index, row in search_df.iterrows():
    # title, _ = get_most_similar_title(row.Title)
    try:
        title = search_wikipedia(row.Title)
    except Exception as e:
        # Skip this row. Without a fresh search result `title` would still hold
        # the previous row's value (or be undefined on the first row), and the
        # wrong page's plot would be stored against this movie.
        error_list.append(row.Title)
        print(f"ERROR: {e} for {row.Title}")
        continue
    try:
        # TODO: titles with unique_counts > 1 were meant to be looked up together
        # with their release date; for now every title uses the same lookup.
        sections = scrapper.get_section_dict(title)
        if not sections:
            print(f"No sections found: {title}")

        # Check if a plot section exists
        plot = _extract_plot(sections)
        if plot is None:
            # Search again with the "(<year> film)" form of the original title
            # and actually use the result of that second lookup.
            sections = scrapper.get_section_dict(row.Title.replace(")", " film)"))
            plot = _extract_plot(sections)
        if plot is not None:
            # Add DATA (the 'Plot' column is created on first assignment)
            search_df.at[index, 'Plot'] = plot

        ####################### To add in Cast #######################
        # # Animation vs Non-animation
        # voice_cast = [elem for elem in sections.keys() if "voice cast" in elem.lower()]
        # if voice_cast:
        #     cast = voice_cast[0]
        # else:
        #     cast = "Cast"   

        # search_df.at[index, 'Cast'] = sections[cast]

    except Exception as e:
        error_list.append(row.Title)
        print(f"ERROR: {e} for {title}")


ERROR: 'NoneType' object is not subscriptable for Star Maker, The (Uomo delle stelle, L') (1995)
No sections found: To Live
No sections found: Theodore Rex
ERROR: 'NoneType' object is not subscriptable for Under the Domin Tree (Etz Hadomim Tafus) (1994)
ERROR: 'NoneType' object is not subscriptable for Vie est belle, La (Life is Rosey) (1987)
ERROR: 'NoneType' object is not subscriptable for Day the Sun Turned Cold, The (Tianguo niezi) (1994)
No sections found: Girls Town
ERROR: 'NoneType' object is not subscriptable for Tashunga (1995)
No sections found: Love in Bloom
No sections found: The English Patient (disambiguation)
No sections found: Down by Law
ERROR: 'NoneType' object is not subscriptable for Jungle2Jungle (a.k.a. Jungle 2 Jungle) (1997)
No sections found: Full Speed
No sections found: Midaq Alley
ERROR: 'NoneType' object is not subscriptable for King Kong vs. Godzilla (Kingukongu tai Gojira) (1962)
No sections found: Prancer
No sections found: Planète Sauvage
No sections fo

In [80]:
# Persist the scraped frame to CSV in the working directory; with to_csv's
# defaults the row index is written as the first (unnamed) column.
search_df.to_csv('scrapped_data.csv')

In [74]:
# Spot-check what the title search returns for a single title.
search_wikipedia('French Twist')

['French Twist', 'French Twist (film)', 'Twist', "Let's Twist Again"]

In [19]:
# Inspect the section headings held by whatever `sections` was assigned last.
display(sections.keys())

dict_keys(['Plot', 'Voice cast', 'Production', 'Release', 'Reception', 'Influence and legacy', 'Sequels and spin-off', 'See also', 'Notes', 'References', 'Bibliography', 'External links'])

In [71]:
# Print every row whose Title is NaN: NaN is the value that does not compare
# equal to itself, so rows that pass the equality check are skipped.
for index, row in search_df.iterrows():
    title_value = row['Title']
    if title_value == title_value:
        continue
    print(row)

#### WikiMediaAPI (wptools)

In [259]:
# Example: fetch a page summary for one title.
# NOTE(review): the second argument looks like the release year passed as a
# string — confirm against GenericScrapper.get_page_summary_w_release_date.
scrapper.get_page_summary_w_release_date('King Kong (2005)',"2005")

 King Kong (2005) 2005 :: {{Film date|2005|12|05|[[New York City]]|2005|12|13|New Zealand|2005|12|14|United States}} Found 


'King Kong is a 2005 epic adventure monster film co-written, produced, and directed by Peter Jackson. It is the ninth entry in the King Kong franchise and the second remake of the 1933 film of the same title, the first being the 1976 remake. The film stars Naomi Watts, Jack Black, and Adrien Brody. Set in 1933, it follows the story of an ambitious filmmaker who coerces his cast and hired ship crew to travel to mysterious Skull Island. There they encounter prehistoric creatures and a legendary giant gorilla known as Kong, whom they capture and take to New York City.Development began in early 1995, when Universal Pictures approached Jackson to direct the remake of the original 1933 film. The project stalled in early 1997, as several ape and giant monster-related films were under production at the time and Jackson planned to direct The Lord of the Rings film series. As the first two films in the Rings trilogy became commercially successful, Universal went back to Jackson in early 2003, ex