## This project is to extract hyperlinks from Movie names from Excel sheet. Then it will scrap Genre of that movie from IMDB website.

In [None]:
import os
import pandas as pd
from tqdm import tqdm
from openpyxl import load_workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [3]:
def loadFile(file_path):
    # Load the workbook and sheet
    wb = load_workbook(file_path)
    ws = wb.active
    
    # Lists to collect data
    names = []
    years = []
    hyperlinks = []
    
    # Read all rows in a specific column (column A)
    for row in ws.iter_rows(min_row=2, max_col=2): 
        title_cell = row[0]  # Column A
        year_cell = row[1]   # Column B
        
        names.append(title_cell.value)
        years.append(year_cell.value)
        
        if title_cell.hyperlink:
            hyperlinks.append(title_cell.hyperlink.target)
        else:
            hyperlinks.append(None)
    
    df = pd.DataFrame({
        "Name": names,
        "Year": years,
        "Link": hyperlinks
        
    })
    print(df.head(5))
    
    return df

In [4]:
# --- Configuration ---
CHECKPOINT_FILE = 'imdb_genres_checkpoint.csv'
FINAL_FILE = 'imdb_genres_complete.csv'
CHECKPOINT_FREQUENCY = 100 #

# --- Load Data and Prepare for Resuming ---
# Check if a checkpoint file exists to resume from
if os.path.exists(CHECKPOINT_FILE):
    print(f"Resuming from checkpoint file: {CHECKPOINT_FILE}")
    df = pd.read_csv(CHECKPOINT_FILE)
else:
    print("No checkpoint file found. Starting from scratch.")
    df= loadFile("Movies.xlsx")

    
    # Add a 'Genres' column if it doesn't exist, filled with None
    if 'Genres' not in df.columns:
        df['Genres'] = None

# Setup Chrome options
options = webdriver.ChromeOptions()
#options.add_argument('--headless') 
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

driver = webdriver.Chrome(options=options)

try:
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Scraping IMDb"):
        # --- Resume Logic: Skip already scraped URLs ---
        # If the 'Genres' column for this row already has a value, skip it.
        if pd.notna(row['Genres']):
            continue

        try:
            url = row['Link']
            driver.get(url)
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.45);")
            WebDriverWait(driver, 15).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'section[data-testid="Storyline"]'))
                )
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'lxml')
            boxes = soup.find("li", attrs={"data-testid": "storyline-genres"}, class_ = "ipc-metadata-list__item ipc-metadata-list__item--align-end")
            df.loc[index, 'Genres'] = f'{boxes.text}'
        except Exception as e:
            df.loc[index, 'Genres'] = f'Error: {str(e)[:50]}'

        # --- Checkpointing Logic ---
        # The `+1` is because index starts at 0. We save on the 10th, 20th, etc. item.
        if (index + 1) % CHECKPOINT_FREQUENCY == 0:
            tqdm.write(f"\n--- Checkpoint: Saving progress to {CHECKPOINT_FILE} at item {index + 1} ---")
            df.to_csv(CHECKPOINT_FILE, index=False)

finally:
    # Quit the driver and save the final complete file
    print("\n--- Scraping finished or interrupted. Closing driver and saving final data. ---")
    driver.quit()
    df.to_csv(FINAL_FILE, index=False)
    print(f"Final data saved to {FINAL_FILE}")

# Display the final result
print("\n--- Final DataFrame ---")
print(df)

No checkpoint file found. Starting from scratch.
                                 Name  Year  \
0                   The Great Escape   1963   
1                             Patton   1970   
2  Star Wars Episode IV - A New Hope   1977   
3                  Empire of the Sun   1987   
4                              Alien   1979   

                                    Link  
0  https://www.imdb.com/title/tt0057115/  
1  https://www.imdb.com/title/tt0066206/  
2  https://www.imdb.com/title/tt0076759/  
3  https://www.imdb.com/title/tt0092965/  
4  https://www.imdb.com/title/tt0078748/  


Scraping IMDb:  10%|██████▎                                                        | 100/995 [06:50<1:00:53,  4.08s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 100 ---


Scraping IMDb:  20%|█████████████                                                    | 200/995 [13:56<48:51,  3.69s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 200 ---


Scraping IMDb:  30%|███████████████████▌                                             | 300/995 [20:48<45:15,  3.91s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 300 ---


Scraping IMDb:  40%|██████████████████████████▏                                      | 400/995 [27:48<42:00,  4.24s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 400 ---


Scraping IMDb:  50%|████████████████████████████████▋                                | 500/995 [34:55<34:03,  4.13s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 500 ---


Scraping IMDb:  60%|███████████████████████████████████████▏                         | 600/995 [41:25<24:26,  3.71s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 600 ---


Scraping IMDb:  70%|█████████████████████████████████████████████▋                   | 700/995 [47:18<17:57,  3.65s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 700 ---


Scraping IMDb:  80%|████████████████████████████████████████████████████▎            | 800/995 [53:27<12:18,  3.79s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 800 ---


Scraping IMDb:  90%|██████████████████████████████████████████████████████████▊      | 900/995 [59:32<05:27,  3.45s/it]


--- Checkpoint: Saving progress to imdb_genres_checkpoint.csv at item 900 ---


Scraping IMDb: 100%|███████████████████████████████████████████████████████████████| 995/995 [1:15:03<00:00,  4.53s/it]



--- Scraping finished or interrupted. Closing driver and saving final data. ---
Final data saved to imdb_genres_complete.csv

--- Final DataFrame ---
                                   Name  Year  \
0                     The Great Escape   1963   
1                               Patton   1970   
2    Star Wars Episode IV - A New Hope   1977   
3                    Empire of the Sun   1987   
4                                Alien   1979   
..                                  ...   ...   
990               Mufasa: The Lion King  2024   
991                              Nr. 24  2024   
992                         The Amateur  2025   
993                         Logan Lucky  2017   
994                             Heretic  2024   

                                       Link  \
0     https://www.imdb.com/title/tt0057115/   
1     https://www.imdb.com/title/tt0066206/   
2     https://www.imdb.com/title/tt0076759/   
3     https://www.imdb.com/title/tt0092965/   
4     https://www.imdb.co

In [5]:
scraped = pd.read_csv("imdb_genres_complete.csv")
scraped.head(10)

Unnamed: 0,Name,Year,Link,Genres
0,The Great Escape,1963,https://www.imdb.com/title/tt0057115/,GenresAdventureDramaThrillerWar
1,Patton,1970,https://www.imdb.com/title/tt0066206/,GenresBiographyDramaWar
2,Star Wars Episode IV - A New Hope,1977,https://www.imdb.com/title/tt0076759/,GenresActionAdventureFantasySci-Fi
3,Empire of the Sun,1987,https://www.imdb.com/title/tt0092965/,GenresDramaWar
4,Alien,1979,https://www.imdb.com/title/tt0078748/,GenresHorrorSci-Fi
5,Apocalypse Now,1979,https://www.imdb.com/title/tt0078788/,GenresDramaMysteryWar
6,Star Wars Episode V - The Empire Strikes Back,1980,https://www.imdb.com/title/tt0080684/,GenresActionAdventureFantasySci-Fi
7,The Thing,1982,https://www.imdb.com/title/tt0084787/?ref_=nv_...,GenresHorrorMysterySci-Fi
8,Star Wars Episode VI - Return Of The Jedi,1983,https://www.imdb.com/title/tt0086190/,GenresActionAdventureFantasySci-Fi
9,Aliens,1986,https://www.imdb.com/title/tt0090605/,GenresActionAdventureHorrorSci-FiThriller
