In [27]:
import pandas as pd
import re

In [None]:
# Download wikipedia dump from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
!wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2

--2025-02-18 20:06:31--  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream.xml.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:3:208:80:154:71, 208.80.154.71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:3:208:80:154:71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24642610651 (23G) [application/octet-stream]
Saving to: ‘enwiki-latest-pages-articles-multistream.xml.bz2’


2025-02-18 21:32:52 (4.54 MB/s) - ‘enwiki-latest-pages-articles-multistream.xml.bz2’ saved [24642610651/24642610651]



In [None]:
# Download index file from https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
# This file is used to find the byte offset of each article in the dump
!wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2

--2025-02-18 20:00:23--  https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles-multistream-index.txt.bz2
Resolving dumps.wikimedia.org (dumps.wikimedia.org)... 2620:0:861:3:208:80:154:71, 208.80.154.71
Connecting to dumps.wikimedia.org (dumps.wikimedia.org)|2620:0:861:3:208:80:154:71|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 267766383 (255M) [application/octet-stream]
Saving to: ‘enwiki-latest-pages-articles-multistream-index.txt.bz2’


2025-02-18 20:01:19 (4.54 MB/s) - ‘enwiki-latest-pages-articles-multistream-index.txt.bz2’ saved [267766383/267766383]



In [7]:
# Extract wikipedia index file
#!bzip2 -d enwiki-latest-pages-articles-multistream-index.txt.bz2

# Parse wikipedia index file to extract movie indices
%run -i ../tools/parse_movie_index.py --index_file enwiki-latest-pages-articles-multistream-index.txt --out_file movie_index.txt

In [None]:
# Extract movie articles from wikipedia dump and create a csv file
%run -i ../tools/process_wiki_dump.py --dump_file enwiki-latest-pages-articles-multistream.xml.bz2 --index_file movie_index.txt --out_file ../download/movies.csv

In [77]:
# read movies database
movie_db = pd.read_csv('movies.csv').rename(columns=lambda x: x.strip().replace(" ", ""))


In [78]:
# print sample of movies database
movie_db.head()

Unnamed: 0,id,title,cast,plot,poster
0,3947,Blue Velvet (film),{{cast listing|<!-- Cast is in credits order ...,<!-- Per WP:FILMPLOT plot summaries for featu...,https://upload.wikimedia.org/wikipedia/en/f/fd...
1,4231,Buffy the Vampire Slayer (film),{{Cast listing|* [[Kristy Swanson]] as [[Buff...,[[Buffy Summers]] is a cheerleader at Hemery H...,https://upload.wikimedia.org/wikipedia/en/0/09...
2,4729,Batman &amp; Robin (film),{{div col}}* [[Arnold Schwarzenegger]] as [[V...,<!-- Per WP:FILMPLOT plot summaries for featu...,https://upload.wikimedia.org/wikipedia/en/3/37...
3,11585,Show Me Love (film),* [[Alexandra Dahlström]] as Elin Olsson* [[R...,Two girls Agnes and Elin attend school in th...,https://upload.wikimedia.org/wikipedia/en/9/96...
4,19055,Manufacturing Consent (film),,The film presents and illustrates Chomsky and ...,https://upload.wikimedia.org/wikipedia/en/1/11...


In [79]:
# print information about movies database
movie_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16299 entries, 0 to 16298
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      16299 non-null  int64 
 1   title   16299 non-null  object
 2   cast    16299 non-null  object
 3   plot    16288 non-null  object
 4   poster  16299 non-null  object
dtypes: int64(1), object(4)
memory usage: 636.8+ KB


In [86]:
def clean_column(text):
    """Removes Wikipedia markup, unnecessary tags, and extracts names."""
    if pd.isna(text) or text.lower() == "none":
        return "Unknown"
    
    # Remove Wikipedia templates, comments, and unnecessary symbols
    text = re.sub(r"\{\{.*?\}\}", "", text)  # Remove Wikipedia template tags
    text = re.sub(r"<!--.*?-->", "", text)  # Remove HTML comments
    text = re.sub(r"\[\[(?:[^\]|]*\|)?([^\]]+)\]\]", r"\1", text)  # Extract names from Wikipedia links
    text = re.sub(r"\*", "", text)  # Remove asterisks used for listing
    
    return text.strip()


In [87]:
movie_db.dropna(inplace=True)

In [88]:
# Remove markup from cast and plot column
movie_db["cast"] = movie_db["cast"].apply(clean_column)
movie_db["plot"] = movie_db["plot"].apply(clean_column)


In [90]:
# Print clean data
movie_db

Unnamed: 0,id,title,cast,plot,poster
0,3947,Blue Velvet (film),,College student Jeffrey Beaumont returns to hi...,https://upload.wikimedia.org/wikipedia/en/f/fd...
1,4231,Buffy the Vampire Slayer (film),Appearing in uncredited roles are Ben Affleck ...,Buffy Summers is a cheerleader at Hemery High ...,https://upload.wikimedia.org/wikipedia/en/0/09...
2,4729,Batman &amp; Robin (film),Arnold Schwarzenegger as Dr. Victor Fries / Mr...,Batman and his partner Robin encounter a new...,https://upload.wikimedia.org/wikipedia/en/3/37...
3,11585,Show Me Love (film),Alexandra Dahlström as Elin Olsson Rebecka Lil...,Two girls Agnes and Elin attend school in th...,https://upload.wikimedia.org/wikipedia/en/9/96...
4,19055,Manufacturing Consent (film),Unknown,The film presents and illustrates Chomsky and ...,https://upload.wikimedia.org/wikipedia/en/1/11...
...,...,...,...,...,...
16293,79040652,Late Shift (film),Leonie Benesch as Floria Sonja Riesen as Bea S...,Floria a devoted surgical nurse in Switzerlan...,https://upload.wikimedia.org/wikipedia/en/e/e9...
16294,79045397,Ithaqua (film),Luke Hemsworth Kevin Durand Michael Pitt Craig...,Survivors in a remote outpost in 19th century ...,https://upload.wikimedia.org/wikipedia/en/d/df...
16295,79045789,DraftThe Bitter End (film),Joan Collins as Wallis Simpson,The film documents the later years in the life...,https://upload.wikimedia.org/wikipedia/en/d/df...
16296,79057912,Vari (film),Ahmed Sharumeel as Abeeru Ashiyath Zaheena as ...,Abeeru a dedicated yet emotionally distant me...,https://upload.wikimedia.org/wikipedia/en/0/07...


In [98]:
# print cast in sample row
print(movie_db["cast"].iloc[1])

Appearing in uncredited roles are Ben Affleck as a basketball player  Ricki Lake as Charlotte  Seth Green as a vampire  and Alexis Arquette as the vampire DJ.


In [115]:
def extract_actors(text):
    """Extract actors from the cast column."""
    if pd.isna(text) or text.lower() == "none" or text.lower() == "unknown":
        return ""
    actors = ", ".join(re.findall(r"\b([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)\s+as", text))
    if actors == "":
        return text
    else:
        return actors

In [116]:
# Get the actors list
movie_db["actors"] = movie_db["cast"].apply(extract_actors)

In [117]:
movie_db

Unnamed: 0,id,title,cast,plot,poster,actors
0,3947,Blue Velvet (film),,College student Jeffrey Beaumont returns to hi...,https://upload.wikimedia.org/wikipedia/en/f/fd...,
1,4231,Buffy the Vampire Slayer (film),Appearing in uncredited roles are Ben Affleck ...,Buffy Summers is a cheerleader at Hemery High ...,https://upload.wikimedia.org/wikipedia/en/0/09...,"Ben Affleck, Ricki Lake, Seth Green, Alexis Ar..."
2,4729,Batman &amp; Robin (film),Arnold Schwarzenegger as Dr. Victor Fries / Mr...,Batman and his partner Robin encounter a new...,https://upload.wikimedia.org/wikipedia/en/3/37...,"Arnold Schwarzenegger, George Clooney, Eric Ll..."
3,11585,Show Me Love (film),Alexandra Dahlström as Elin Olsson Rebecka Lil...,Two girls Agnes and Elin attend school in th...,https://upload.wikimedia.org/wikipedia/en/9/96...,"Elin Olsson Rebecka Liljeberg, Agnes Ahlberg E..."
4,19055,Manufacturing Consent (film),Unknown,The film presents and illustrates Chomsky and ...,https://upload.wikimedia.org/wikipedia/en/1/11...,
...,...,...,...,...,...,...
16293,79040652,Late Shift (film),Leonie Benesch as Floria Sonja Riesen as Bea S...,Floria a devoted surgical nurse in Switzerlan...,https://upload.wikimedia.org/wikipedia/en/e/e9...,"Leonie Benesch, Floria Sonja Riesen, Bea Schmi..."
16294,79045397,Ithaqua (film),Luke Hemsworth Kevin Durand Michael Pitt Craig...,Survivors in a remote outpost in 19th century ...,https://upload.wikimedia.org/wikipedia/en/d/df...,Luke Hemsworth Kevin Durand Michael Pitt Craig...
16295,79045789,DraftThe Bitter End (film),Joan Collins as Wallis Simpson,The film documents the later years in the life...,https://upload.wikimedia.org/wikipedia/en/d/df...,Joan Collins
16296,79057912,Vari (film),Ahmed Sharumeel as Abeeru Ashiyath Zaheena as ...,Abeeru a dedicated yet emotionally distant me...,https://upload.wikimedia.org/wikipedia/en/0/07...,"Ahmed Sharumeel, Abeeru Ashiyath Zaheena, Shar..."


In [137]:
# Experiment 1: TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [138]:
# Step 1: Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(
    stop_words='english',  # Remove common English words (e.g., "the", "a")
    max_df=0.85,           # Ignore terms that appear in >85% of documents
    min_df=1,              # Include terms that appear in at least 1 document
)

In [140]:
# Step 2: Transform movie plots into TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_db['plot'])

In [163]:
# Step 3: Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [168]:
# Step 4: Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, movies=movie_db):
    # Get the index of the movie that matches the title
    idx = movie_db.index[movie_db['title'].str.contains(rf"\b{title}\b")].to_list()[0]
    
    # Get similarity scores for all movies with the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies based on similarity scores (descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the top 3 most similar movies (excluding the movie itself)
    sim_scores = sim_scores[1:6]
    
    # Get movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the titles of recommended movies
    return movie_db['title'].iloc[movie_indices]

In [169]:
# Test the recommendation system
movie_title = 'Titanic'
recommendations = get_recommendations(movie_title)
print(f"Recommendations for '{movie_title}':")
print(recommendations)

Recommendations for 'Titanic':
10956    If Ever I See You Again (film)
2328                     My Life (film)
12205             Superpowerless (film)
7234                  Mary  Mary (film)
1461             White Christmas (film)
Name: title, dtype: object
