In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy corpus: one record per sentence, keyed by an integer id
df = pd.DataFrame(
    [
        {'id': 1, 'text': "The quick brown fox jumps over the lazy dog."},
        {'id': 2, 'text': "Never jump over the lazy dog quickly."},
        {'id': 3, 'text': "A fast brown fox leaps over a sleepy dog."},
    ]
)

# Build the TF-IDF index: fit the vectorizer on the sample sentences and
# keep the resulting document-term matrix (one row per sentence) for querying.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['text'].tolist())

def query_vector_db(query, top_k=2):
    """Return the rows of ``df`` whose text is most similar to ``query``.

    The query is transformed with the module-level ``vectorizer`` and scored
    by cosine similarity against every row of ``tfidf_matrix``.

    Parameters
    ----------
    query : str
        Free-text query.
    top_k : int or None, default 2
        Maximum number of rows to return. ``None`` returns every row,
        ranked. Must not be negative.

    Returns
    -------
    pandas.DataFrame
        The ``id`` and ``text`` columns of the matching rows, highest
        similarity first. Rows with equal similarity keep their original
        order in ``df``.

    Raises
    ------
    ValueError
        If ``top_k`` is negative (a negative slice bound would silently
        drop rows from the end of the ranking instead of limiting it).
    """
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
    # Stable sort on the negated scores gives descending similarity while
    # keeping tied documents in their original row order; reversing a default
    # argsort leaves the order of ties unspecified.
    top_indices = (-similarities).argsort(kind="stable")[:top_k]
    return df.iloc[top_indices][['id', 'text']]

# Demo lookup against the sample sentences (default top_k)
result = query_vector_db(query="fox jumps over dog")
print(result)

   id                                          text
0   1  The quick brown fox jumps over the lazy dog.
2   3     A fast brown fox leaps over a sleepy dog.


In [None]:
# Example movie catalogue: id, title, plot description and credits,
# declared in a single constructor call (column order is the dict order).
movies_df = pd.DataFrame({
    'id': [101, 102, 103],
    'title': ["The Matrix", "Inception", "Interstellar"],
    'description': [
        """In a dystopian future, Neo, a computer hacker, discovers that the world he lives in is a simulated reality called the Matrix, created by intelligent machines to subdue the human population while their bodies' heat and electrical activity are used as an energy source. With the help of Morpheus and Trinity, Neo learns the truth about his existence and joins a rebellion against the machines, facing mind-bending challenges and ultimately embracing his destiny as 'The One' who can manipulate the Matrix and free humanity.""",
        """Dom Cobb is a skilled thief, the absolute best in the dangerous art of extraction: stealing valuable secrets from deep within the subconscious during the dream state. Cobb is offered a chance to have his criminal history erased as payment for the implantation of another person's idea into a target's subconscious. As Cobb and his team of specialists plan and execute the perfect crime, they must navigate a labyrinth of dreams within dreams, all while Cobb is haunted by memories of his past and the specter of his late wife.""",
        """In the near future, Earth is plagued by drought and famine, causing a scarcity of food and drastic climate change. Cooper, a former NASA pilot, is recruited for a daring mission: to travel through a newly discovered wormhole near Saturn in search of a new habitable planet for humanity. Along with a team of scientists, Cooper journeys across galaxies, facing the relativity of time, the power of love, and the unknown dangers of alien worlds, all in a desperate bid to save his family and the human race.""",
    ],
    'genre': ['Sci-Fi'] * 3,
    'director': [
        'Lana Wachowski, Lilly Wachowski',
        'Christopher Nolan',
        'Christopher Nolan',
    ],
    'actors': [
        'Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss',
        'Leonardo DiCaprio, Joseph Gordon-Levitt, Ellen Page',
        'Matthew McConaughey, Anne Hathaway, Jessica Chastain',
    ],
})


In [9]:
# Two more titles, written one record per movie
additional_movies = pd.DataFrame(
    [
        {
            'id': 104,
            'title': "Blade Runner",
            'description': "In a future where synthetic humans known as replicants are bio-engineered by the powerful Tyrell Corporation to work on off-world colonies, Rick Deckard, a burnt-out cop, is reluctantly assigned to eliminate a group of escaped replicants hiding in Los Angeles.",
            'genre': "Sci-Fi",
            'director': "Ridley Scott",
            'actors': "Harrison Ford, Rutger Hauer, Sean Young",
        },
        {
            'id': 105,
            'title': "Arrival",
            'description': "A linguist and a physicist are recruited by the military to communicate with alien lifeforms after twelve mysterious spacecraft appear around the world. As they learn to communicate, they experience profound changes in perception and understanding.",
            'genre': "Sci-Fi",
            'director': "Denis Villeneuve",
            'actors': "Amy Adams, Jeremy Renner, Forest Whitaker",
        },
    ]
)

# Free-text metadata for the new rows: "<genre> <director> <actors>"
additional_movies['metadata'] = additional_movies['genre'].str.cat(
    [additional_movies['director'], additional_movies['actors']],
    sep=' ',
)

# Append the new titles to movies_df.
# Rows whose id is already present are dropped (the existing row wins), so
# re-running this cell does not stack duplicate copies of the new movies.
movies_df = (
    pd.concat([movies_df, additional_movies], ignore_index=True)
    .drop_duplicates(subset='id', keep='first')
    .reset_index(drop=True)
)

In [13]:

# Build one free-text metadata string per movie: "<genre> <director> <actors>"
movies_df['metadata'] = movies_df['genre'].str.cat(
    [movies_df['director'], movies_df['actors']],
    sep=' ',
)

# Index the metadata strings (not the plot descriptions) with TF-IDF
movie_vectorizer = TfidfVectorizer()
movie_tfidf_matrix = movie_vectorizer.fit_transform(movies_df['metadata'])

In [14]:
# Rank every movie by the cosine similarity between its TF-IDF metadata
# vector and a free-text query (no top-k cut-off: all rows are listed).
query = "Ridley Scott films"
query_vec = movie_vectorizer.transform([query])
similarities = cosine_similarity(query_vec, movie_tfidf_matrix).ravel()
top_indices = similarities.argsort()[::-1]  # ascending argsort, reversed
queried_movies = movies_df[['id', 'title', 'metadata']].iloc[top_indices]
print(queried_movies)

    id         title                                           metadata
3  104  Blade Runner  Sci-Fi Ridley Scott Harrison Ford, Rutger Haue...
4  105       Arrival  Sci-Fi Denis Villeneuve Amy Adams, Jeremy Renn...
2  103  Interstellar  Sci-Fi Christopher Nolan Matthew McConaughey, ...
1  102     Inception  Sci-Fi Christopher Nolan Leonardo DiCaprio, Jo...
0  101    The Matrix  Sci-Fi Lana Wachowski, Lilly Wachowski Keanu R...


In [17]:
import numpy as np

# NOTE(review): `encode_movie_metadata` is not defined in any cell visible in
# this notebook; unless it is defined elsewhere, this cell would raise
# NameError on a fresh kernel — confirm where the helper lives.

# Encode all additional_movies metadata
# Each row of additional_movies is passed to the helper (axis=1) and the
# per-row results are stacked vertically into one 2-D array.
additional_encoded = np.vstack(additional_movies.apply(encode_movie_metadata, axis=1))
additional_query = "Ridley Scott"

# Encode the query using the same method (treating it as director for best match)
# NOTE(review): the query is a plain dict while the rows above are pandas
# Series; presumably the helper only does key lookups on
# 'genre'/'director'/'actors' — confirm.
query_row = {
    'genre': '',
    'director': additional_query,
    'actors': ''
}
# Reshape to a single-row 2-D array, as cosine_similarity compares row sets.
query_vec = encode_movie_metadata(query_row).reshape(1, -1)

# Compute cosine similarity
similarities = cosine_similarity(query_vec, additional_encoded).flatten()
# Ascending argsort reversed: most similar row first.
top_indices = similarities.argsort()[::-1]
queried_movies = additional_movies.iloc[top_indices][['id', 'title', 'metadata']]
print(queried_movies)

    id         title                                           metadata
0  104  Blade Runner  Sci-Fi Ridley Scott Harrison Ford, Rutger Haue...
1  105       Arrival  Sci-Fi Denis Villeneuve Amy Adams, Jeremy Renn...


In [19]:
# Load the cleaned Wikipedia movie-plots CSV from the parent directory.
# The path uses a forward slash: '..\w' is an invalid escape sequence in a
# regular string literal (recent Python versions warn about it), and a
# backslash separator only resolves on Windows, whereas '/' works everywhere.
wiki_movies_df = pd.read_csv('../wiki_movie_plots_deduped_cleaned.csv')
print(wiki_movies_df.head())


   Unnamed: 0  Release Year                             Title  \
0           0          1901            Kansas Saloon Smashers   
1           1          1901     Love by the Light of the Moon   
2           2          1901           The Martyred Presidents   
3           3          1901  Terrible Teddy, the Grizzly King   
4           4          1902            Jack and the Beanstalk   

  Origin/Ethnicity                            Director Cast    Genre  \
0         American                             Unknown  NaN  unknown   
1         American                             Unknown  NaN  unknown   
2         American                             Unknown  NaN  unknown   
3         American                             Unknown  NaN  unknown   
4         American  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https

In [21]:
# List the column names of the loaded frame
wiki_movies_df.columns

Index(['Unnamed: 0', 'Release Year', 'Title', 'Origin/Ethnicity', 'Director',
       'Cast', 'Genre', 'Wiki Page', 'Plot', 'genres_list', 'Genre_list',
       'Cast_list', 'genre_list_raw', 'genre_list', 'plot_clean',
       'plot_lemmatized'],
      dtype='object')

In [22]:
# Keep only the seven columns of interest (rebinds wiki_movies_df)
wiki_movies_df = wiki_movies_df.loc[
    :,
    [
        'Title',
        'Release Year',
        'Origin/Ethnicity',
        'Director',
        'Cast_list',
        'genres_list',
        'plot_clean',
    ],
]

In [23]:
# Show the trimmed frame as the cell output
wiki_movies_df

Unnamed: 0,Title,Release Year,Origin/Ethnicity,Director,Cast_list,genres_list,plot_clean
0,Kansas Saloon Smashers,1901,American,Unknown,['nan'],['unknown'],a bartender is working at a saloon serving dri...
1,Love by the Light of the Moon,1901,American,Unknown,['nan'],['unknown'],the moon painted with a smiling face hangs ove...
2,The Martyred Presidents,1901,American,Unknown,['nan'],['unknown'],the film just over a minute long is composed o...
3,"Terrible Teddy, the Grizzly King",1901,American,Unknown,['nan'],['unknown'],lasting just seconds and consisting of two sho...
4,Jack and the Beanstalk,1902,American,"George S. Fleming, Edwin S. Porter",['nan'],['unknown'],the earliest known adaptation of the classic f...
...,...,...,...,...,...,...,...
34881,The Water Diviner,2014,Turkish,Director: Russell Crowe,['Director: Russell Crowe\r\nCast: Russell Cro...,['unknown'],the film begins in just after world war i has ...
34882,Çalgı Çengi İkimiz,2017,Turkish,Selçuk Aydemir,"['Ahmet Kural', 'Murat Cemcir']",['comedy'],two musicians salih and grkan described the ad...
34883,Olanlar Oldu,2017,Turkish,Hakan Algül,"['Ata Demirer', 'Tuvana Türkay', 'Ülkü Duru']",['comedy'],zafer a sailor living with his mother dnd in a...
34884,Non-Transferable,2017,Turkish,Brendan Bradley,"['YouTubers Shanna Malcolm', 'Shira Lazar', 'S...",['romantic comedy'],the film centres around a young woman named am...


In [None]:
from scipy.sparse import hstack

# Number of leading rows to vectorize (previously hard-coded as 100 in four
# places). Set to None to use every row.
N_ROWS = 100
wiki_subset = wiki_movies_df.iloc[:N_ROWS]

# Fit separate TF-IDF vectorizers for each metadata column
genre_vectorizer = TfidfVectorizer()
director_vectorizer = TfidfVectorizer()
cast_vectorizer = TfidfVectorizer()
plot_vectorizer = TfidfVectorizer()

# astype(str) turns list-like / missing cells into plain strings before
# they are passed to the vectorizers.
genre_tfidf = genre_vectorizer.fit_transform(wiki_subset['genres_list'].astype(str))
director_tfidf = director_vectorizer.fit_transform(wiki_subset['Director'].astype(str))
cast_tfidf = cast_vectorizer.fit_transform(wiki_subset['Cast_list'].astype(str))
plot_tfidf = plot_vectorizer.fit_transform(wiki_subset['plot_clean'].astype(str))

# Each vectorizer learns its own vocabulary, so the four matrices share a row
# count but not a column count; element-wise `+` would fail on the shape
# mismatch (and would mix unrelated features even if the shapes agreed).
# Concatenate the feature blocks side by side instead.
wiki_metadata_tfidf = hstack(
    [genre_tfidf, director_tfidf, cast_tfidf, plot_tfidf],
    format='csr',
)

KeyboardInterrupt: 