In [27]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the entire dataset to build the TF-IDF model (if feasible)
# If the dataset is too large, consider using a sample or use Dask as previously mentioned
movies = pd.read_csv('movies.csv')


# Global TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
tfidf_features = tfidf.get_feature_names_out()

def find_movie_features(title, chunk, tfidf):
    """ Find and return the TF-IDF vector of a specific movie from a chunk. """
    if title in chunk['title'].values:
        idx = chunk.index[chunk['title'] == title].tolist()[0]
        return tfidf.transform([chunk['genres'].iloc[idx]])

def recommend(movie_features, chunk, tfidf):
    """ Recommend movies based on cosine similarity from a chunk. """
    chunk_matrix = tfidf.transform(chunk['genres'])
    cosine_sim = cosine_similarity(movie_features, chunk_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get scores of the 10 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    return chunk['title'].iloc[movie_indices]

# Search for the movie and its features
movie_title = 'Toy Story (1995)'
movie_features = None
chunk_size = os.path.getsize('movies.csv')

for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
    if movie_features is None:  # Only find features if not already found
        movie_features = find_movie_features(movie_title, chunk, tfidf)
    if movie_features is not None:
        break  # Stop searching once the movie features are found

# If movie features are found, perform recommendations
if movie_features is not None:
    results = []
    for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
        results.append(recommend(movie_features, chunk, tfidf))
    # Aggregate and display results
    for result in results:
        print(result)
else:
    print(f"Movie titled '{movie_title}' not found in the database.")


2203                                           Antz (1998)
3021                                    Toy Story 2 (1999)
3653        Adventures of Rocky and Bullwinkle, The (2000)
3912                      Emperor's New Groove, The (2000)
4780                                 Monsters, Inc. (2001)
9949     DuckTales: The Movie - Treasure of the Lost La...
10773                                     Wild, The (2006)
11604                               Shrek the Third (2007)
12969                       Tale of Despereaux, The (2008)
17431    Asterix and the Vikings (Astérix et les Viking...
Name: title, dtype: object


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the entire dataset to build the TF-IDF model (if feasible)
# If the dataset is too large, consider using a sample or use Dask as previously mentioned
movies = pd.read_csv('movies.csv')


# Global TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
tfidf_features = tfidf.get_feature_names_out()

def find_movie_features(title, chunk, tfidf):
    """ Find and return the TF-IDF vector of a specific movie from a chunk. """
    if title in chunk['title'].values:
        idx = chunk.index[chunk['title'] == title].tolist()[0]
        return tfidf.transform([chunk['genres'].iloc[idx]])

def recommend(movie_features, chunk, tfidf):
    """ Recommend movies based on cosine similarity from a chunk. """
    chunk_matrix = tfidf.transform(chunk['genres'])
    cosine_sim = cosine_similarity(movie_features, chunk_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return chunk['title'].iloc[movie_indices]

# Search for the movie and its features
movie_title = 'Toy Story (1995)'
movie_features = None
chunk_size = 1000

for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
    if movie_features is None:  # Only find features if not already found
        movie_features = find_movie_features(movie_title, chunk, tfidf)
    if movie_features is not None:
        break  # Stop searching once the movie features are found

# If movie features are found, perform recommendations
if movie_features is not None:
    results = []
    for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
        results.extend(recommend(movie_features, chunk, tfidf))
    # Display all results
    print("Recommended Movies:")
    for title in results:
        print(title)
else:
    print(f"Movie titled '{movie_title}' not found in the database.")


Recommended Movies:
Pagemaster, The (1994)
Kids of the Round Table (1995)
Space Jam (1996)
Jumanji (1995)
Indian in the Cupboard, The (1995)
NeverEnding Story III, The (1994)
Escape to Witch Mountain (1975)
Balto (1995)
James and the Giant Peach (1996)
Kid in King Arthur's Court, A (1995)
Borrowers, The (1997)
Darby O'Gill and the Little People (1959)
101 Dalmatians (One Hundred and One Dalmatians) (1961)
Alice in Wonderland (1951)
Quest for Camelot (1998)
Goonies, The (1985)
Warriors of Virtue (1997)
Flubber (1997)
Freaky Friday (1977)
Absent-Minded Professor, The (1961)
Lord of the Rings, The (1978)
Watership Down (1978)
Little Nemo: Adventures in Slumberland (1992)
American Tail, An (1986)
Bug's Life, A (1998)
All Dogs Go to Heaven (1989)
Thumbelina (1994)
Who Framed Roger Rabbit? (1988)
Return to Oz (1985)
NeverEnding Story, The (1984)
Adventures of Rocky and Bullwinkle, The (2000)
Emperor's New Groove, The (2000)
We're Back! A Dinosaur's Story (1993)
Dinosaur (2000)
Digimon: The M

In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the entire dataset to build the TF-IDF model
movies = pd.read_csv('movies.csv')


# Global TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])

def find_movie_features(title, chunk, tfidf):
    """ Find and return the TF-IDF vector of a specific movie from a chunk. """
    if title in chunk['title'].values:
        idx = chunk.index[chunk['title'] == title].tolist()[0]
        return tfidf.transform([chunk['genres'].iloc[idx]])

def recommend(movie_features, chunk, tfidf):
    """ Recommend movies based on cosine similarity from a chunk. """
    chunk_matrix = tfidf.transform(chunk['genres'])
    cosine_sim = cosine_similarity(movie_features, chunk_matrix)
    sim_scores = list(enumerate(cosine_sim[0]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores]
    return [(chunk['title'].iloc[i], sim_scores[i][1]) for i in movie_indices]

# Search for the movie and its features
movie_title = 'Toy Story (1995)'
movie_features = None
chunk_size = 1000

for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
    if movie_features is None:  # Only find features if not already found
        movie_features = find_movie_features(movie_title, chunk, tfidf)
    if movie_features is not None:
        break  # Stop searching once the movie features are found

# If movie features are found, perform recommendations
if movie_features is not None:
    all_recommendations = []
    for chunk in pd.read_csv('movies.csv', chunksize=chunk_size):
        all_recommendations.extend(recommend(movie_features, chunk, tfidf))

    # Sort recommendations by similarity score and pick top 10
    top_recommendations = sorted(all_recommendations, key=lambda x: x[1], reverse=True)[:10]

    # Display top 10 recommendations
    print("Top 10 Recommended Movies:")
    for title, score in top_recommendations:
        print(f"{title} (Score: {score})")
else:
    print(f"Movie titled '{movie_title}' not found in the database.")

Top 10 Recommended Movies:
Toy Story (1995) (Score: 1.0)
Rescuers Down Under, The (1990) (Score: 1.0)
McCabe & Mrs. Miller (1971) (Score: 1.0)
Maurice (1987) (Score: 1.0)
Grapes of Wrath, The (1940) (Score: 1.0)
Ernest Goes to Camp (1987) (Score: 1.0)
And Life Goes On (a.k.a. Life and Nothing More) (Zendegi va digar hich) (1992) (Score: 1.0)
Quo Vadis (1951) (Score: 1.0)
Tideland (2005) (Score: 1.0)
Treatment, The (2006) (Score: 1.0)
