In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
dataset_Diretory="./Datasets/ml-100k"

# Genre
genre_df = pd.read_csv(f'{dataset_Diretory}/u.genre', sep='|', encoding='latin-1')
genre_df.head(50)

In [None]:
# Movie
genre_columns = ["unknown"] + list(genre_df[genre_df.columns[0]].values)
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies_df = pd.read_csv(f'{dataset_Diretory}/u.item', sep='|', names=movie_columns+genre_columns,
                     encoding='latin-1')
# movies_df.drop(["video_release_date","release_date","imdb_url","release_date","movie_id","title"],axis=1,inplace=True)
# movies_df.head()

# # Cleaning timestamp from titles
# def clean_title(title):
#     return re.sub("[\(\[].*?[\)\]]", "",title)

# movies_df['title'] = movies_df['title'].apply(clean_title)
# movies_df['title'] = movies_df['title'].str.strip()

movies_df.head()

In [None]:
# Compute the cosine similarity matrix
# similarity = cosine_similarity(movies_df)
# print(similarity)

In [None]:
#Ratings
ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_df = pd.read_csv(f'{dataset_Diretory}/u.data', sep='\t', names=ratings_columns)
ratings_df.drop( "unix_timestamp", inplace = True, axis = 1 ) 
ratings_df.head(2)

In [None]:
# Step 1: Merge the ratings and movies dataframes on 'movie_id'
merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

# Step 2: Identify the genre columns (columns from index 5 onwards in movies_df)
genre_cols = movies_df.columns[5:]

# Multiply each genre column by the 'rating' to get weighted genres
merged_df[genre_cols] = merged_df[genre_cols].mul(merged_df['rating'], axis=0)

# Step 3: Group by 'user_id' and sum the genre columns
user_genre_scores = merged_df.groupby('user_id')[genre_cols].sum().reset_index()

# Resulting dataset
print(user_genre_scores.head())

In [None]:
# # NORMALIZING USER DATA 

# # Identify genre columns (exclude 'user_id')
# genre_cols = user_genre_scores.columns.drop('user_id')

# # Calculate the maximum value for each user's row
# max_values = user_genre_scores[genre_cols].max(axis=1)

# # Normalize genre columns by dividing by the row's max value (handle zeros)
# user_genre_scores[genre_cols] = user_genre_scores[genre_cols].div(max_values, axis=0).fillna(0)

# # Display the normalized data
# print(user_genre_scores.head())

In [None]:
def get_unrated_movies(user_id, movies_df, ratings_df):
    # Get all unique movie IDs from the movies dataset
    all_movies = movies_df['movie_id'].unique()
    
    # Get movies rated by the specific user
    user_rated = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id'].unique()
    
    # Find movies not rated by the user using set difference
    unrated_movies = np.setdiff1d(all_movies, user_rated)
    
    return unrated_movies

# Example: Get movies not rated by user_id = 1
unrated_movies = get_unrated_movies(user_id=1, movies_df=movies_df, ratings_df=ratings_df)
print("Unrated Movie IDs:", len(unrated_movies))

In [None]:
mod_mov=movies_df.drop(["video_release_date","release_date","imdb_url","release_date","movie_id","title"],axis=1,inplace=False)
mov_matrix=mod_mov.values

mod_user=user_genre_scores.drop(["user_id"],axis=1,inplace=False)
user_matrix=mod_user.values

score = np.dot(mov_matrix[5], user_matrix[1])
score

In [None]:
# Calculate scores for all movies against user 1
scores = np.dot(mov_matrix, user_matrix[582])
helf_len=int(len(mov_matrix)/2)
# Get indices of top 10 scores in descending order
top_10_indices = np.argsort(scores)[-10:][::-1]

# Extract corresponding scores
top_scores = scores[top_10_indices]

# Get movie details with scores
top_10_movies = movies_df.iloc[top_10_indices].copy()
top_10_movies['prediction_score'] = top_scores  # Add scores column

# Print results with scores
print("Top 10 Recommended Movies with Scores:")
for idx, movie in top_10_movies.iterrows():
    print(f"\nScore: {movie['prediction_score']:.4f}")
    print(f"Movie ID: {movie['movie_id']}")
    print(f"Title: {movie['title']}")
    print(f"Release Date: {movie['release_date']}")
    print("Genres:", ", ".join([col for col in movies_df.columns[5:] if movie[col] == 1]))
    print(f"IMDB URL: {movie['imdb_url']}")
    print("-" * 60)

## Test Data scraping from IMDB

In [None]:
# Install Package
%pip install cinemagoer


In [58]:
import re
import warnings
from imdb import Cinemagoer
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
def clean_title(title):
    return re.sub("[\(\[].*?[\)\]]","",title)
# Step 2: Import required libraries
# from imdb import IMDb

# Step 3: Example DataFrame
df =pd.read_csv(f'movies_with_credits.csv', sep='|', encoding='latin-1')

df=movies_df.copy()
# df.head()


# Step 4: IMDb instance
ia = Cinemagoer()

# Step 5: Update DataFrame with directors and cast info
for idx, row in df.iterrows():
    try:
        if idx < 1189:
            continue
        print(f"{(idx+1)} ID")
        print(f"{round((idx+1)*100/len(df),2)} % Complete")
        title =  clean_title(row['title'])
        print(f"processing {title}")
        if title=="":
            continue
        results = ia.search_movie(title)
        if not results:
            print(f"No results for: {title}")
            continue

        movie = results[0]
        ia.update(movie)

        # Get director names
        directors = [str(d) for d in movie.get('directors', [])]

        # Get top 5 cast members
        cast = [str(a) for a in movie.get('cast', [])[:5]]

        # Combine names
        people = directors + cast
        for person in people:
            if person not in df.columns:
                df.loc[:, person] = 0   # Create the column with default 0
            df.loc[idx, person] = 1  # Mark presence for this row
        
        # Save every 10 iterations
        csv_path = 'movies_with_credits.csv'
        if (idx + 1) % 10 == 0 or (idx + 1) == len(df):
            print(f"Saving progress at iteration {idx + 1}...")
            df.to_csv(csv_path, index=False)
    except Exception as e:
        print(f"{e} - Eroor happened with ID : - {idx}")
        continue 
df.head()

  return re.sub("[\(\[].*?[\)\]]","",title)


1160 ID
68.97 % Complete
processing Love! Valour! Compassion! 
Saving progress at iteration 1160...
1161 ID
69.02 % Complete
processing Palookaville 
1162 ID
69.08 % Complete
processing Phat Beach 
1163 ID
69.14 % Complete
processing Portrait of a Lady, The 
1164 ID
69.2 % Complete
processing Zeus and Roxanne 
1165 ID
69.26 % Complete
processing Big Bully 
1166 ID
69.32 % Complete
processing Love & Human Remains 
1167 ID
69.38 % Complete
processing Sum of Us, The 
1168 ID
69.44 % Complete
processing Little Buddha 
1169 ID
69.5 % Complete
processing Fresh 
1170 ID
69.56 % Complete
processing Spanking the Monkey 
Saving progress at iteration 1170...
1171 ID
69.62 % Complete
processing Wild Reeds 
1172 ID
69.68 % Complete
processing Women, The 
1173 ID
69.74 % Complete
processing Bliss 
1174 ID
69.8 % Complete
processing Caught 
1175 ID
69.86 % Complete
processing Hugo Pool 
1176 ID
69.92 % Complete
processing Welcome To Sarajevo 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1177 ID
69.98 % Complete
processing Dunston Checks In 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1178 ID
70.04 % Complete
processing Major Payne 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1179 ID
70.1 % Complete
processing Man of the House 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1180 ID
70.15 % Complete
processing I Love Trouble 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


Saving progress at iteration 1180...
1181 ID
70.21 % Complete
processing Low Down Dirty Shame, A 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1182 ID
70.27 % Complete
processing Cops and Robbersons 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1183 ID
70.33 % Complete
processing Cowboy Way, The 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1184 ID
70.39 % Complete
processing Endless Summer 2, The 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1185 ID
70.45 % Complete
processing In the Army Now 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1186 ID
70.51 % Complete
processing Inkwell, The 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1187 ID
70.57 % Complete
processing Switchblade Sisters 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1188 ID
70.63 % Complete
processing Young Guns II 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1189 ID
70.69 % Complete
processing Prefontaine 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


1190 ID
70.75 % Complete
processing That Old Feeling 


  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0
  df.loc[:, person] = 0   # Create the column with default 0


Saving progress at iteration 1190...
1191 ID
70.81 % Complete
processing Letter From Death Row, A 


  return re.sub("[\(\[].*?[\)\]]","",title)


KeyboardInterrupt: 

In [None]:
len(movies_df)