<a href="https://colab.research.google.com/github/maancham/Research-Materials/blob/main/Movielens_trimming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip install wget
!pip install csv2tsv
!pip install tmdbv3api

In [32]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import wget
from urllib.request import urlopen
import requests
import time
import tqdm

pd.options.mode.chained_assignment = None

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%%capture
wget.download('https://files.grouplens.org/datasets/movielens/ml-25m.zip')
!unzip ml-25m.zip

df = pd.read_csv('/content/ml-25m/ratings.csv')
movie_df = pd.read_csv('/content/ml-25m/movies.csv')
link_df = pd.read_csv('/content/ml-25m/links.csv')

## Filtering:
Those movies who does not have a TMDB page or an image path will be dropped from the dataset since we can not show their details on the django website

In [5]:
broken_movieIds = []
data = urlopen('https://raw.githubusercontent.com/maancham/Research-Materials/main/tmdb_unknown')
for line in data:
    broken_movieIds.append(int(re.findall(r'\d+', str(line))[0]))

Filter link_df: drop where imdb or tmdb is none
collect brokens (404 response) and no imgs (add the other list to git)
In function: keep movies where their id is in link_df + their id is not in brokens or no imgs 

In [6]:
def extract_year(title):
  if(len(title) > 7):
    year = title[-7:]
    return year[year.find("(")+1:year.find(")")]
  else:
    return None
  
def filter_data(min_year, movie_df, df, n_core_movies = 10, n_core_users = 20):
  movie_df['year'] = movie_df['title'].apply(extract_year)
  movie_df['year'] = pd.to_numeric(movie_df['year'], errors='coerce')
  movie_df = movie_df[movie_df['year'].notna()]
  movie_df['year'] = movie_df.loc[:, 'year'].astype(int)

  movie_df['title'] = movie_df['title'].str.replace(r'\(.*$', '', regex=True)
  movie_df['genres'] = movie_df['genres'].str.replace(r'|', ' ', regex=True)

  ### Filtering based on released year
  movie_df = movie_df[movie_df['year'] >= min_year]

  ### Filtering based on genre availability
  movie_df = movie_df[movie_df['genres'] != '(no genres listed)']


  merged_df = pd.merge(df, movie_df, on='movieId')

  by_movie = merged_df.groupby(by = 'movieId').count()
  unknown_movies = by_movie[by_movie['userId'] < n_core_movies].index.to_list()
  unknown_movies.extend(link_df[link_df['tmdbId'].isnull()].movieId.to_list())

  movie_df = movie_df[~movie_df['movieId'].isin(unknown_movies)]
  movie_df = movie_df[~movie_df['movieId'].isin(broken_movieIds)]

  df = df[df['movieId'].isin(movie_df['movieId'].to_list())]
  by_user_rating = df.groupby(by = 'userId').count().rating
  low_userIds = by_user_rating[by_user_rating < n_core_users].index.to_list()
  df = df[~df['userId'].isin(low_userIds)]
  df.reset_index(inplace=True, drop=True)

  return movie_df, df

In [7]:
movie_df, df = filter_data(1950, movie_df, df, 20, 50)

print("Number of interactions: ", len(df))
print("Number of items: ", len(movie_df))

Number of interactions:  22302542
Number of items:  19348


## Saving movies for django backend:

In [8]:
from tmdbv3api import TMDb
from tmdbv3api import Movie

API_KEY = ''

tmdb = TMDb()
tmdb.api_key = API_KEY

In [21]:
tmdb_correction_dict = {
    12773 : 427910,  ## Navy Seals, movieId 4207
    58423 : 417859, ## Puss in Boots, movieId 90647
}

link_df = link_df.replace({"tmdbId": tmdb_correction_dict})


link_df = link_df[link_df['tmdbId'].notna()]
link_df['tmdbId'] = link_df['tmdbId'].astype(np.int64)

merged_movie_df = pd.merge(link_df, movie_df, on='movieId')
merged_movie_df['imdbId'] = merged_movie_df['imdbId'].apply(lambda x: 'tt' + str(x).rjust(7, '0'))

merged_movie_df['overview'] = ''
merged_movie_df['img_path'] = ''
merged_movie_df['runtime'] = 0
merged_movie_df['cast'] = ''
merged_movie_df['directors'] = ''
merged_movie_df['languages'] = ''
merged_movie_df['map'] = ''

In [22]:

movie = Movie()

for i, row in tqdm.tqdm(merged_movie_df.iterrows(), total=merged_movie_df.shape[0]):
  try:
    m = movie.details(row['tmdbId'])
    url = 'https://api.themoviedb.org/3/movie/' + str(row['tmdbId']) + '/credits?api_key=' + API_KEY
    tmdb_output = requests.get(url).json()

    merged_movie_df.at[i, 'title'] = m.title
    merged_movie_df.at[i, 'imdbId'] = m.imdb_id
    merged_movie_df.at[i, 'overview'] = m.overview
    merged_movie_df.at[i, 'runtime'] = int(m.runtime)

    for result in m.release_dates['results']:
      if result['iso_3166_1'] == 'US':
       merged_movie_df.at[i, 'map'] = result['release_dates'][0]['certification']

    if (m.poster_path):
      img_path = 'https://image.tmdb.org/t/p/w500' + m.poster_path
    else:
      img_path = None
    merged_movie_df.at[i, 'img_path'] = img_path

    genres = ''
    for item in m.genres:
      genres += item['name']
      genres += ', '

    cast_count = 0
    cast = ''
    for person in tmdb_output['cast']:
      if (cast_count == 5):
        break
      cast += person['name']
      cast += ', '
      cast_count += 1

    directors = ''
    for crew in tmdb_output['crew']:
      if (crew['job'] == 'Director'):
        directors += crew['name']
        directors += ', '

    langs = ''
    for lang in m.spoken_languages:
      langs += lang['name']
      langs += ', '

    genres = genres.rstrip(', ')
    cast = cast.rstrip(', ')
    directors = directors.rstrip(', ')
    langs = langs.rstrip(', ')

    merged_movie_df.at[i, 'genres'] = genres
    merged_movie_df.at[i, 'directors'] = directors
    merged_movie_df.at[i, 'cast'] = cast
    merged_movie_df.at[i, 'languages'] = langs

  except:
    print(row['movieId'])

100%|██████████| 19348/19348 [1:32:48<00:00,  3.47it/s]


### Final checks before saving:

In [None]:
"""
tests: 
make sure all links are clickable
check for empty field in: language, genre, cast, director, map, runtime
check distinct map values, trim values if necessary
"""
"""
movieId, year, runtime: int
title, overview, imdbId, tmdbId: str
img_path: link
genres, directors, cast, languages, map: str
"""

# merged_movie_df['runtime'].replace([np.nan, None], 0, inplace=True)

# merged_movie_df[merged_movie_df['runtime'] == 0]

In [44]:
file_name = 'movies.csv'
path = 'drive/MyDrive/research/' + file_name
merged_movie_df.to_csv(path, index=False)

## Dataset extension part:

In [None]:
new_user_df = pd.read_csv('/content/drive/MyDrive/research/new user ratings/new_user_houmch.csv', 
                          index_col=0)
new_ratings = new_user_df[['userId', 'movieId', 'rating', 'timestamp']]
new_df = pd.concat([df, new_ratings])
assert(len(df) + len(new_user_df) == len(new_df))

col_names = ['user_id:token','item_id:token','rating:float', 'timestamp:float']
new_df.columns = col_names


file_name = 'ml-25m.inter'
path = 'drive/MyDrive/research/atomic files/' + file_name
new_df.to_csv(path, sep="\t", index=False)

In [None]:
item_cols = ['movieId', 'title', 'year', 'genres']
movie_df = movie_df[item_cols]

icol_names = ['item_id:token', 'movie_title:token_seq', 'release_year:token', 'class:token_seq']
movie_df.columns = icol_names

file_name = 'ml-25m.item'
path = 'drive/MyDrive/research/atomic files/' + file_name
movie_df.to_csv(path, sep="\t", index=False)