<a href="https://colab.research.google.com/github/maancham/Research-Materials/blob/main/Movielens_trimming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install wget
!pip install csv2tsv

In [2]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import wget
from urllib.request import urlopen


pd.options.mode.chained_assignment = None

from google.colab import drive
# drive.mount('/content/drive')

In [3]:
%%capture
# wget.download('https://files.grouplens.org/datasets/movielens/ml-25m.zip')
# !unzip ml-25m.zip

df = pd.read_csv('/content/ml-25m/ratings.csv')
movie_df = pd.read_csv('/content/ml-25m/movies.csv')
link_df = pd.read_csv('/content/ml-25m/links.csv')

In [None]:
broken_movieIds = []
data = urlopen('https://raw.githubusercontent.com/maancham/Research-Materials/main/tmdb_unknown')
for line in data:
    broken_movieIds.append(int(re.findall(r'\d+', str(line))[0]))

In [4]:
def extract_year(title):
  if(len(title) > 7):
    year = title[-7:]
    return year[year.find("(")+1:year.find(")")]
  else:
    return None
  
def filter_data(min_year, movie_df, df, n_core_movies = 10, n_core_users = 20):
  movie_df['year'] = movie_df['title'].apply(extract_year)
  movie_df['year'] = pd.to_numeric(movie_df['year'], errors='coerce')
  movie_df = movie_df[movie_df['year'].notna()]
  movie_df['year'] = movie_df.loc[:, 'year'].astype(int)

  movie_df['title'] = movie_df['title'].str.replace(r'\(.*$', '', regex=True)
  movie_df['genres'] = movie_df['genres'].str.replace(r'|', ' ', regex=True)

  ### Filtering based on released year
  movie_df = movie_df[movie_df['year'] >= min_year]

  ### Filtering based on genre availability
  movie_df = movie_df[movie_df['genres'] != '(no genres listed)']


  merged_df = pd.merge(df, movie_df, on='movieId')

  by_movie = merged_df.groupby(by = 'movieId').count()
  unknown_movies = by_movie[by_movie['userId'] < n_core_movies].index.to_list()
  unknown_movies.extend(link_df[link_df['tmdbId'].isnull()].movieId.to_list())

  movie_df = movie_df[~movie_df['movieId'].isin(unknown_movies)]
  movie_df = movie_df[~movie_df['movieId'].isin(broken_movieIds)]

  df = df[df['movieId'].isin(movie_df['movieId'].to_list())]
  by_user_rating = df.groupby(by = 'userId').count().rating
  low_userIds = by_user_rating[by_user_rating < n_core_users].index.to_list()
  df = df[~df['userId'].isin(low_userIds)]
  df.reset_index(inplace=True, drop=True)

  return movie_df, df

In [5]:
movie_df, df = filter_data(1950, movie_df, df, 20, 50)

print("Number of interactions: ", len(df))
print("Number of items: ", len(movie_df))

Number of interactions:  22346263
Number of items:  19524


### Dataset extension part:

In [None]:
new_user_df = pd.read_csv('/content/drive/MyDrive/research/new user ratings/new_user_houmch.csv', 
                          index_col=0)
new_ratings = new_user_df[['userId', 'movieId', 'rating', 'timestamp']]
new_df = pd.concat([df, new_ratings])
assert(len(df) + len(new_user_df) == len(new_df))

col_names = ['user_id:token','item_id:token','rating:float', 'timestamp:float']
new_df.columns = col_names


file_name = 'ml-25m.inter'
path = 'drive/MyDrive/research/atomic files/' + file_name
new_df.to_csv(path, sep="\t", index=False)

In [None]:
item_cols = ['movieId', 'title', 'year', 'genres']
movie_df = movie_df[item_cols]

icol_names = ['item_id:token', 'movie_title:token_seq', 'release_year:token', 'class:token_seq']
movie_df.columns = icol_names

file_name = 'ml-25m.item'
path = 'drive/MyDrive/research/atomic files/' + file_name
movie_df.to_csv(path, sep="\t", index=False)

### Saving movies for django backend:

In [6]:
%%capture
!pip install tmdbv3api

In [7]:
from tmdbv3api import TMDb
from tmdbv3api import Movie

tmdb = TMDb()
tmdb.api_key = ''

In [23]:
link_df = pd.read_csv('/content/ml-25m/links.csv')

tmdb_correction_dict = {
    12773 : 427910,  ## Navy Seals, movieId 4207
    58423 : 417859, ## Puss in Boots, movieId 90647
}

link_df = link_df.replace({"tmdbId": tmdb_correction_dict})


link_df = link_df[link_df['tmdbId'].notna()]
link_df['tmdbId'] = link_df['tmdbId'].astype(np.int64)

merged_movie_df = pd.merge(link_df, movie_df, on='movieId')
merged_movie_df['imdbId'] = merged_movie_df['imdbId'].apply(lambda x: 'tt' + str(x).rjust(7, '0'))

merged_movie_df['overview'] = ''
merged_movie_df['img_path'] = ''

In [None]:
movie = Movie()
bad_rows = []

for i, row in merged_movie_df.iterrows():
  try:
    m = movie.details(row['tmdbId'])
    merged_movie_df.at[i, 'title'] = m.title
    merged_movie_df.at[i, 'imdbId'] = m.imdb_id
    merged_movie_df.at[i, 'overview'] = m.overview

    if (m.poster_path):
      img_path = 'https://image.tmdb.org/t/p/w500' + m.poster_path
    else:
      img_path = None
    merged_movie_df.at[i, 'img_path'] = img_path

    genres = ''
    for item in m.genres:
      genres += item['name']
      genres += ', '
    genres = genres.rstrip(', ')
    merged_movie_df.at[i, 'genres'] = genres
  except:
    bad_rows.append(row['movieId'])

bad_rows

In [54]:
merged_movie_df

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,year,overview,img_path
0,1,tt0114709,862,Toy Story,"Animation, Adventure, Family, Comedy",1995,"Led by Woody, Andy's toys live happily in his ...",https://image.tmdb.org/t/p/w500/uXDfjJbdP4ijW5...
1,2,tt0113497,8844,Jumanji,"Adventure, Fantasy, Family",1995,When siblings Judy and Peter discover an encha...,https://image.tmdb.org/t/p/w500/vgpXmVaVyUL7GG...
2,3,tt0113228,15602,Grumpier Old Men,"Romance, Comedy",1995,A family wedding reignites the ancient feud be...,https://image.tmdb.org/t/p/w500/1FSXpj5e8l4KH6...
3,4,tt0114885,31357,Waiting to Exhale,"Comedy, Drama, Romance",1995,"Cheated on, mistreated and stepped on, the wom...",https://image.tmdb.org/t/p/w500/4uw6HKq4vlhrSV...
4,5,tt0113041,11862,Father of the Bride Part II,"Comedy, Family",1995,Just when George Banks has recovered from his ...,https://image.tmdb.org/t/p/w500/rj4LBtwQ0uGrpB...
...,...,...,...,...,...,...,...,...
19519,208403,tt0316352,256452,One Hell of a Christmas,"Crime, Action, Comedy",2002,"""One Hell of a Christmas"" is a dark and action...",https://image.tmdb.org/t/p/w500/tDfM6aqf4XYun6...
19520,208405,tt0220025,108871,School's Out,"Horror, Mystery, Thriller",1999,A group of seniors decide to have a party in t...,https://image.tmdb.org/t/p/w500/4osh6Wu8cfzDVc...
19521,208407,tt0138647,32947,Angel of the Night,Horror,1998,Rebecca has inherited her grandmother's Gothic...,https://image.tmdb.org/t/p/w500/A1dsQQ1DUKzOpA...
19522,208411,tt0287665,101604,Eternal Blood,"Horror, Fantasy, Thriller",2002,Carmila is introduced by 'M' to a sinister rol...,https://image.tmdb.org/t/p/w500/oX5pBME2LlpU6z...
