<a href="https://colab.research.google.com/github/maancham/Research-Materials/blob/main/Movielens_trimming.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import re
from scipy import stats

from google.colab import drive
# drive.mount('/content/drive')

In [None]:
%%capture
# Install the third-party helpers this notebook needs; the %%capture magic
# above keeps pip's console output out of the notebook.
!pip install wget
# NOTE(review): csv2tsv is installed here but is not referenced by any visible
# cell -- confirm it is still required.
!pip install csv2tsv
import wget

In [None]:
%%capture
wget.download('https://files.grouplens.org/datasets/movielens/ml-25m.zip')
!unzip ml-25m.zip

In [None]:
# Load the raw ratings (interactions) and movie metadata tables.
ratings_csv = '/content/ml-25m/ratings.csv'
movies_csv = '/content/ml-25m/movies.csv'

df = pd.read_csv(ratings_csv)
movie_df = pd.read_csv(movies_csv)

In [None]:
# Earliest release year (inclusive) a movie must have to be kept.
min_year: int = 1975
# Rating-count threshold for movies: the analysis further down flags movies
# that have at most this many ratings.
min_interaction: int = 10

In [None]:
# Catalogue size as loaded, reported for comparison with the filtered count.
n_movies_before = len(movie_df)
print("number of movies before filtering: ", n_movies_before)

number of movies before filtering:  62423


In [None]:
# Re-read the raw movie table. Presumably this is done so the transformations
# below start from the unmodified file when the cell is re-run -- confirm.
movie_df = pd.read_csv('/content/ml-25m/movies.csv')

def extract_year(title):
  """Return the release year found in a title's trailing "(YYYY)" suffix.

  Args:
    title: Movie title, e.g. "Toy Story (1995)". Trailing whitespace after
      the closing parenthesis is tolerated.

  Returns:
    The four-digit year as a string (e.g. "1995"), or None when the title is
    not a string or does not end with a parenthesised four-digit year.
  """
  if not isinstance(title, str):
    return None
  # Anchor on the end of the title so that digits which are part of the name
  # (e.g. "Blade Runner 2049") are never mistaken for the release year.
  match = re.search(r'\((\d{4})\)\s*$', title)
  return match.group(1) if match else None

# extract_year yields the year as text (or None); anything that is not a
# number is coerced to NaN so it can be dropped on the next line.
movie_df['year'] = pd.to_numeric(movie_df['title'].apply(extract_year), errors='coerce')
# Keep only movies with a parsable year. The .copy() gives an independent
# frame so the column assignments below do not write into a slice.
movie_df = movie_df[movie_df['year'].notna()].copy()
movie_df['year'] = movie_df['year'].astype(int)


# Drop everything from the first "(" to the end of the title (the year suffix
# and any parenthesised alternate title).
movie_df['title'] = movie_df['title'].str.replace(r'\(.*$', '', regex=True)
# Replace the literal "|" genre separator with a space. regex=False is
# required: as a regular expression "|" is an empty alternation that matches
# at every position, which would insert a space between every character.
movie_df['genres'] = movie_df['genres'].str.replace('|', ' ', regex=False)

### Filtering based on released year
movie_df = movie_df[movie_df['year'] >= min_year]

### Filtering based on genre availability
movie_df = movie_df[movie_df['genres'] != '(no genres listed)']

# Ids of the movies that survived every filter; used to filter the ratings.
movieId_list = movie_df['movieId'].to_list()

In [None]:
# Catalogue size once the movie-level filters have been applied.
n_movies_after = len(movie_df)
print("number of movies after filtering: ", n_movies_after)

number of movies after filtering:  43934


### User analysis

In [None]:
# Keep only the ratings whose movie is in the filtered movie id list, and
# report the interaction count on both sides of the filter.
n_interactions_before = len(df)
df = df.loc[df['movieId'].isin(movieId_list)]
n_interactions_after = len(df)

print("number of interactions before filtering: ", n_interactions_before)
print("number of interactions after filtering: ", n_interactions_after)

number of interactions before filtering:  23196449
number of interactions after filtering:  22662702


In [None]:
# Attach movie metadata (title, genres, year) to every rating.
merged_df = pd.merge(df, movie_df, on='movieId')
# Per-user averages of the numeric columns (e.g. mean rating, mean release
# year). numeric_only=True is required because the merged frame also carries
# string columns (title, genres); without it pandas >= 2.0 raises a TypeError
# instead of silently skipping them.
by_user_rating = merged_df.groupby(by = 'userId').mean(numeric_only=True)

In [None]:
# Per-movie count of non-null values in every column; the 'userId' column
# therefore holds the number of ratings each movie received.
movie_groups = merged_df.groupby('movieId')
by_movie = movie_groups.count()

In [None]:
# Column-wise totals of the per-movie counts over the 500 most-rated movies.
most_rated = by_movie.sort_values(by='userId', ascending=False).head(500)
most_rated.sum()

userId       10648566
rating       10648566
timestamp    10648566
title        10648566
genres       10648566
year         10648566
dtype: int64

In [None]:
# Movies that received at most `min_interaction` ratings.
is_rarely_rated = by_movie['userId'] <= min_interaction
unknown_movies = by_movie.loc[is_rarely_rated].index.to_list()

# Movie table without those rarely rated movies, and the matching id list.
super_temp = movie_df.loc[~movie_df['movieId'].isin(unknown_movies)]
super_temp_idlist = super_temp['movieId'].to_list()

# Number of interactions that would remain if those movies were removed.
int(df['movieId'].isin(super_temp_idlist).sum())

22580182

In [None]:
# Users whose rated movies have a mean release year of 1985 or earlier.
prefers_old = by_user_rating['year'] <= 1985
old_users = by_user_rating.loc[prefers_old].index.to_list()

# Show every interaction belonging to those users.
merged_df.loc[merged_df['userId'].isin(old_users)]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
863,1756,296,5.0,955125006,Pulp Fiction,Comedy Crime Drama Thriller,1994
7392,15128,296,5.0,943205645,Pulp Fiction,Comedy Crime Drama Thriller,1994
9585,19637,296,5.0,955406536,Pulp Fiction,Comedy Crime Drama Thriller,1994
11760,24105,296,4.0,946050563,Pulp Fiction,Comedy Crime Drama Thriller,1994
22266,45549,296,4.0,953010495,Pulp Fiction,Comedy Crime Drama Thriller,1994
...,...,...,...,...,...,...,...
22587738,159177,96933,3.0,1435134541,Despair,Drama Fantasy,1978
22589337,116426,3123,3.0,1018669350,Lauderdale,Comedy,1989
22604809,159177,126432,5.0,1435134437,Visitor to a Museum,Sci-Fi,1989
22625696,159177,32797,3.5,1435134518,Satan's Brew,Comedy Drama,1976


In [None]:
# Restrict the ratings to the filtered movie list.
df = df.loc[df['movieId'].isin(movieId_list)]

# Number of ratings per user (this rebinds `by_user_rating` to a Series of counts).
by_user_rating = df.groupby('userId')['rating'].count()

# Remove every user with fewer than 20 ratings.
low_userIds = by_user_rating.loc[by_user_rating < 20].index.to_list()
df = df.loc[~df['userId'].isin(low_userIds)]

### Dataset extension part:

In [None]:
# Ratings of the additional user, stored on the mounted Drive.
new_user_df = pd.read_csv(
    '/content/drive/MyDrive/research/new user ratings/new_user_houmch.csv',
    index_col=0,
)

# Keep only the four interaction columns and append them to the filtered ratings.
rating_fields = ['userId', 'movieId', 'rating', 'timestamp']
new_ratings = new_user_df.loc[:, rating_fields]
new_df = pd.concat((df, new_ratings))
# Sanity check: no row was lost or duplicated by the concatenation.
assert len(new_df) == len(df) + len(new_user_df)

# Rename the columns to "<name>:<type>" headers for the interaction file.
col_names = ['user_id:token','item_id:token','rating:float', 'timestamp:float']
new_df.columns = col_names

# Write the interactions as a tab-separated file without the index column.
file_name = 'ml-25m.inter'
path = 'drive/MyDrive/research/atomic files/' + file_name
new_df.to_csv(path, sep="\t", index=False)

In [None]:
# Select the item columns in the order they should appear in the item file.
item_cols = ['movieId', 'title', 'year', 'genres']
movie_df = movie_df[item_cols]

# Rename the columns to "<name>:<type>" headers, mirroring the interaction file.
icol_names = ['item_id:token', 'movie_title:token_seq', 'release_year:token', 'class:token_seq']
movie_df.columns = icol_names

file_name = 'ml-25m.item'
path = 'drive/MyDrive/research/atomic files/' + file_name
# Write tab-separated, matching the 'ml-25m.inter' file written earlier in this
# notebook; previously this file alone was written comma-separated.
movie_df.to_csv(path, sep="\t", index=False)