In [2]:
# For data manipulation and analysis
import pandas as pd
import numpy as np

# For text preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import datetime


# For multilabel classification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# For model evaluation
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/jiayi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jiayi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### List of commonly used movie/tv shorthand notations
Including: notations, country codes (only including countries where top movies are created), ratings
Don't remove these

In [30]:
keep = []

# Country Codes for prominent film industries (ISO 3166-1 alpha-2 and alpha-3)
country_codes = [
    "US", "USA",  # United States
    "IN", "IND",  # India
    "GB", "GBR",  # United Kingdom
    "FR", "FRA",  # France
    "DE", "DEU",  # Germany
    "CN", "CHN",  # China
    "IT", "ITA",  # Italy
    "JP", "JPN",  # Japan
    "KR", "KOR",  # South Korea
    "RU", "RUS",  # Russia
    "AU", "AUS",  # Australia
    "CA", "CAN",  # Canada
    "ES", "ESP",  # Spain
    "BR", "BRA",  # Brazil
    "MX", "MEX"   # Mexico
]

keep = [
    "BBC", "CNN", "HBO", "FX", "MTV", "ESPN", "AMC", "TNT", "TBS", "VH1",
    "HD", "SD", "4K", "HDR", "UHD", "IMAX", "DV",
    "DD", "DTS", "THX",
    "OTT", "VOD", "DVR", "PPV", "FTA"
]

keep = keep + country_codes + [
    "G", "PG", "PG-13", "R", "NC-17", "U", "UA", "A", "S",
    "MA", "TV-Y", "TV-Y7", "TV-G", "TV-PG", "TV-14", "TV-MA"
]



### Importing Dataset - MovieLens 20M

In [2]:
# reading in the csv files as dataframes
genome_scores = pd.read_csv("../dataset/ml-20m/genome-scores.csv")
movies = pd.read_csv("../dataset/ml-20m/movies.csv")
genome_tags = pd.read_csv("../dataset/ml-20m/genome-tags.csv")
links = pd.read_csv("../dataset/ml-20m/links.csv")
tags = pd.read_csv("../dataset/ml-20m/tags.csv")
ratings = pd.read_csv("../dataset/ml-20m/ratings.csv")

dataframes = [("genome_scores",genome_scores),("movies",movies),("genome_tags",genome_tags),("links",links),("tags",tags),("ratings",ratings)]

In [32]:
# check basic infomation of dataset
for name, df in dataframes:
    print(f"{name} dimensions: {df.shape}")
    print(df.columns)
    print("-"*40)

genome_scores dimensions: (11709768, 3)
Index(['movieId', 'tagId', 'relevance'], dtype='object')
----------------------------------------
movies dimensions: (27278, 3)
Index(['movieId', 'title', 'genres'], dtype='object')
----------------------------------------
genome_tags dimensions: (1128, 2)
Index(['tagId', 'tag'], dtype='object')
----------------------------------------
links dimensions: (27278, 3)
Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')
----------------------------------------
tags dimensions: (465564, 4)
Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')
----------------------------------------
ratings dimensions: (20000263, 4)
Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')
----------------------------------------


## Pre-processing each dataframe
- Data conversions
- Investigating missing data proportions
- Naming conventions
- Identifying relationships between dataframes

In [33]:
# check missing data
def calculate_missing_data(df):
    missing_data = df.isnull().sum() / len(df)
    return missing_data

### (1) tags DF
- userId
- movieId
- tag: user assigned tags to movie items. Contains tags in different languages.
- timestamp: Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.

In [34]:
# tags exploration
tags['tag'] = tags['tag'].astype('str')

from collections import Counter
# Initialize Counter
tag_counter = Counter(tags['tag'])

# Filter tags based on length and count frequency of short tags
short_tags_counter = {k: v for k, v in tag_counter.items() if len(k) < 3}

# Sort by frequency in descending order
sorted_short_tags = {k: v for k, v in sorted(short_tags_counter.items(), key=lambda item: item[1], reverse=True)}

# Output frequencies of short tags
print("Frequency of short tags in descending order:")
for tag, freq in sorted_short_tags.items():
    print(f"{tag}: {freq}")

Frequency of short tags in descending order:
R: 756
3D: 112
PG: 80
3d: 67
TV: 64
3: 57
2: 35
SF: 28
G: 26
4: 26
DC: 24
1: 24
sg: 20
NR: 20
FX: 14
ok: 12
f: 10
NE: 8
Â : 7
AI: 7
MT: 6
HD: 5
bg: 5
MF: 5
2D: 4
Oz: 4
hs: 4
BD: 4
BP: 4
tv: 4
UK: 4
no: 4
dj: 3
SS: 3
s: 3
CG: 3
bc: 2
US: 2
DA: 2
X: 2
Id: 2
oz: 2
<3: 2
We: 1
ra: 1
ss: 1
GM: 1
fx: 1
hd: 1
vw: 1
ms: 1
cb: 1
OJ: 1
dc: 1
es: 1
wy: 1
Eh: 1
SC: 1
JR: 1
\: 1
89: 1
.: 1
il: 1
7: 1
Ok: 1
e: 1
c: 1
b: 1
a: 1
d: 1
M: 1
Q: 1
13: 1
TX: 1
10: 1
25: 1
85: 1
eh: 1
4d: 1
?: 1
dd: 1
Na: 1
l: 1
go: 1
Mu: 1
ex: 1
ds: 1
sd: 1
uk: 1
b5: 1
UR: 1


In [35]:
# data conversions
dt_dict = {'userId' : 'int', 'movieId' : 'int', 'tag' : 'str', 'timestamp' : 'int'} # timestamp to int -> then convert to date 
tags = tags.astype(dt_dict)
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')


# len 
print("len of tags: " + str(len(tags)))
# missing data
print("Missing data in tags")
print(calculate_missing_data(tags))

len of tags: 465564
Missing data in tags
userId       0.0
movieId      0.0
tag          0.0
timestamp    0.0
dtype: float64


#### Subset Data -> Tags (only 70% of users are included)
- Randomly generated random.seed(0)

In [36]:
# import random
# random.seed()

tags_full = tags
user_frac = 0.7
# Get a random sample of unique userIds
unique_user_ids = tags['userId'].unique()
subset_user_ids = np.random.choice(unique_user_ids, size=int(len(unique_user_ids) * user_frac), replace=False)
tags_sub = tags[tags['userId'].isin(subset_user_ids)]

In [49]:
# No. of tags (all languages)
len_all_sub = len(tags_sub['tag'].unique())
print("Subset len of tags(all languages): " + str(len_all_sub))

len_all_full = len(tags_full['tag'].unique())
print("Full set len of tags(all languages): " + str(len_all_full))

Subset len of tags(all languages): 26184
Full set len of tags(all languages): 31748


#### Filtering tags -> English only, Model: FastText

In [38]:
# Fasttext import
# import sys
# !{sys.executable} -m pip install fasttext

import urllib.request

url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
filename = "../pretrain_model/lid.176.bin"
# for fastText model
urllib.request.urlretrieve(url, filename)

import fasttext

# Load the model
language_model = fasttext.load_model("lid.176.bin")

# Define a function to detect language
def is_english(text):
    try:
        predictions = language_model.predict(text, k=1)
        return predictions[0][0] == '__label__en'
    except:
        return False

In [39]:
############# For tags (Subset) ############################# 
# Assuming `tags` is your DataFrame and it has a `tag` column
tags_sub['is_english'] = tags_sub['tag'].apply(is_english)

# Filter rows where the tag is in English
tags_sub = tags_sub[tags_sub['is_english']]

# Drop the 'is_english' column as it's no longer needed
tags_sub.drop(columns=['is_english'], inplace=True)


############# For tags_full ############################# 
# This is only for the POS tag section in CB model
# Assuming `tags` is your DataFrame and it has a `tag` column
tags_full['is_english'] = tags_full['tag'].apply(is_english)

# Filter rows where the tag is in English
tags_full = tags_full[tags_full['is_english']]

# Drop the 'is_english' column as it's no longer needed
tags_full.drop(columns=['is_english'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags_sub['is_english'] = tags_sub['tag'].apply(is_english)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tags_full.drop(columns=['is_english'], inplace=True)


In [40]:
# No. of subset tags (ENGLISH)
len_eng = len(tags_sub['tag'].unique())
# calculate the percentage of English tags:
per = len_eng/len_all_sub * 100
print(str(per) + " %" + " For tag subset")

# No. of full set tags (ENGLISH)
len_eng = len(tags_full['tag'].unique())
# calculate the percentage of English tags:
per = len_eng/len_all_full * 100
print(str(per) + " %" + " For tag full")

82.31891348088531 % For tag subset
82.15505641237966 % For tag full


In [41]:
len(tags_sub)

296698

In [40]:
# Save files
# tags_sub.to_csv("../dataset/subset.csv",index=False)
tags_full.to_csv("../dataset/tags_full.csv",index=False)

### (2) genome_scores DF

In [ ]:
dt_dict = {'movieId' : 'int', 'tagId' : 'int', 'relevance' : 'float'}
genome_scores = genome_scores.astype(dt_dict)
print("len of genome_scores: " + str(len(genome_scores)))

# missing data
print("Missing data in genome_scores")
print(calculate_missing_data(genome_scores))

### (3) genome_tags DF

In [ ]:
df_dict = {'tagId' : 'int', 'tag' : 'str'}
genome_tags = genome_tags.astype(df_dict)
print("len of genome_tags: " + str(len(genome_tags)))
# missing data
print("Missing data in genome_tags")
print(calculate_missing_data(genome_tags))


### (4) movies DF

In [ ]:
df_dict = {'movieId' : 'int', 'title' : 'str', 'genres' : 'str'}
movies = movies.astype(df_dict)
print("len of movies: " + str(len(movies)))
# missing data
print("Missing data in movies")
print(calculate_missing_data(movies))

### (5) links DF
- movieId
- imdbId
- tmdbId

In [42]:
# missing data
print("Missing data in links")
print(calculate_missing_data(links)) # very small missing data for tmdbId -> remove

Missing data in links
movieId    0.000000
imdbId     0.000000
tmdbId     0.009238
dtype: float64


- Use links to extract textual data about the movie - descriptions
- Use this to identify unreliable tags

In [43]:
links = links.dropna()
df_dict = {'movieId' : 'int', 'imdbId' : 'int', 'tmdbId' : 'int'}
links = links.astype(df_dict)
print("len of links: " + str(len(links)))

len of links: 27026
