In [53]:
import operator
import random
import string
from collections import defaultdict
from math import log
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import spatial
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [26]:
# Load the movie catalogue and report how many distinct movie ids it holds.
movie_data = pd.read_csv('data/movies.csv')
movies = movie_data['movieId'].unique().tolist()
print('Number of unique movies in the dataset: {}\n'.format(len(movies)))

# Each entry of the 'genres' column is split on '|' below, so one cell can
# contribute several genre labels; collect the distinct labels.
genres = movie_data['genres'].unique().tolist()
unique_genres = {
    label
    for genre_field in genres
    for label in genre_field.split('|')
}
print('List of possible genres in the dataset:')
# Sort once and join: same "a, b, ..., z" line as printing item by item.
print(', '.join(sorted(unique_genres)))

# Load the ratings and report user / rating counts.
rating_data = pd.read_csv('data/ratings.csv')
unique_users = rating_data['userId'].unique().tolist()
print('\n\nNumber of users in the dataset: {}'.format(len(unique_users)))
# len() of the frame is the row count; no need to materialise a Python list
# of every userId just to count it.
print('Number of ratings in the dataset: {}'.format(len(rating_data)))

Number of unique movies in the dataset: 58098

List of possible genres in the dataset:
(no genres listed), Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western


Number of users in the dataset: 283228
Number of ratings in the dataset: 27753444


In [9]:
# Map every movieId to the list of genre labels found in its 'genres' field.
id_genres = movie_data[['movieId','genres']].values
movie_categories = {
    movie_id: genre_field.split('|')
    for movie_id, genre_field in id_genres
}

# Kept so that the name stays available to other cells.
rating_movies = rating_data[['movieId']].values

# Count ratings per genre. Instead of visiting every rating row in a Python
# loop, count ratings per movie once and add that count to each of the movie's
# genres. sort=False keeps movies in order of first appearance, so genres are
# inserted into the dict in the same order as a row-by-row pass would insert
# them (this matters only for tie-breaking in the ranking below).
ratings_per_movie = rating_data.groupby('movieId', sort=False).size()
category_counts = defaultdict(int)
for movie_id, n_ratings in ratings_per_movie.items():
    for cat in movie_categories[movie_id]:
        category_counts[cat] += int(n_ratings)

# Rank genres from most to least rated (stable ascending sort, then reversed).
category_counts_list = sorted(category_counts.items(), key=operator.itemgetter(1))
category_counts_list.reverse()

print('Top 5 most watched movie categories are:')
# Slicing (rather than indexing 0..4) tolerates datasets with fewer than 5 genres.
top_categories = [category for category, count in category_counts_list[:5]]
for category in top_categories:
    print('{}'.format(category))

Top 5 most watched movie categories are:
Drama
Comedy
Action
Thriller
Adventure


In [10]:
# Read the tag file, discard its 'timestamp' column, then remove rows that are
# exact duplicates of an earlier row in what remains.
movie_tags = (
    pd.read_csv('data/tags.csv')
    .drop(columns=['timestamp'])
    .drop_duplicates()
)

In [11]:
# Shuffle the (movieId, tag) rows and split them 90/10. Only the 90% part is
# tokenised into movie_tag_counts; the remaining rows are kept in
# movie_genre_test.
movie_tag_counts = defaultdict(list)
movie_tags_list = list(movie_tags[['movieId','tag']].values)

# A dedicated, seeded generator makes the split identical on every run of the
# cell and leaves the global random state untouched.
random.Random(42).shuffle(movie_tags_list)

split_at = round(len(movie_tags_list) * 0.9)
movie_genre_test = movie_tags_list[split_at:]
movie_tags_list = movie_tags_list[:split_at]

# Tokenisation: strip ASCII punctuation, lower-case, split on whitespace.
translator = str.maketrans('','',string.punctuation)
for pair in movie_tags_list:
    # Non-string tags (e.g. missing values) are skipped.
    if isinstance(pair[1],str):
        tokens = pair[1].translate(translator).lower().split()
        if tokens:  # do not create an entry for a tag that was only punctuation
            movie_tag_counts[pair[0]].extend(tokens)
def get_defaultdict_int():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``0``.

    Used as the ``default_factory`` of an outer ``defaultdict`` to build a
    two-level counter.
    """
    counter = defaultdict(int)
    return counter


In [28]:
def get_defaultdict_float():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``0.0``.

    Used as the ``default_factory`` of an outer ``defaultdict`` to build a
    two-level table of float scores.
    """
    scores = defaultdict(float)
    return scores
# TF-IDF over tag tokens, treating each genre (category) as one document.

# Raw counts: how often each token occurs among the tags of the movies in
# each category. A movie with several genres contributes to all of them.
category_tags = defaultdict(get_defaultdict_int)
for movie, movie_tokens in movie_tag_counts.items():
    if not movie_tokens:
        continue
    for category in movie_categories[movie]:
        tag_counts = category_tags[category]
        for tag in movie_tokens:
            tag_counts[tag] += 1

# Total number of token occurrences per category (the TF denominator).
category_totals = {
    category: sum(tag_counts.values())
    for category, tag_counts in category_tags.items()
}

# Term frequency: token count divided by the category's total.
category_tf_scores = defaultdict(get_defaultdict_float)
for category, tag_counts in category_tags.items():
    total = category_totals[category]
    for tag, count in tag_counts.items():
        category_tf_scores[category][tag] = count / total

# Vocabulary: every token seen in any category.
unique_tags = set()
for tf_scores in category_tf_scores.values():
    unique_tags.update(tf_scores)

# Document frequency in a single pass: number of categories containing a token.
tag_doc_counts = defaultdict(int)
for tf_scores in category_tf_scores.values():
    for tag in tf_scores:
        tag_doc_counts[tag] += 1

# Inverse document frequency: log(#categories / #categories containing the token).
n_categories = len(category_tf_scores)
tag_idf_scores = defaultdict(float)
for tag in unique_tags:
    tag_idf_scores[tag] = log(n_categories / tag_doc_counts[tag])

# TF-IDF for every (category, token) pair. .get() reads a missing term
# frequency as 0.0 without inserting it; indexing the defaultdict here would
# silently add a zero entry to category_tf_scores for every token a category
# never used.
tag_tfidf_scores = defaultdict(get_defaultdict_float)
for category in category_tags:
    tf_scores = category_tf_scores[category]
    tfidf_scores = tag_tfidf_scores[category]
    for tag in unique_tags:
        tfidf_scores[tag] = tf_scores.get(tag, 0.0) * tag_idf_scores[tag]

# Show the ten highest-scoring tokens for each of the top categories
# (stable ascending sort, then reversed).
for cat in top_categories:
    tags = sorted(tag_tfidf_scores[cat].items(), key=operator.itemgetter(1))
    tags.reverse()
    print(cat)
    print(tags[:10])

Drama
[('tarantino', 0.000602792074558795), ('leonardo', 0.0005404502271375273), ('dicaprio', 0.0005165027850488554), ('quentin', 0.00043649655807079246), ('hanks', 0.0003467441612516442), ('austen', 0.00032399944323757164), ('coen', 0.0003149682872754506), ('pacino', 0.00031430379048194123), ('gosling', 0.0003097173004475742), ('turing', 0.0002867089984020358)]
Comedy
[('standup', 0.0013303353368161667), ('pixar', 0.000882188603337392), ('tarantino', 0.0006424902284362475), ('coen', 0.0006210985889276307), ('monty', 0.0005783424451774059), ('sandler', 0.00048338271555121586), ('carell', 0.00045346629367054), ('quentin', 0.0004326594275085522), ('carrey', 0.0004135219799863888), ('wes', 0.00040189694527743955)]
Action
[('marvel', 0.0011110450737873386), ('tarantino', 0.0010121957050451004), ('quentin', 0.0007539178194794836), ('wars', 0.0006294883444200047), ('007', 0.0006191353426588836), ('mcu', 0.000514963364370246), ('schwarzenegger', 0.0005068533301141958), ('bond', 0.000506307984

In [47]:
# Genre names ordered from most to least rated; a genre's position in this
# list is used as its integer class label.
all_categories = [category for category, count in category_counts_list]

# Feature vocabulary: the ten highest TF-IDF tokens of each top category
# (stable ascending sort, then reversed). Slicing instead of indexing 0..9
# avoids an IndexError when a category has fewer than ten scored tokens.
# NOTE(review): the same token can rank in several categories, in which case
# it appears more than once here and yields duplicate feature columns.
# Confirm whether that is intended before de-duplicating.
check_words = []
for cat in top_categories:
    tags = sorted(tag_tfidf_scores[cat].items(), key=operator.itemgetter(1))
    tags.reverse()
    check_words.extend(tag for tag, score in tags[:10])

# NOTE(review): this split is drawn from all (movieId, tag) rows, independently
# of the 90/10 shuffle used to build the TF-IDF scores, so rows in `test` may
# have contributed to the vocabulary. Verify this is acceptable.
movie_tag_pairs = movie_tags[['movieId','tag']].values
# Fixed random_state so the train/test partition is the same on every run.
train,test = train_test_split(movie_tag_pairs, random_state=42)
        
def get_features(data,train=True):
    """Turn (movieId, tag) rows into keyword-presence feature vectors.

    Each feature vector is ``[1, f_1, ..., f_k]``: a constant 1 (presumably a
    bias term) followed by one boolean per entry of the global ``check_words``
    telling whether that word occurs as a token of the row's tag.

    The tag is tokenised the same way the vocabulary was built (punctuation
    removed via the global ``translator``, lower-cased, split on whitespace)
    and words are matched against whole tokens. A plain substring test would
    also fire on partial matches such as "wes" inside "western".

    Parameters
    ----------
    data : iterable of 2-item sequences
        ``(movie_id, tag)`` pairs. ``tag`` is passed through ``str()``, so
        non-string values are accepted.
    train : bool, default True
        If true, emit one (feature, label) pair per genre of the movie, with
        the label being the genre's index in the global ``all_categories``.
        If false, emit one feature vector per row together with the movie's
        full genre list.

    Returns
    -------
    (X, y) when ``train`` is true, otherwise (X, genre_list).

    Raises
    ------
    KeyError
        If a movie id is missing from the global ``movie_categories``.
    ValueError
        In training mode, if a genre is missing from ``all_categories``.
    """
    X = []
    y = []
    genre_list = []
    for datum in data:
        movie_id = datum[0]
        tokens = set(str(datum[1]).translate(translator).lower().split())
        feature = [1]
        feature.extend(word in tokens for word in check_words)
        genres = movie_categories[movie_id]
        if train:
            # One sample per genre; the same feature list is shared between them.
            for genre in genres:
                X.append(feature)
                y.append(all_categories.index(genre))
        else:
            # Labels are not needed here, so no label indices are computed.
            X.append(feature)
            genre_list.append(genres)
    if train:
        return X,y
    else:
        return X,genre_list

# Training mode: one (feature, label) pair per genre of each tagged movie.
X_train, y_train = get_features(train, train=True)
# Evaluation mode: one feature vector per row plus that movie's genre list.
X_test, genres = get_features(test, train=False)

In [None]:
# One-vs-rest wrapper around a linear SVM, fitted on the keyword features.
# random_state is fixed so repeated fits on the same data give the same model.
ovr = OneVsRestClassifier(LinearSVC(random_state=42),n_jobs=2)
ovr.fit(X_train,y_train)

In [31]:
# Remove the 'timestamp' column from rating_data in place; errors='ignore'
# makes this a no-op when the column is already gone (e.g. on a re-run).
rating_data.drop(columns=['timestamp'], errors='ignore', inplace=True)
# Remaining columns as a plain NumPy array.
ratings = rating_data.values

array([[110, 'epic'],
       [110, 'Medieval'],
       [260, 'sci-fi'],
       ...,
       [73017, 'pacing'],
       [73017, 'plot'],
       [49651, 'Sylvester Stallone']], dtype=object)