In [68]:
import operator
import random
from collections import defaultdict
from math import log
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import spatial

In [23]:
# Load the movie catalogue and report how many distinct movie ids it holds.
movie_data = pd.read_csv('data/movies.csv')
movies = movie_data['movieId'].unique().tolist()
print('Number of unique movies in the dataset: {}\n'.format(len(movies)))

# Every `genres` cell is a '|'-separated list of labels; gather the distinct
# labels that occur anywhere in the catalogue.
genres = movie_data['genres'].unique().tolist()
unique_genres = set()
for genre_list in genres:
    unique_genres.update(genre_list.split('|'))
print('List of possible genres in the dataset:')
# str.join produces the same "a, b, c" line as printing each label with
# end=', ' and the last one separately, but sorts only once and does not
# raise IndexError when the genre set is empty.
print(', '.join(sorted(unique_genres)))

# Load the ratings and report user / rating counts.
rating_data = pd.read_csv('data/ratings.csv')
unique_users = rating_data['userId'].unique().tolist()
print('\n\nNumber of users in the dataset: {}'.format(len(unique_users)))
# len() of the frame is its row count; no need to materialise a Python list
# of every userId just to count the rows.
print('Number of ratings in the dataset: {}'.format(len(rating_data)))

Number of unique movies in the dataset: 58098

List of possible genres in the dataset:
(no genres listed), Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western


Number of users in the dataset: 283228
Number of ratings in the dataset: 27753444


In [50]:
# How many of the most-rated genres to report and keep in `top_categories`.
TOP_N_CATEGORIES = 5

# movie_categories[movieId] -> list of genre labels for that movie.
id_genres = movie_data[['movieId', 'genres']].values
movie_categories = {
    movie_id: genre_field.split('|') for movie_id, genre_field in id_genres
}

# Retained so any other cell that reads `rating_movies` keeps working.
rating_movies = rating_data[['movieId']].values

# Count the ratings of each movie once in pandas, then add that total to every
# genre the movie belongs to. This yields the same per-genre totals as visiting
# every rating row individually, without a Python-level loop over all ratings.
ratings_per_movie = rating_data['movieId'].value_counts()
category_counts = defaultdict(int)
for movie_id, n_ratings in ratings_per_movie.items():
    # A rated movieId that is absent from movies.csv raises KeyError here.
    for cat in movie_categories[movie_id]:
        category_counts[cat] += int(n_ratings)

# Genres ordered from most to least rated.
category_counts_list = sorted(
    category_counts.items(), key=operator.itemgetter(1), reverse=True
)

print('Top {} most watched movie categories are:'.format(TOP_N_CATEGORIES))
# Slicing (rather than indexing 0..4) tolerates datasets with fewer genres.
top_categories = [cat for cat, _ in category_counts_list[:TOP_N_CATEGORIES]]
for cat in top_categories:
    print('{}'.format(cat))

Top 5 most watched movie categories are:
Drama
Comedy
Action
Thriller
Adventure


In [17]:
# Read the tag table, discard the `timestamp` column, then collapse rows that
# are exact duplicates of an earlier row.
movie_tags = (
    pd.read_csv('data/tags.csv')
    .drop(columns=['timestamp'])
    .drop_duplicates()
)

In [72]:
# Fixed seed so the train/test split (and every score derived from it) is the
# same on each run instead of changing with every shuffle.
TAG_SPLIT_SEED = 42
# Share of the shuffled (movieId, tag) rows kept for training.
TAG_TRAIN_FRACTION = 0.9

# movie_tag_counts[movieId] -> list of lower-cased words taken from its tags.
movie_tag_counts = defaultdict(list)
movie_tags_list = list(movie_tags[['movieId', 'tag']].values)

# A dedicated Random instance keeps the seeding local to this split.
random.Random(TAG_SPLIT_SEED).shuffle(movie_tags_list)

# Compute the cut point once, before `movie_tags_list` is truncated.
split_at = round(len(movie_tags_list) * TAG_TRAIN_FRACTION)
movie_genre_test = movie_tags_list[split_at:]
movie_tags_list = movie_tags_list[:split_at]

for movie_id, tag_text in movie_tags_list:
    # Rows whose tag is not a string are skipped.
    if isinstance(tag_text, str):
        words = tag_text.lower().split()
        # Only touch the dict when there is something to add, so movies whose
        # tags are pure whitespace do not get an empty entry.
        if words:
            movie_tag_counts[movie_id].extend(words)
def get_defaultdict_int():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``int()`` (0)."""
    zero_counter = defaultdict(int)
    return zero_counter


In [74]:
def get_defaultdict_float():
    """Return a new, empty ``defaultdict`` whose missing keys default to ``float()`` (0.0)."""
    zero_scores = defaultdict(float)
    return zero_scores
# Raw counts: category_tags[genre][word] is how many times `word` occurs in
# the tag words of movies that belong to `genre`.
category_tags = defaultdict(get_defaultdict_int)
for movie, tag_words in movie_tag_counts.items():
    for category in movie_categories[movie]:
        for tag in tag_words:
            category_tags[category][tag] += 1

# Total number of tag words seen per genre (the TF denominator).
category_totals = {
    category: sum(word_counts.values())
    for category, word_counts in category_tags.items()
}

# Term frequency: a word's share of all tag words within its genre.
category_tf_scores = defaultdict(get_defaultdict_float)
for category, word_counts in category_tags.items():
    genre_total = category_totals[category]
    tf_row = category_tf_scores[category]
    for tag, occurrences in word_counts.items():
        tf_row[tag] = occurrences / genre_total

# Vocabulary: every word that appears under at least one genre.
unique_tags = set()
for tf_row in category_tf_scores.values():
    for tag in tf_row:
        unique_tags.add(tag)

# Inverse document frequency, treating each genre as one document
# (natural log of genres / genres-containing-the-word).
tag_idf_scores = defaultdict(float)
genre_total_count = len(category_tf_scores)
for tag in list(unique_tags):
    genres_with_tag = sum(
        1 for tf_row in category_tf_scores.values() if tag in tf_row
    )
    tag_idf_scores[tag] = log(genre_total_count / genres_with_tag)

# TF-IDF for every (genre, word) pair over the whole vocabulary. Reading a
# word the genre never used goes through the float defaultdict, so it scores
# 0.0 and is also inserted into category_tf_scores as a side effect.
tag_tfidf_scores = defaultdict(get_defaultdict_float)
for category in category_tags:
    tf_row = category_tf_scores[category]
    tfidf_row = tag_tfidf_scores[category]
    for tag in list(unique_tags):
        tfidf_row[tag] = tf_row[tag] * tag_idf_scores[tag]

# Show the ten highest-scoring words for each of the top genres.
for cat in top_categories:
    # Ascending sort followed by a reversal, so equal scores come out in the
    # reverse of their insertion order.
    tags = sorted(tag_tfidf_scores[cat].items(), key=operator.itemgetter(1))[::-1]
    print(cat)
    print(tags[:10])

Drama
[('tarantino', 0.00059089215435438), ('leonardo', 0.0005417287461854934), ('dicaprio', 0.0005183877280632727), ('quentin', 0.00042556646994932555), ('hanks', 0.00034473563954794367), ('austen', 0.00034286091881108045), ('coen', 0.0003163916033282854), ('pacino', 0.00031218158481685653), ('notable)', 0.00030522335278587655), ('gosling', 0.0003018514243335284)]
Comedy
[('stand-up', 0.0010566158396847494), ('pixar', 0.0008851113789571215), ('tarantino', 0.0006327954862111999), ('coen', 0.0005976872354483472), ('monty', 0.0005496367140628084), ('sandler', 0.00047540180106926005), ('carell', 0.000442865847858042), ('quentin', 0.0004257247124737833), ('carrey', 0.00041719979985951504), ('wes', 0.00039998321823118244)]
Action
[('marvel', 0.0010784985691227554), ('tarantino', 0.0010052114596156702), ('quentin', 0.0007471630973130396), ('007', 0.0005924317018984574), ('wars', 0.0005116860839088302), ('scifi', 0.0005090445464696488), ('schwarzenegger', 0.0004995198044359131), ('mcu', 0.000

In [75]:
# Peek at the first [movieId, tag] row of movie_tags_list to check its layout.
movie_tags_list[0]

array([6120, 'campy'], dtype=object)