In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import operator
from scipy import spatial
from math import log
from random import shuffle
from sklearn.model_selection import train_test_split
import string
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
movie_data = pd.read_csv('data/movies.csv')
# dropped = movie_data[movie_data.genres == '(no genres listed)']
# dropped = dropped[['movieId']].values
movie_data = movie_data[movie_data.genres != '(no genres listed)']
movie_id_list = movie_data[['movieId']].values
movies = movie_data['movieId'].unique().tolist()
print('Number of unique movies in the dataset: {}\n'.format(len(movies)))

genres = movie_data['genres'].unique().tolist()
unique_genres = set()
for genre_list in genres:
    sp = genre_list.split('|')
    for gen in sp:
        unique_genres.add(gen)
print('List of possible genres in the dataset:')
for genre in sorted(unique_genres)[:len(unique_genres)-1]:
    print(genre, end=', ')
print(sorted(unique_genres)[len(unique_genres)-1])
    
rating_data = pd.read_csv('data/ratings.csv')
rating_data = rating_data[rating_data.movieId.isin(movie_id_list)]
unique_users = rating_data['userId'].unique().tolist()
print('\n\nNumber of users in the dataset: {}'.format(len(unique_users)))
print('Number of ratings in the dataset: {}'.format(len(rating_data['userId'].tolist())))

Number of unique movies in the dataset: 53832

List of possible genres in the dataset:
Action, Adventure, Animation, Children, Comedy, Crime, Documentary, Drama, Fantasy, Film-Noir, Horror, IMAX, Musical, Mystery, Romance, Sci-Fi, Thriller, War, Western


Number of users in the dataset: 283220
Number of ratings in the dataset: 27735055


In [3]:
movie_categories = dict()

id_genres = movie_data[['movieId','genres']].values
for pair in id_genres:
    movie_categories[pair[0]] = pair[1].split('|')

rating_movies = rating_data[['movieId']].values
category_counts = defaultdict(int)
for movie in rating_movies:
    for cat in movie_categories[movie[0]]:
        category_counts[cat] += 1
category_counts_list = list(category_counts.items())
category_counts_list.sort(key=operator.itemgetter(1))
category_counts_list.reverse()
print('Top 5 most watched movie categories are:')
top_categories = []
for i in range(5):
    top_categories.append(category_counts_list[i][0])
    print('{}'.format(category_counts_list[i][0]))

Top 5 most watched movie categories are:
Drama
Comedy
Action
Thriller
Adventure


In [4]:
movie_tags = pd.read_csv('data/tags.csv')
movie_tags = movie_tags[movie_tags.movieId.isin(movie_id_list)]
movie_tags.drop(columns=['timestamp'],inplace=True)
movie_tags.drop_duplicates(inplace=True)

In [5]:
movie_tag_counts = defaultdict(list)
movie_tags_list = list(movie_tags[['movieId','tag']].values)
translator = str.maketrans('','',string.punctuation)
for pair in movie_tags_list:
    if isinstance(pair[1],str):
        for tag in (pair[1].translate(translator)).lower().split():
            movie_tag_counts[pair[0]].append(tag)


In [6]:
def get_defaultdict_int():
    return defaultdict(int)
def get_defaultdict_float():
    return defaultdict(float)
category_tags = defaultdict(get_defaultdict_int)
for movie in movie_tag_counts:
    for category in movie_categories[movie]:
        for tag in movie_tag_counts[movie]:
            category_tags[category][tag] += 1
category_totals = dict()
for category in category_tags:
    total = 0
    for tag in category_tags[category]:
        total += category_tags[category][tag]
    category_totals[category] = total

category_tf_scores = defaultdict(get_defaultdict_float)
for category in category_tags:
    for tag in category_tags[category]:
        category_tf_scores[category][tag] = category_tags[category][tag] / category_totals[category]
unique_tags = set()

for category in category_tf_scores:
    for tag in category_tf_scores[category]:
        unique_tags.add(tag)
tag_idf_scores = defaultdict(float)

for tag in list(unique_tags):
    doc_count = 0
    for category in category_tf_scores:
        if tag in category_tf_scores[category]:
            doc_count += 1
    tag_idf_scores[tag] = log(len(category_tf_scores)/doc_count)
    
tag_tfidf_scores = defaultdict(get_defaultdict_float)
for category in category_tags:
    for tag in list(unique_tags):
        tag_tfidf_scores[category][tag] = category_tf_scores[category][tag] * tag_idf_scores[tag]

for cat in top_categories:
    tags = list(tag_tfidf_scores[cat].items())
    tags.sort(key=operator.itemgetter(1))
    tags.reverse()
    print(cat)
    print(tags[:10])

Drama
[('tarantino', 0.0005313980353646588), ('leonardo', 0.00047551521625585174), ('dicaprio', 0.0004539401156997786), ('quentin', 0.0003754067496756724), ('austen', 0.00032373409988695494), ('coen', 0.0002902182366877576), ('gosling', 0.0002854745731082802), ('hanks', 0.00028518400286328276), ('pacino', 0.0002847708252445812), ('turing', 0.0002746138159434009)]
Comedy
[('standup', 0.0012291886139587668), ('pixar', 0.0007858312302732796), ('tarantino', 0.0005677095804373472), ('coen', 0.0005505858676349869), ('monty', 0.0005432770288008281), ('carell', 0.00043638911735652996), ('sandler', 0.0004259676671414713), ('quentin', 0.000372856860009544), ('wes', 0.0003563378851989946), ('aardman', 0.0003177782046419086)]
Action
[('tarantino', 0.0009066624124239697), ('marvel', 0.0009013881402910682), ('quentin', 0.0006596618932898785), ('007', 0.0005625692472049567), ('wars', 0.0005423368843254071), ('mcu', 0.00046991078295943435), ('space', 0.0004499276286524986), ('schwarzenegger', 0.000446

In [27]:
all_categories = [category[0] for category in category_counts_list]
check_words = set()
while len(check_words) < 1100:
    for cat in top_categories:
        tags = list(tag_tfidf_scores[cat].items())
        tags.sort(key=operator.itemgetter(1))
        tags.reverse()
        initial = len(check_words)
        i = 0
        while (len(check_words) - initial < 10):
            check_words.add(tags[i][0])
            i += 1
check_words = list(check_words)
print('Got words')
movie_tag_pairs = movie_tags[['movieId','tag']].values
train,test = train_test_split(movie_tag_pairs,shuffle=True)

Got words


In [None]:
def get_features(data,train=True):
    X = []
    y = []
    genre_list = []
    for datum in data:
        feature = []
        feature.append(1)
        movie_id = datum[0]
        tags = (str(datum[1]).translate(translator)).lower()
        for word in check_words:
            feature.append(word in tags)
        genres = movie_categories[movie_id]
        if not train:
            X.append(feature)
            genre_list.append(genres)
        for genre in genres:
            if train:
                X.append(feature)
            y.append(all_categories.index(genre))
    if train:
        return X,y
    else:
        return X,genre_list

X_train, y_train = get_features(train)
X_test, genres = get_features(test,False)

In [None]:
ovr = OneVsRestClassifier(LinearSVC())
ovr.fit(X_train,y_train)

In [None]:
correct = 0
predictions = ovr.predict(X_test)
for pred,genre_list in zip(predictions,genres):
    if all_categories[pred] in genre_list:
        correct += 1
print('Accuracy of tag SVM: {}'.format(correct/len(predictions)))

In [None]:
correct = 0
predictions = dict()
for pair in movie_tag_pairs:
    predictions[pair[0]] = top_categories[0]
for movie in predictions:
    if predictions[movie] in movie_categories[movie]:
        correct += 1
print('Accuracy of naive classifier: {}'.format(correct/len(predictions)))

In [64]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import keras
from keras import backend as K
print(K.tensorflow_backend._get_available_gpus())
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import *
import numpy as np

['/job:localhost/replica:0/task:0/device:GPU:0']


In [28]:
def get_features(data,train=True):
    X = []
    y = []
    genre_list = []
    for datum in data:
        feature = []
        movie_id = datum[0]
        tags = (str(datum[1]).translate(translator)).lower()
        for word in check_words:
            feature.append(word in tags)
        genres = movie_categories[movie_id]
        if not train:
            X.append(feature)
            genre_list.append(genres)
        for genre in genres:
            if train:
                X.append(feature)
            y.append(all_categories.index(genre))
    if train:
        return X,y
    else:
        return X,genre_list
    
X_train, y_train = get_features(train,False)

y = np.zeros((len(y_train),len(all_categories)))
index = 0

for cats in y_train:
    feat = [0] * len(all_categories)
    for cat in cats:
        feat[all_categories.index(cat)] = 1
    y[index] = feat
    index += 1
X = np.asarray(X_train)

In [30]:
X_, y_ = get_features(test,False)
X_test = np.asarray(X_)

In [68]:
def generate_model(X, y, params):
    model = Sequential()
    model.add(Dense(params['layer1'], activation=params['activation'], input_dim=X.shape[1]))
    model.add(Dropout(params['dropout']))
    model.add(Dense(params['layer2'], activation=params['activation']))
    model.add(Dropout(params['dropout']))
    model.add(Dense(y.shape[1], activation='sigmoid'))
    model.compile(loss=params['loss'],
                  optimizer=params['optimizer'],
                  metrics=['categorical_accuracy'])

    model.fit(X, y, epochs=params['epochs'], batch_size=params['batch_size'])
    
    preds = model.predict(X_test)
    correct = 0
    index = 0
    for real in y_:
        if all_categories[np.argmax(preds[index])] in real:
            correct += 1
        index += 1
    print('Accuracy: {}'.format(correct/len(y_)))
    return model,(correct/len(y_))

In [77]:
optimizers = [Nadam(),RMSprop(),SGD()]
layer1 = [500,700,1000,1500]
layer2 = [300,400,800]
loss = ['logcosh','binary_crossentropy']
activation = ['relu', 'elu']
epochs = [1,2,3,4]
dropout = [0.1,0.5,0.9]
batch_size = [2000,10000]

In [None]:
results = []
for opt in optimizers:
    for l1 in layer1:
        for l2 in layer2:
            for ls in loss:
                for act in activation:
                    for ep in epochs:
                        for drop in dropout:
                            for batch in batch_size:
                                param = {'optimizer':opt,
                                 'layer1':l1,
                                 'layer2':l2,
                                 'loss':ls,
                                 'activation':act,
                                 'epochs':ep,
                                 'dropout':drop,
                                 'batch_size':batch}
                                model, acc = generate_model(X,y,param)
                                results.append((model,acc,param))

Epoch 1/1
Accuracy: 0.5898091392383
Epoch 1/1
Accuracy: 0.5859345495744125
Epoch 1/1
Accuracy: 0.5884038288353717
Epoch 1/1
Accuracy: 0.5825610783487785
Epoch 1/1
Accuracy: 0.5473121168985853
Epoch 1/1
Accuracy: 0.5357972866978473
Epoch 1/2
Epoch 2/2
Accuracy: 0.5910002033524098
Epoch 1/2
Epoch 2/2
Accuracy: 0.5901323243180432
Epoch 1/2
Epoch 2/2
Accuracy: 0.5894786915724951
Epoch 1/2
Epoch 2/2
Accuracy: 0.5890501989948581
Epoch 1/2
Epoch 2/2
Accuracy: 0.5513101705255208
Epoch 1/2
Epoch 2/2
Accuracy: 0.5464224501060337
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.591301600673968
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5911309299015194
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5901395869041048
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5894278534700637
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5603992969816692
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5604319786189467
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.5909602591290707
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.5

Accuracy: 0.5874052232518955
Epoch 1/1
Accuracy: 0.5679814368300264
Epoch 1/1
Accuracy: 0.5545383900299219
Epoch 1/2
Epoch 2/2
Accuracy: 0.5888177962408855
Epoch 1/2
Epoch 2/2
Accuracy: 0.5802661011532987
Epoch 1/2
Epoch 2/2
Accuracy: 0.5881641634953374
Epoch 1/2
Epoch 2/2
Accuracy: 0.5888831595154402
Epoch 1/2
Epoch 2/2
Accuracy: 0.571856026493914
Epoch 1/2
Epoch 2/2
Accuracy: 0.568871103622578
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.590237631815937
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5877574586758854
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5893842779536937
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5888541091711936
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5784214042936409
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 0.5710716671992563
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.5897219882055602
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.5894786915724951
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Accuracy: 0.5894496412282485
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
A