## Predicting movie genre from its title

In [90]:
import numpy, pandas as pd
import sklearn

In [370]:
data = pd.read_csv('movies.csv',quotechar='"')

In [348]:
data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### drop non ascii titles

In [371]:
def is_ascii(s):
    return all(ord(c) < 128 for c in s)

data = data.drop(data[data['title'].apply(lambda t: not is_ascii(t))].index)


### Processing title 
We strip away numbers, parenthesis... etc

In [372]:
import re
def process_title(title): 
    # strip away numbers and parenthesis
    title = title.replace('(','').replace(')','')
    title = re.sub(r'\d+','',title)
    # strip away "part" word
    title = re.sub(r'[Pp]art','',title)
    #strip II and III and IV
    title = title.replace('II','').replace('III','').replace('IV','')
    return title

data['title'] = data['title'].apply(process_title) 
#drop empty titles
data = data.drop(data[data['title'].str.strip() ==''].index)

### Converting to binary classification
This is a multilabel classification problem, we will convert it to set of binary classification problems 

In [391]:
# drop movies with no genres
data['genres'] = data['genres'].apply(lambda gs:gs.lower())

# get all genres
genres = set()
for gs in data['genres'].str.split('|'):
    genres |= set(gs)
genres.remove('(no genres listed)')

for g in genres:
    data[g] = data['genres'].apply(lambda gs: 1 if g in gs.split('|') else 0)

In [376]:
data.head()

Unnamed: 0,movieId,title,genres,sci-fi,horror,fantasy,adventure,western,musical,children,...,romance,film-noir,crime,drama,animation,action,comedy,documentary,war,imax
0,1,Toy Story,adventure|animation|children|comedy|fantasy,0,0,1,1,0,0,1,...,0,0,0,0,1,0,1,0,0,0
1,2,Jumanji,adventure|children|fantasy,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,comedy|romance,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
3,4,Waiting to Exhale,comedy|drama|romance,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0
4,5,Father of the Bride,comedy,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [380]:
d = dict(data.mean())
del d['movieId']

In [392]:
#sorting genres by frequency occurence
g_sorted_freq = sorted(d.keys(),key=lambda x:d[x])

# dropping the 6 least common genres
for g in g_sorted_freq[:6]:
    data = data.drop(g,axis=1)
    genres.remove(g)

### Converting to lower case 

In [387]:
data['title']=data['title'].apply(lambda t: t.lower())

In [388]:
data.head()

Unnamed: 0,movieId,title,genres,sci-fi,horror,fantasy,adventure,thriller,mystery,romance,crime,drama,action,comedy,documentary,war
0,1,toy story,adventure|animation|children|comedy|fantasy,0,0,1,1,0,0,0,0,0,0,1,0,0
1,2,jumanji,adventure|children|fantasy,0,0,1,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men,comedy|romance,0,0,0,0,0,0,1,0,0,0,1,0,0
3,4,waiting to exhale,comedy|drama|romance,0,0,0,0,0,0,1,0,1,0,1,0,0
4,5,father of the bride,comedy,0,0,0,0,0,0,0,0,0,0,1,0,0


## Treating it as text classification using Naive Bayes

In [394]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, train_size = 0.6)

In [285]:
from collections import defaultdict
from nltk.tokenize import word_tokenize
genres = train['main_genre'].unique()

# genre probability


p_genres = {}
count_genres ={}
for g in genres:
    count_genres[g] = len(train[train['main_genre'] == g].index)
    p_genres[g] = count_genres[g]*1.0 / len(train.index)

prob_word_given_genre = defaultdict(lambda: defaultdict(lambda :0))

#compute counts onlu
for i,row in train.iterrows():
    words_in_title = word_tokenize(row['title'])
    for word in words_in_title:
        prob_word_given_genre[word][row['main_genre']]+=1.0



In [286]:
import math
K = 0.05 # smoothing factor
V = len(prob_word_given_genre) # vocabulary size

def get_prob_genre_given_word(genre, word):
    return ( (K + prob_word_given_genre[word][genre]) * p_genres[genre]) / (count_genres[g] + K*V)
    

def get_genre_for_title(title):
    title = title.lower()
    probs = defaultdict(lambda :0.0)
    for w in word_tokenize(title):
        for g in genres:
            probs[g] += math.log(get_prob_genre_given_word(g,w))
    
    return max(probs.keys(),key=lambda k:probs[k])

### Testing accuracy


In [287]:
from sklearn.metrics import accuracy_score, classification_report
y_pred = test['title'].apply(get_genre_for_title)

In [288]:
accuracy_score(test['main_genre'],y_pred)
print classification_report(test['main_genre'],y_pred)

             precision    recall  f1-score   support

     action       0.53      0.14      0.23      1711
  adventure       0.36      0.02      0.04       593
  animation       0.22      0.01      0.02       247
   children       0.00      0.00      0.00       208
     comedy       0.37      0.41      0.39      3196
      crime       0.14      0.01      0.01       781
documentary       0.12      0.01      0.02      1144
      drama       0.32      0.76      0.45      3619
    fantasy       0.00      0.00      0.00        53
  film-noir       0.00      0.00      0.00        16
     horror       0.50      0.04      0.08       676
    musical       0.00      0.00      0.00        45
    mystery       0.00      0.00      0.00        71
    romance       0.00      0.00      0.00        68
     sci-fi       0.00      0.00      0.00        85
   thriller       0.00      0.00      0.00       132
        war       0.00      0.00      0.00        12
    western       0.00      0.00      0.00   

### Classification using word embeddings

In [396]:
# glove word embeddings
import numpy as np

embeddings = {}
with open('glove.6B/glove.6B.50d.txt', 'r') as f:
    for line in f:
        embeddings[line.split()[0]] = np.array(map(float, line.split()[1:]))

In [434]:
# transform text (a title) to an embedding by averaging word embeddings
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def get_mean_embeddings(docs,embeddings):
    means = []
    dim = len(embeddings.values()[0])
    for doc in docs :
        words = tokenizer.tokenize(doc)
        means.append(np.mean([embeddings[w] if w in embeddings else np.zeros(dim) for w in words], axis=0)) 
    return np.array(means)

In [448]:
def get_mean_embeddings(docs,embeddings):
    dim = len(embeddings.values()[0])
    return np.array([
                np.mean([embeddings[w]
                         for w in tokenizer.tokenize(doc) if w in embeddings] or
                        [np.zeros(dim)], axis=0)
                for doc in docs
            ])

In [449]:
train_feature_matrix = get_mean_embeddings(train['title'],embeddings)
test_feature_matrix = get_mean_embeddings(test['title'],embeddings)

In [451]:
len(train_feature_matrix)

15451

In [458]:
import sklearn.svm as svm

clf = svm.SVC(kernel='linear')
for g in genres:
    clf.fit(train_feature_matrix,train[g])
    y_pred = clf.predict(test_feature_matrix)
    print 'for genre ', g
    print classification_report(test[g],y_pred)
    

for genre  sci-fi
             precision    recall  f1-score   support

          0       0.93      1.00      0.97      9626
          1       0.00      0.00      0.00       675

avg / total       0.87      0.93      0.90     10301

for genre  horror
             precision    recall  f1-score   support

          0       0.90      1.00      0.95      9306
          1       0.00      0.00      0.00       995

avg / total       0.82      0.90      0.86     10301

for genre  fantasy
             precision    recall  f1-score   support

          0       0.95      1.00      0.97      9783
          1       0.00      0.00      0.00       518

avg / total       0.90      0.95      0.93     10301

for genre  adventure
             precision    recall  f1-score   support

          0       0.91      1.00      0.95      9401
          1       0.00      0.00      0.00       900

avg / total       0.83      0.91      0.87     10301

for genre  thriller
             precision    recall  f1-score  

In [456]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
for g in genres:
    clf.fit(train_feature_matrix,train[g])
    y_pred = clf.predict(test_feature_matrix)
    print 'for "%s", f1 score = %.2f' %(g,accuracy_score(test[g],y_pred))

for "sci-fi", classification accuracy = 0.93
for "horror", classification accuracy = 0.90
for "fantasy", classification accuracy = 0.95
for "adventure", classification accuracy = 0.91
for "thriller", classification accuracy = 0.84
for "mystery", classification accuracy = 0.94
for "romance", classification accuracy = 0.85
for "crime", classification accuracy = 0.89
for "drama", classification accuracy = 0.58
for "action", classification accuracy = 0.87
for "comedy", classification accuracy = 0.70
for "documentary", classification accuracy = 0.91
for "war", classification accuracy = 0.96


             precision    recall  f1-score   support

     action       0.38      0.25      0.31      1711
  adventure       0.00      0.00      0.00       593
  animation       0.00      0.00      0.00       247
   children       0.00      0.00      0.00       208
     comedy       0.38      0.47      0.42      3196
      crime       0.00      0.00      0.00       781
documentary       0.00      0.00      0.00      1144
      drama       0.33      0.70      0.45      3619
    fantasy       0.00      0.00      0.00        53
  film-noir       0.00      0.00      0.00        16
     horror       0.00      0.00      0.00       676
    musical       0.00      0.00      0.00        45
    mystery       0.00      0.00      0.00        71
    romance       0.00      0.00      0.00        68
     sci-fi       0.00      0.00      0.00        85
   thriller       0.00      0.00      0.00       132
        war       0.00      0.00      0.00        12
    western       0.00      0.00      0.00   

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [342]:
y_pred = logReg.predict(test_feature_matrix)
print classification_report(test['main_genre'],y_pred)

             precision    recall  f1-score   support

     action       0.36      0.27      0.31      1711
  adventure       0.15      0.02      0.03       593
  animation       0.50      0.00      0.01       247
   children       0.00      0.00      0.00       208
     comedy       0.37      0.48      0.42      3196
      crime       0.20      0.01      0.02       781
documentary       0.32      0.01      0.02      1144
      drama       0.33      0.65      0.44      3619
    fantasy       0.00      0.00      0.00        53
  film-noir       0.00      0.00      0.00        16
     horror       0.30      0.06      0.09       676
    musical       0.00      0.00      0.00        45
    mystery       0.00      0.00      0.00        71
    romance       0.00      0.00      0.00        68
     sci-fi       0.00      0.00      0.00        85
   thriller       0.00      0.00      0.00       132
        war       0.00      0.00      0.00        12
    western       0.00      0.00      0.00   

In [461]:
data2 = pd.read_csv('imdb.csv',quotechar='"',error_bad_lines=False)

Skipping line 66: expected 44 fields, saw 46
Skipping line 111: expected 44 fields, saw 45
Skipping line 198: expected 44 fields, saw 45
Skipping line 222: expected 44 fields, saw 46
Skipping line 278: expected 44 fields, saw 45
Skipping line 396: expected 44 fields, saw 45
Skipping line 403: expected 44 fields, saw 45
Skipping line 421: expected 44 fields, saw 45
Skipping line 437: expected 44 fields, saw 45
Skipping line 462: expected 44 fields, saw 46
Skipping line 491: expected 44 fields, saw 45
Skipping line 515: expected 44 fields, saw 45
Skipping line 529: expected 44 fields, saw 45
Skipping line 530: expected 44 fields, saw 45
Skipping line 558: expected 44 fields, saw 45
Skipping line 623: expected 44 fields, saw 45
Skipping line 646: expected 44 fields, saw 45
Skipping line 663: expected 44 fields, saw 46
Skipping line 713: expected 44 fields, saw 45
Skipping line 730: expected 44 fields, saw 47
Skipping line 791: expected 44 fields, saw 45
Skipping line 813: expected 44 fiel

In [466]:
data2.head()

Unnamed: 0,fn,tid,title,wordsInTitle,url,imdbRating,ratingCount,duration,year,type,...,News,RealityTV,Romance,SciFi,Short,Sport,TalkShow,Thriller,War,Western
0,titles01/tt0012349,tt0012349,Der Vagabund und das Kind (1921),der vagabund und das kind,http://www.imdb.com/title/tt0012349/,8.4,40550.0,3240.0,1921.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
1,titles01/tt0015864,tt0015864,Goldrausch (1925),goldrausch,http://www.imdb.com/title/tt0015864/,8.3,45319.0,5700.0,1925.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
2,titles01/tt0017136,tt0017136,Metropolis (1927),metropolis,http://www.imdb.com/title/tt0017136/,8.4,81007.0,9180.0,1927.0,video.movie,...,0,0,0,1,0,0,0,0,0,0
3,titles01/tt0017925,tt0017925,Der General (1926),der general,http://www.imdb.com/title/tt0017925/,8.3,37521.0,6420.0,1926.0,video.movie,...,0,0,0,0,0,0,0,0,0,0
4,titles01/tt0021749,tt0021749,Lichter der Großstadt (1931),lichter der gro stadt,http://www.imdb.com/title/tt0021749/,8.7,70057.0,5220.0,1931.0,video.movie,...,0,0,1,0,0,0,0,0,0,0
