In [92]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import *
import json
import numpy as np
import string
from datetime import *
import re
import seaborn as sns

In [26]:
# Every JSON file is read and then transposed so that the features become column headers.
d1, d2, d3, d4 = (
    pd.read_json(path).T
    for path in ('all_movies.json', 'movies_3k.json', 'movies_6k.json', 'movies_9k.json')
)

In [30]:
# Stack the four transposed frames row-wise into one frame, then show its size.
df = pd.concat(objs=(d1, d2, d3, d4), axis=0)
df.shape

(21331, 14)

In [31]:
# Collapse rows that share a title into a single row.
df = df.drop_duplicates(subset=['title'])

In [32]:
# Size of the frame after removing duplicated titles.
df.shape

(9930, 14)

In [33]:
# Number of missing values in each column.
df.isna().sum()

title              1
release           35
popularity         1
budget             1
genres             1
overview           1
language           1
votes              1
status             1
cast               1
director          73
screenwriter    8112
producer        1793
music           9436
dtype: int64

In [34]:
# Discard rows that lack a title or a release value.
df = df.dropna(subset=['title', 'release'])

In [35]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

title              0
release            0
popularity         0
budget             0
genres             0
overview           0
language           0
votes              0
status             0
cast               0
director          49
screenwriter    8079
producer        1766
music           9401
dtype: int64

In [36]:
# Cast the numeric columns in one pass: integers for release/votes,
# floats for popularity/budget.
df = df.astype({
    'release': 'int64',
    'votes': 'int64',
    'popularity': 'float64',
    'budget': 'float64',
})

In [37]:
# Drop instances where the title is duplicated (a result of a bad choice in the
# search query), renumber the rows, then use the title as the index.
df = (
    df
    .drop_duplicates(subset=['title'], ignore_index=True)
    .set_index('title')
)

In [38]:
# Keep only movies for which at least one of screenwriter, producer or music is available.
# A movie missing all four of director/screenwriter/producer/music is necessarily
# missing these three as well, so this single mask also removes those rows.
df = df[df[['screenwriter', 'producer', 'music']].notna().any(axis=1)]

In [39]:
# Movies without a recorded director are removed.
df = df[df['director'].notna()]

In [40]:
# Non-null counts of every column, grouped by release status.
df.groupby('status').count()

Unnamed: 0_level_0,release,popularity,budget,genres,overview,language,votes,cast,director,screenwriter,producer,music
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Planned,2,2,2,2,2,2,2,2,2,0,2,0
Post Production,3,3,3,3,3,3,3,3,3,0,3,0
Released,8228,8228,8228,8228,8228,8228,8228,8228,8228,1816,8120,494


In [41]:
# Restrict the data to movies whose status is 'Released'.
df = df.loc[df['status'].eq('Released')]

In [44]:
def isEmpty(lst):
    """Return True when `lst` compares equal to an empty list, otherwise False."""
    return True if lst == [] else False

In [45]:
# Boolean mask over the movies: True where the cast list is non-empty,
# False where no cast has been recorded.
exists = np.logical_not([isEmpty(cast) for cast in df['cast']])

In [46]:
# Keep only the rows flagged True in the `exists` mask.
df = df.loc[exists]

In [47]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0_level_0,release,popularity,budget,genres,overview,language,votes,status,cast,director,screenwriter,producer,music
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
All About Eve,1950,18.255,1400000.0,[Drama],From the moment she glimpses her idol at the s...,en,879,Released,"[Bette Davis, Anne Baxter, George Sanders]",Joseph L. Mankiewicz,Joseph L. Mankiewicz,Darryl F. Zanuck,Urban Thielmann
Sunset Boulevard,1950,16.261,1752000.0,[Drama],A hack screenwriter writes a screenplay for a ...,en,1396,Released,"[William Holden, Gloria Swanson, Erich von Str...",Billy Wilder,Billy Wilder,Charles Brackett,
In a Lonely Place,1950,9.921,0.0,"[Mystery, Thriller]",An aspiring actress begins to suspect that her...,en,269,Released,"[Humphrey Bogart, Gloria Grahame, Frank Lovejoy]",Nicholas Ray,Andrew Solt,Robert Lord,
A Streetcar Named Desire,1951,13.575,1800000.0,[Drama],Disturbed Blanche DuBois moves in with her sis...,en,744,Released,"[Vivien Leigh, Marlon Brando, Kim Hunter]",Elia Kazan,Tennessee Williams,Charles K. Feldman,
An American in Paris,1951,11.251,2723903.0,"[Comedy, Drama, Music, Romance]",Jerry Mulligan is an exuberant American expatr...,en,318,Released,"[Gene Kelly, Leslie Caron, Oscar Levant]",Vincente Minnelli,Alan Jay Lerner,Arthur Freed,George Gershwin


In [48]:
# Remove the screenwriter, music and producer columns. `columns=` is used instead of
# a positional axis argument, which pandas 2.x rejects with a TypeError.
df = df.drop(columns=['screenwriter', 'music', 'producer'])
# Placed last so the notebook actually displays the remaining missing-value counts
# (as a non-final expression its result was silently discarded).
df.isna().sum()

In [49]:
def search_index(data: pd.DataFrame, query):
    """Return the rows of `data` whose index label contains `query`, ignoring case.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose index labels are searched.
    query : str
        Substring to look for. It is lower-cased once and matched against the
        lower-cased text of every index label.

    Returns
    -------
    pd.DataFrame
        The matching rows, in their original order (empty when nothing matches).
    """
    # Lower-case the query once instead of on every iteration.
    needle = str(query).lower()
    # str() lets non-string labels (e.g. a numeric title) be searched instead of
    # raising AttributeError on `.lower()`.
    matches = [needle in str(label).lower() for label in data.index]
    return data.loc[matches]

In [50]:
# Flatten every movie's genre list into one long list (repeats are kept).
genres = [label for movie_genres in df['genres'] for label in movie_genres]

In [51]:
# Will map each genre name to a per-movie list of 0/1 membership flags.
genre_dict = {}

In [52]:
# For every distinct genre, record a 0/1 flag per movie telling whether the movie's
# genre list contains it. Iterating the 'genres' column directly avoids building a
# full row Series per movie per genre, which is what iterrows() does.
for genre in np.unique(genres):
    genre_dict[genre] = [int(genre in movie_genres) for movie_genres in df['genres']]

In [53]:
# Attach one 0/1 indicator column per genre to the frame.
for genre, flags in genre_dict.items():
    df[genre] = flags

In [54]:
# Target matrix: the per-genre indicator columns.
y = df.loc[:, np.unique(genres)]

In [55]:
# Columns carried over into the feature frame.
# NOTE(review): 'genres' is the column the targets are derived from, and 'status' holds a
# single value after the 'Released' filter -- confirm neither is fed to the model.
feature_cols = [
    'release',
    'popularity',
    'budget',
    'genres',
    'overview',
    'language',
    'votes',
    'status',
    'cast',
    'director',
]

In [56]:
# Feature frame: the selected columns of the cleaned data.
X = df.loc[:, feature_cols]

In [100]:
# Print every genre that labels more than 15% of the movies, followed by the number
# of movies with and without that genre.
for genre in np.unique(genres):
    per_flag = df.groupby(genre)['release'].count()
    tagged, untagged = per_flag[1], per_flag[0]
    if tagged / len(df) > 0.15:
        print(genre, tagged, untagged)

Action 1636 6547
Comedy 3100 5083
Drama 3985 4198
Romance 1558 6625
Thriller 1863 6320


From the samples, it can be deduced that Action, Comedy, Drama, Romance and Thriller are more common than the other genres (i.e. each of them labels more than 15% of the movies).

Next, we compute a correlation matrix between the genres to see how likely it is that a movie labelled with genre "X" is also labelled with genre "Y". For the sake of convenience, we only show genre pairs whose correlation coefficient is above 0.25.

In [111]:
# Show only genre pairs whose correlation lies strictly between 0.25 and 1.0,
# then drop the rows and columns that are left completely empty.
(
    y.corr()
    .pipe(lambda corr: corr.where((corr > 0.25) & (corr < 1.0)))
    .dropna(how='all')
    .dropna(how='all', axis=1)
)

Unnamed: 0,Action,Adventure,Animation,Crime,Family,History,Mystery,Thriller,War
Action,,0.332386,,,,,,,
Adventure,0.332386,,,,0.280998,,,,
Animation,,,,,0.48097,,,,
Crime,,,,,,,,0.306897,
Family,,0.280998,0.48097,,,,,,
History,,,,,,,,,0.279347
Mystery,,,,,,,,0.278185,
Thriller,,,,0.306897,,,0.278185,,
War,,,,,,0.279347,,,


Some of these correlation rates make sense. Action movies are generally associated with an adventure theme, and a lot of animated movies are considered family-friendly.

Thrillers are generally associated with crime and mystery, whereas a significant number of historical films depict war.

In [195]:
# For each of the five most common genres (`cols`) and every other genre, compute the
# percentage of movies of the common genre that are also labelled with the other genre.
# my_dict[genre][top] = 100 * |genre and top| / |top|
cols = ['Action', 'Comedy', 'Romance', 'Drama', 'Thriller']
# Membership masks of the common genres, built once instead of once per outer genre.
is_top = {top: y[top] == 1 for top in cols}
my_dict = {}
for genre in y.columns:
    has_genre = y[genre] == 1
    # Scale to a percentage before rounding: round(x, 5) * 100 reintroduces
    # floating-point noise (e.g. 13.677000000000001) after the rounding step.
    my_dict[genre] = {
        top: round(100 * (has_genre & mask).sum() / mask.sum(), 3)
        for top, mask in is_top.items()
        if top != genre
    }

In [196]:
# Render the co-occurrence percentages as a table: one column per outer key of
# my_dict, one row per inner key.
pd.DataFrame(my_dict)

Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
Comedy,13.677,12.258,6.677,,10.484,0.871,35.968,14.355,9.226,0.871,5.839,5.258,2.452,28.548,5.968,0.742,6.097,1.161,1.355
Romance,4.236,5.841,1.54,56.804,4.942,0.0,67.137,4.685,7.317,3.017,1.155,6.611,2.888,,3.081,0.578,6.226,3.081,1.091
Drama,11.543,7.353,1.656,27.98,16.412,0.527,,4.241,4.718,7.403,4.291,4.316,7.905,26.248,4.391,0.903,21.706,5.37,1.531
Thriller,37.466,12.238,0.644,10.145,34.085,0.215,46.43,0.537,4.455,1.986,21.954,0.054,20.666,5.207,14.707,0.805,,2.04,0.751
Action,,37.286,4.707,25.917,24.694,0.489,28.117,5.746,11.369,3.362,7.09,0.672,5.134,4.034,24.328,0.611,42.665,4.768,3.056


We now create a different matrix, which compares the 5 most common genres (rows) against all genres (columns). Each cell shows what percentage of the movies of the row genre are also labelled with the column genre. For instance, the cell in the Action row and Adventure column is 37.286%, which means that 37% of Action movies are also labelled under Adventure. This explains the correlations better and also uncovers relationships that the correlation matrix missed, such as the one between Drama and Romance: 67% of Romance movies are also labelled under Drama.

In [57]:
# Short alias for scikit-learn's train_test_split helper.
tts = model_selection.train_test_split

In [62]:
# Split features and targets into train and test parts; random_state is fixed at 42.
X_train, X_test, y_train, y_test = tts(X, y, random_state = 42)

In [63]:
# Alias for the CountVectorizer class (not an instance).
count_vec = feature_extraction.text.CountVectorizer

In [64]:
# Pipeline steps: a CountVectorizer followed by a logistic-regression classifier.
# NOTE(review): X has several columns and y has one column per genre -- confirm the
# fit step passes a single text column and handles the multi-label target.
struct = [
    ('c_vec', count_vec()),
    ('clf', linear_model.LogisticRegression(max_iter=1000)),
]

In [65]:
from sklearn.pipeline import Pipeline

In [66]:
# Assemble the steps into a pipeline; verbose=True is passed through to Pipeline.
pipe = Pipeline(steps=struct, verbose=True)

In [67]:
# Display the assembled pipeline.
pipe

Pipeline(steps=[('c_vec', CountVectorizer()),
                ('clf', LogisticRegression(max_iter=1000))],
         verbose=True)