In [1]:
from pykalman import KalmanFilter
import numpy as np
import pandas as pd
import sys
import matplotlib
import matplotlib.pyplot as plt
from skimage.color import lab2rgb
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import skimage
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from functools import reduce
import statsmodels.api as sm
lowess = sm.nonparametric.lowess
from scipy import stats

  from pandas.core import datetools


In [2]:
def to_timestamp(dateTime):
    """Convert a datetime-like value to a POSIX timestamp (seconds).

    Parameters
    ----------
    dateTime : object with a ``timestamp()`` method, or a missing value
        For example ``pandas.Timestamp`` or ``datetime.datetime``.
        ``None``, ``NaT`` and ``NaN`` are treated as missing.

    Returns
    -------
    float
        ``dateTime.timestamp()``, or ``NaN`` when ``dateTime`` is missing.
    """
    # NaT.timestamp() raises ValueError, which would abort a whole
    # Series.apply() on the first missing date. Returning NaN instead lets a
    # later .dropna() discard those rows.
    if pd.isnull(dateTime):
        return float('nan')
    return dateTime.timestamp()

def map_genre(row, genre_table=None):
    """Translate a collection of genre codes into their genre labels.

    Parameters
    ----------
    row : iterable of genre codes, or a missing value
        The codes to look up. ``None`` or a float (such as ``NaN`` for a
        missing cell) yields an empty list instead of raising ``TypeError``.
    genre_table : pandas.DataFrame, optional
        Lookup table with ``'wikidata_id'`` and ``'genre_label'`` columns.
        Defaults to the module-level ``genres`` frame, so existing
        ``Series.apply(map_genre)`` calls keep working.

    Returns
    -------
    list
        Labels in the order of the codes in ``row``; a code that matches
        several table rows contributes every matching label, and a code with
        no match contributes nothing.
    """
    if genre_table is None:
        # Resolved at call time so the function can be defined before the
        # lookup table is loaded.
        genre_table = genres
    if row is None or isinstance(row, float):
        return []

    labels = []
    for genre_code in row:
        is_match = genre_table['wikidata_id'] == genre_code
        labels.extend(genre_table.loc[is_match, 'genre_label'].values)
    return labels

In [3]:
# Load the Wikidata movie records (one JSON object per line).
# 'publication_date' is parsed into datetimes so it can be converted to a
# numeric timestamp later. orient='records' is the documented spelling; the
# previous 'record' only worked through pandas' fallback branch.
wikidata = pd.read_json(
    'movies/data/wikidata-movies.json.gz',
    orient='records',
    lines=True,
    encoding="utf8",
    convert_dates=['publication_date'],
)
# Genre lookup table: map_genre() reads its 'wikidata_id' and 'genre_label' columns.
genres = pd.read_json('movies/data/genres.json.gz', orient='records', lines=True, encoding="utf8")

In [4]:
# Keep only the movies that have a 'made_profit' value, renumbering rows from 0.
wikidata = wikidata.dropna(subset=['made_profit']).reset_index(drop=True)

In [5]:
# Add two derived columns:
#   genre_names           - genre labels looked up from each row's genre codes
#   publication_timestamp - publication date as a number, usable as a regression input
wikidata = wikidata.assign(
    genre_names=wikidata['genre'].apply(map_genre),
    publication_timestamp=wikidata['publication_date'].apply(to_timestamp),
)

In [6]:
# Load the Rotten Tomatoes ratings (one JSON object per line).
# orient='records' is the documented spelling; 'record' only worked through
# pandas' fallback branch.
rotten_tomatoes = pd.read_json('movies/data/rotten-tomatoes.json.gz', orient='records', lines=True)

In [None]:
# Show the column labels of the Rotten Tomatoes table.
rotten_tomatoes.columns

Index(['audience_average', 'audience_percent', 'audience_ratings',
       'critic_average', 'critic_percent', 'imdb_id', 'rotten_tomatoes_id'],
      dtype='object')

In [None]:
# Load the OMDb records (one JSON object per line).
# orient='records' is the documented spelling; 'record' only worked through
# pandas' fallback branch.
omdb = pd.read_json('movies/data/omdb-data.json.gz', orient='records', lines=True)

In [None]:
#omdb

In [None]:
# Build the combined table in one step, always starting from `wikidata`, so
# the cell can be re-run safely. The previous `combined = combined.join(...)`
# cell raised on a second run because the OMDb columns were already present.
#   1. attach Rotten Tomatoes ratings by 'rotten_tomatoes_id'; any Rotten
#      Tomatoes column whose name already exists on the left gets an '_rt' suffix
#   2. attach OMDb data by 'imdb_id'
combined = (
    wikidata
    .join(rotten_tomatoes.set_index('rotten_tomatoes_id'), on='rotten_tomatoes_id', rsuffix='_rt')
    .join(omdb.set_index('imdb_id'), on='imdb_id')
)

In [None]:
# Preview the joined table; the first rows are enough to check the columns.
combined.head()

In [None]:
# Critic average against audience average. The audience values are doubled,
# presumably to bring a 0-5 scale onto the critics' 0-10 scale -- TODO confirm.
plt.title('Critic vs. Audience Average Rating')
plt.xlabel('Critic average')
plt.ylabel('Audience average (x2)')
plt.scatter(combined['critic_average'], combined['audience_average'] * 2)
plt.show()

In [None]:
# Restrict to movies that have both an audience and a critic average.
test3 = combined[combined['audience_average'].notnull() & combined['critic_average'].notnull()]

# Normality test on the audience averages: p < 0.05 rejects normality
# (the original author observed p < 0.05, i.e. not normal).
print(stats.normaltest(test3['audience_average']).pvalue)

# Mann-Whitney U test, critic averages vs. doubled audience averages.
# p < 0.05 -> the two samples differ in location; p >= 0.05 -> no difference
# can be concluded. `alternative` is explicit because the default has changed
# across SciPy versions (older releases returned a one-sided p-value).
# NOTE(review): both samples come from the same movies (paired data), while
# Mann-Whitney assumes independent samples -- consider stats.wilcoxon.
print(stats.mannwhitneyu(test3['critic_average'], test3['audience_average'] * 2, alternative='two-sided').pvalue)

In [None]:
# chi2_contingency expects a table of observed *counts*. Passing the raw
# rating values (as before) treats each rating as a frequency, which makes
# the result meaningless. Instead, bin both ratings to the nearest whole
# number and count the movies in each (critic bin, audience bin) cell.
# NOTE(review): sparse cells with small expected counts weaken the chi-square
# approximation -- consider coarser bins if `expected` has many values below 5.
rating_counts = pd.crosstab(test3['critic_average'].round(), test3['audience_average'].round())
chi2, p, dof, expected = stats.chi2_contingency(rating_counts.values)
print(p) # p < 0.05 -> critic and audience ratings are not independent
print(expected)

In [None]:
# chi2, p, dof, expected = stats.chi2_contingency([test3['critic_average'].values, test3['genre'].values])
# print(p) #>0.05, therefore one has no effect on the other?
# print(expected)

In [None]:
#combined.groupby('genre_names')
#pd.value_counts(combined.groupby('genre_names'), sort=False)
## TODO: Count the distribution of genres and plot it as a histogram

# Have average ratings changed over time?

In [None]:
# Linear regression of critic average on publication time (numeric timestamp).
# Rows missing any of the three columns are dropped first.
critic_average_test = combined[['publication_date','publication_timestamp','critic_average']].dropna()
fit = stats.linregress(critic_average_test['publication_timestamp'], critic_average_test['critic_average'])
# Use assign() rather than setting a column on the dropna() result, which is
# the pattern that triggers pandas' SettingWithCopyWarning.
critic_average_test = critic_average_test.assign(
    prediction=critic_average_test['publication_timestamp'] * fit.slope + fit.intercept
)
# p < 0.05 -> the slope differs significantly from zero. The original author
# concluded that critic ratings are decreasing; the direction comes from the
# sign of fit.slope, not from the p-value.
print(fit.pvalue)

In [None]:
# Critic averages (blue dots) with the fitted regression line (red).
plt.title('Critic Average Rating over Time')
plt.xlabel('Publication date')
plt.ylabel('Critic average')
plt.plot(critic_average_test['publication_date'], critic_average_test['critic_average'], 'b.', alpha=0.5)
plt.plot(critic_average_test['publication_date'], critic_average_test['prediction'], 'r-', linewidth=3)
plt.show()

In [None]:
# Histogram of regression residuals (actual minus predicted critic average),
# used to eyeball the normality assumption behind the linregress p-value.
plt.title('Residuals of the Critic Rating Fit')
plt.xlabel('Residual (actual - predicted)')
plt.ylabel('Number of movies')
plt.hist(np.subtract(critic_average_test['critic_average'],critic_average_test['prediction']))
plt.show()
# Author's reading: this is close enough to being normal.
# A steeper drop-off is expected on the high end because the average critic
# rating is higher than the middle rating, 5.

In [None]:
# Linear regression of audience average on publication time (numeric timestamp).
# Rows missing any of the three columns are dropped first.
audience_average_test = combined[['publication_date','publication_timestamp','audience_average']].dropna()
fit = stats.linregress(audience_average_test['publication_timestamp'], audience_average_test['audience_average'])
# Use assign() rather than setting a column on the dropna() result, which is
# the pattern that triggers pandas' SettingWithCopyWarning.
audience_average_test = audience_average_test.assign(
    prediction=audience_average_test['publication_timestamp'] * fit.slope + fit.intercept
)
# p > 0.05 -> no evidence that the slope differs from zero. The original
# author observed p > 0.05, i.e. we cannot conclude that audience ratings are
# changing over time.
print(fit.pvalue)

In [None]:
# Audience averages (blue dots) with the fitted regression line (red).
plt.title('Audience Average Rating over Time')
plt.xlabel('Publication date')
plt.ylabel('Audience average')
plt.plot(audience_average_test['publication_date'], audience_average_test['audience_average'], 'b.', alpha=0.5)
plt.plot(audience_average_test['publication_date'], audience_average_test['prediction'], 'r-', linewidth=3)
plt.show()

In [None]:
# Histogram of regression residuals (actual minus predicted audience average),
# used to eyeball the normality assumption behind the linregress p-value.
plt.title('Residuals of the Audience Rating Fit')
plt.xlabel('Residual (actual - predicted)')
plt.ylabel('Number of movies')
plt.hist(np.subtract(audience_average_test['audience_average'],audience_average_test['prediction']))
plt.show()
# Author's reading: this is close enough to being normal.
# A steeper drop-off is expected on the high end because the average audience
# rating is higher than the middle of the rating scale.
# NOTE(review): the original comment gave the middle rating as 5, but other
# cells double audience_average before comparing it with critic ratings, which
# suggests the audience scale's midpoint is not 5 -- confirm.

# Do average audience ratings change based on a movie's popularity?

In [None]:
# Linear regression of audience average on the number of audience ratings
# (used here as the popularity measure).
audience_ratings_test = combined[['publication_date','publication_timestamp','audience_average','audience_ratings']].dropna()
# Movies with at least this many ratings are treated as outliers and removed.
MAX_AUDIENCE_RATINGS = 10000000
audience_ratings_test = audience_ratings_test[audience_ratings_test['audience_ratings'] < MAX_AUDIENCE_RATINGS]
fit = stats.linregress(audience_ratings_test['audience_ratings'], audience_ratings_test['audience_average'])
# Use assign() rather than setting a column on the mask-filtered frame, which
# is the pattern that triggers pandas' SettingWithCopyWarning.
audience_ratings_test = audience_ratings_test.assign(
    prediction=audience_ratings_test['audience_ratings'] * fit.slope + fit.intercept
)
# p < 0.05 -> the slope differs significantly from zero. The original author
# concluded that higher averages go with more popular movies; the direction
# comes from the sign of fit.slope, not from the p-value.
print(fit.pvalue)

In [None]:
# Audience average against number of ratings (blue dots) with the fitted
# regression line (red).
plt.title('Audience Average Rating vs. Number of Ratings')
plt.xlabel('Number of audience ratings')
plt.ylabel('Audience average')
plt.plot(audience_ratings_test['audience_ratings'], audience_ratings_test['audience_average'], 'b.', alpha=0.5)
plt.plot(audience_ratings_test['audience_ratings'], audience_ratings_test['prediction'], 'r-', linewidth=3)
plt.show()

In [None]:
# Histogram of regression residuals (actual minus predicted audience average)
# for the popularity fit, used to eyeball the normality assumption behind the
# linregress p-value.
plt.title('Residuals of the Audience Rating vs. Popularity Fit')
plt.xlabel('Residual (actual - predicted)')
plt.ylabel('Number of movies')
plt.hist(np.subtract(audience_ratings_test['audience_average'],audience_ratings_test['prediction']))
plt.show()
# Author's reading: this is close enough to being normal.
# A steeper drop-off is expected on the high end because the average audience
# rating is higher than the middle of the rating scale.
# NOTE(review): the original comment gave the middle rating as 5, but other
# cells double audience_average before comparing it with critic ratings, which
# suggests the audience scale's midpoint is not 5 -- confirm.