# Data Analysis of Movies by Rating Relative to Age

In [292]:
from IPython.core.interactiveshell import InteractiveShell
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.svm import SVC
import datetime as dt
import numpy as np
import pandas as pd
import re

In [220]:
 InteractiveShell.ast_node_interactivity = "all" #Pretty-prints output function calls without need for print()

In [174]:
# Data source: https://grouplens.org/datasets/movielens/20m/
# Successfully matched MD5 checksum with reference provided (data verified)

moviesMstr = pd.read_csv("/Users/Levient/Movie_Ratings/ml-20m/movies.csv")
ratingsMstr = pd.read_csv("/Users/Levient/Movie_Ratings/ml-20m/ratings.csv")

In [223]:
# All rows for 'movieId' and 'title(year)'

movies = moviesMstr[["movieId", "title"]]

# All rows for 'movieId' and 'rating'

ratings = ratingsMstr[["movieId", "rating"]]

movies.head()
ratings.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


Unnamed: 0,movieId,rating
0,2,3.5
1,29,3.5
2,32,3.5
3,47,3.5
4,50,3.5


In [224]:
# Average all ratings per movie and add 'average' column

# The below ratings.loc function yields a 'chained_assignment' warning (not an error), despite use of df.loc[]. 
# The issue should probably be revisited: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas

pd.options.mode.chained_assignment = None  # default='warn'

ratings.loc[:,'average'] = ratings.groupby('movieId')['rating'].transform('mean')

print('Average Ratings for All Movies')
ratings.head()

Average Ratings for All Movies


Unnamed: 0,movieId,rating,average
0,2,3.5,3.211977
1,29,3.5,3.95223
2,32,3.5,3.898055
3,47,3.5,4.053493
4,50,3.5,4.334372


In [227]:
# Remove duplicate ratings rows from ratings, drop rating column

print('Before\n', ratings.shape)

average_ratings = ratings.drop_duplicates(subset=['movieId', 'average']).drop(columns=['rating'])

print('After\n', average_ratings.shape)
print('Total movies\n', movies.shape)
print('Averages')
average_ratings.head()

# Issue to explore: There are more movies than there are average reviews. 

Before
 (20000263, 3)
After
 (26744, 2)
Total movies
 (27278, 2)
Averages


Unnamed: 0,movieId,average
0,2,3.211977
1,29,3.95223
2,32,3.898055
3,47,4.053493
4,50,4.334372


In [178]:
# Find median (and mean) of all ratings

ratings_median = ratings["average"].median()
ratings_mean = ratings["average"].mean()

print("Median of Average Ratings\n", ratings_median, "\nMean of Average Ratings\n", ratings_mean)

Median of Average Ratings
 3.601982097186701 
Mean of Average Ratings
 3.5255285642995635


In [212]:
# The classify_age function extracts a year in parens from 'title(year)' and returns a string (e.g. 'old') 

def classify_age(row):
    year = extract_year(row)
    return assign_age(year)

def extract_year(row):
  year_in_parens = re.compile('(?<=\()(\d{4})(?=\))')
  result = year_in_parens.search(row['title'])
  if result:
    return result.group(0)
  else: 
    return None
  
def assign_age(year):
    if year == None:
        return "unknown"
    year = dt.datetime(int(year), 1, 1)
    if year < dt.datetime(1970, 1, 1):
        return "old"
    elif year < dt.datetime(1990, 1, 1):
        return "medium"
    else:
        return "new"

In [235]:
# Applying classify_age returns a new column 'age' with 4 classes: 'old', 'medium', 'new' and 'unknown'

movies['age'] = movies.apply(classify_age, axis=1)

movies.tail()

Unnamed: 0,movieId,title,age
27273,131254,Kein Bund für's Leben (2007),new
27274,131256,"Feuer, Eis & Dosenbier (2002)",new
27275,131258,The Pirates (2014),new
27276,131260,Rentun Ruusu (2001),new
27277,131262,Innocence (2014),new


In [244]:
# Calculate number and percentage of each age

age_counts = movies.groupby('age')['title'].count()
percentages = age_counts.groupby(level=0).apply(lambda x: 100 * x / 26744)

print('Totals Counts of Each Age\n', age_counts)
print('\nPercentages of Each Age\n', percentages)

Totals Counts of Each Age
 age
medium      4674
new        17118
old         5464
unknown       22
Name: title, dtype: int64

Percentages of Each Age
 age
medium     17.476817
new        64.006880
old        20.430751
unknown     0.082261
Name: title, dtype: float64


In [278]:
# Inner join movies and ratings by movieId. Validate one-to-one relationship.

movie_ratings = pd.merge(movies, average_ratings, on="movieId", validate="1:1")

movie_ratings.head()

Unnamed: 0,movieId,title,age,average
0,1,Toy Story (1995),new,3.92124
1,2,Jumanji (1995),new,3.211977
2,3,Grumpier Old Men (1995),new,3.15104
3,4,Waiting to Exhale (1995),new,2.861393
4,5,Father of the Bride Part II (1995),new,3.064592


In [239]:
# Sorting movies by average rating

movie_ratings = movie_ratings.sort_values(by=['average'], ascending=False)

movie_ratings.head()
movie_ratings.tail()

Unnamed: 0,movieId,title,age,average
19152,95517,"Barchester Chronicles, The (1982)",medium,5.0
21842,105846,Only Daughter (2013),new,5.0
17703,89133,Boys (Drenge) (1977),medium,5.0
21656,105187,Linotype: The Film (2012),new,5.0
21658,105191,Rocaterrania (2009),new,5.0


Unnamed: 0,movieId,title,age,average
26465,129784,Xuxa in Crystal Moon (1990),new,0.5
18534,92479,Kisses for My President (1964),old,0.5
26475,129834,Tom and Jerry: The Lost Dragon (2014),new,0.5
24207,115631,Alone for Christmas (2013),new,0.5
25043,119909,Sharpe's Eagle (1993),new,0.5


In [242]:
print('Length of Total Movie Ratings List\n', movie_ratings.shape)

# Top 20 percent
best = movie_ratings.iloc[:5349,:]
best_median = best["average"].median()
print('\nBest 20 percent (Median:', round(best_median, 2), ')\n', best.shape) 
best.head()

# Middle 20 percent
ok = movie_ratings.iloc[10697:16047,:]
ok_median = ok["average"].median()
print('\nMiddle 20 percent (Median:', round(ok_median, 2), ')\n', ok.shape)
ok.tail()

# Worst 20 percent
worst = movie_ratings.iloc[21395:,:]
worst_median = worst["average"].median()
print('\nWorst 20 percent (Median:', round(worst_median, 2), ')\n', worst.shape)
worst.tail()

Length of Total Movie Ratings List
 (26744, 4)

Best 20 percent (Median: 3.85 )
 (5349, 4)


Unnamed: 0,movieId,title,age,average
19152,95517,"Barchester Chronicles, The (1982)",medium,5.0
21842,105846,Only Daughter (2013),new,5.0
17703,89133,Boys (Drenge) (1977),medium,5.0
21656,105187,Linotype: The Film (2012),new,5.0
21658,105191,Rocaterrania (2009),new,5.0



Middle 20 percent (Median: 3.24 )
 (5350, 4)


Unnamed: 0,movieId,title,age,average
248,251,"Hunted, The (1995)",new,3.050258
2167,2252,Hero (1992),new,3.050031
21239,103651,Tai Chi Hero (2012),new,3.05
9599,30890,"Keys to the House, The (Chiavi di casa, Le) (2...",new,3.05
18856,94061,Madhouse (1974),medium,3.05



Worst 20 percent (Median: 2.3 )
 (5349, 4)


Unnamed: 0,movieId,title,age,average
26465,129784,Xuxa in Crystal Moon (1990),new,0.5
18534,92479,Kisses for My President (1964),old,0.5
26475,129834,Tom and Jerry: The Lost Dragon (2014),new,0.5
24207,115631,Alone for Christmas (2013),new,0.5
25043,119909,Sharpe's Eagle (1993),new,0.5


In [280]:
# Check for null values in our dataframe
movie_ratings.isnull().values.any()

False

In [295]:
skFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=None)

X = movie_ratings[:][:,3]
y = movie_ratings[:][:,2]

# X is the feature set and y is the target
# Enumerate splits
for train, test in kfold.split(movie_ratings):
    print('train: %s, test: %s' % (train, test))
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]

train: [    0     1     3 ... 26741 26742 26743], test: [    2    11    19 ... 26720 26721 26723]
train: [    1     2     3 ... 26741 26742 26743], test: [    0     4    13 ... 26718 26719 26730]
train: [    0     1     2 ... 26741 26742 26743], test: [    6     7    22 ... 26716 26737 26738]
train: [    0     1     2 ... 26741 26742 26743], test: [    3     5     8 ... 26733 26735 26739]
train: [    0     1     2 ... 26739 26742 26743], test: [   17    36    55 ... 26736 26740 26741]
train: [    0     2     3 ... 26740 26741 26742], test: [    1    10    21 ... 26712 26727 26743]
train: [    0     1     2 ... 26741 26742 26743], test: [    9    47    64 ... 26726 26728 26731]
train: [    0     1     2 ... 26740 26741 26743], test: [   14    15    18 ... 26725 26734 26742]


In [285]:
# Extract 55 percent of our dataset randomly, from which to predict outcomes of remaining 45 percent

data_array = np.random.shuffle(movie_ratings)

X_learning = data_array[:14709][:,3]
Y_learning = data_array[:14709][:,2]

In [288]:
# "Teach" our machine (SVC) about the relationships between 'average' and 'age'
# (but first we need to reshape our X np_array into a 1-D array)

svc = SVC()
svc.fit(X_learning.reshape(-1, 1), Y_learning)

# Use the remaining 45 percent of our data as test data
X = data_array[-12035:][:,3]
Y = data_array[-12035:][:,2]

predictions = svc.predict(X.reshape(-1, 1))


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [287]:
print("Predicted Results:", predictions)
print("\nActual Results:", Y)
print("\nAccuracy rating:  %f" % (accuracy_score(Y, predictions)))

Predicted Results: ['new' 'new' 'new' ... 'new' 'new' 'new']

Actual Results: ['old' 'medium' 'old' ... 'new' 'new' 'new']

Accuracy rating:  0.629331


In [207]:
print(confusion_matrix(Y, predictions))
print(classification_report(Y, predictions))

[[   0 2027    0    0]
 [   0 7574    0    0]
 [   0 2424    0    0]
 [   0   10    0    0]]
             precision    recall  f1-score   support

     medium       0.00      0.00      0.00      2027
        new       0.63      1.00      0.77      7574
        old       0.00      0.00      0.00      2424
    unknown       0.00      0.00      0.00        10

avg / total       0.40      0.63      0.49     12035



  'precision', 'predicted', average, warn_for)


In [189]:
# /anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
#   'precision', 'predicted', average, warn_for)