# Data Analysis of Movies by Rating Relative to Age

In [148]:
import datetime as dt
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [None]:
# Data source: https://grouplens.org/datasets/movielens/20m/
# Successfully matched MD5 checksum with reference provided (data verified)

# The dataset directory can be overridden with the MOVIELENS_DIR environment
# variable so the notebook runs on other machines; the default is the path
# this notebook was originally written against.
DATA_DIR = os.environ.get("MOVIELENS_DIR", "/Users/Levient/Movie_Ratings/ml-20m")

moviesMstr = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))
ratingsMstr = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"))

In [21]:
# Keep only the columns this analysis uses:
#   movies  -> 'movieId' and 'title' (the title carries the year in parentheses)
#   ratings -> 'movieId' and 'rating'
# `.copy()` makes each result an independent frame, so adding columns to it
# later does not raise pandas' SettingWithCopyWarning.

movies = moviesMstr[["movieId", "title"]].copy()
ratings = ratingsMstr[["movieId", "rating"]].copy()

print('Movies\n', movies.head(), '\nRatings\n', ratings.head())

Movies
    movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995) 
Ratings
    movieId  rating
0        2     3.5
1       29     3.5
2       32     3.5
3       47     3.5
4       50     3.5


In [43]:
# Average all ratings per movie and add an 'average' column.
# `transform('mean')` returns one value per rating row, so every row of a
# movie carries that movie's mean rating. `assign` builds a new frame instead
# of writing into `ratings` (a selection from ratingsMstr) through `.loc`,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.

ratings = ratings.assign(
    average=ratings.groupby('movieId')['rating'].transform('mean')
)

print('Average Ratings for All Movies\n', ratings.head())

Average Ratings for All Movies
    movieId  rating   average
0        2     3.5  3.211977
1       29     3.5  3.952230
2       32     3.5  3.898055
3       47     3.5  4.053493
4       50     3.5  4.334372


In [71]:
# Collapse the per-rating rows down to one (movieId, average) row per movie;
# the individual 'rating' column is not carried over.

print('Before\n', ratings.shape)

movie_average_pairs = ratings[['movieId', 'average']]
average_ratings = movie_average_pairs.drop_duplicates()

print('After\n', average_ratings.shape)
print('Total movies', movies.shape)
print('Averages\n', average_ratings.head())

# Issue to explore: `movies` has more rows than `average_ratings`, i.e. more
# movies than per-movie averages (presumably some movies have no ratings at
# all - TODO confirm).

Before
 (20000263, 3)
After
 (26744, 2)
Total movies (27278, 3)
Averages
    movieId   average
0        2  3.211977
1       29  3.952230
2       32  3.898055
3       47  4.053493
4       50  4.334372


In [62]:
# Median and mean of the 'average' column.
# NOTE(review): `ratings` still has one row per individual rating, so each
# movie's average is counted once per rating it received. Use
# `average_ratings` instead if one value per movie is intended - confirm.

average_column = ratings["average"]
ratings_median = average_column.median()
ratings_mean = average_column.mean()

print("Median of Average Ratings\n", ratings_median, "\nMean of Average Ratings\n", ratings_mean)

Median of Average Ratings
 3.601982097186701 
Mean of Average Ratings
 3.5255285642995635


In [47]:
# The classify_age function extracts a year in parens from 'title' and returns a string (e.g. 'old') 

def classify_age(row):
    """Return the age class for one movie row.

    Passes `row` to `extract_year` and feeds the result straight into
    `assign_age`, returning whatever string `assign_age` produces.
    """
    return assign_age(extract_year(row))

# Exactly four digits enclosed directly in parentheses, e.g. the "1995" in
# "Toy Story (1995)". Written as a raw string so `\(` and `\d` reach the regex
# engine unchanged (in a plain string they are invalid escape sequences), and
# compiled once here rather than on every call.
_YEAR_IN_PARENS = re.compile(r'(?<=\()(\d{4})(?=\))')


def extract_year(row):
    """Return the first four-digit year found in parentheses in row['title'].

    Parameters:
        row: any mapping-like object supporting row['title'] (e.g. a
            DataFrame row).

    Returns:
        The four digits as a string (parentheses excluded), or None when the
        title holds no "(dddd)" group or is not a string (e.g. NaN or None).
    """
    title = row['title']
    # A missing title cannot be searched; treat it the same as "no year found".
    if not isinstance(title, str):
        return None
    match = _YEAR_IN_PARENS.search(title)
    return match.group(0) if match else None
  
def assign_age(year):
    """Map a release year to an age class.

    Parameters:
        year: the year as a string or int, or None when no year is known.

    Returns:
        "unknown" if `year` is None, "old" for years before 1970,
        "medium" for years before 1990, and "new" otherwise.

    Raises:
        ValueError: if `year` cannot be converted with int().
    """
    if year is None:
        return "unknown"
    # Compare plain integers: building a datetime would reject four-digit
    # values such as "0000", which the title pattern can still match.
    release_year = int(year)
    if release_year < 1970:
        return "old"
    if release_year < 1990:
        return "medium"
    return "new"

In [72]:
# Add an 'age' column with 4 classes: 'old', 'medium', 'new' and 'unknown'.
# `assign` builds a new frame rather than writing a column into `movies` in
# place; this avoids pandas' SettingWithCopyWarning (movies was selected from
# moviesMstr) and keeps the cell safe to re-run.

movies = movies.assign(age=movies.apply(classify_age, axis=1))

print(movies.tail())

# If time permits: come back and print samples of each!!

       movieId                          title  age
27273   131254   Kein Bund für's Leben (2007)  new
27274   131256  Feuer, Eis & Dosenbier (2002)  new
27275   131258             The Pirates (2014)  new
27276   131260            Rentun Ruusu (2001)  new
27277   131262               Innocence (2014)  new


In [163]:
# Number of titles in each age class
titles_by_age = movies.groupby('age')['title']
age_counts = titles_by_age.count()

print('Totals Counts of Each Age\n', age_counts)

Totals Counts of Each Age
 age
medium      4674
new        17118
old         5464
unknown       22
Name: title, dtype: int64


In [69]:
# Inner join of movies and their average ratings on the shared 'movieId'
# column. validate="1:1" makes pandas raise if either side repeats a movieId.

movie_ratings = pd.merge(
    movies,
    average_ratings,
    how="inner",
    on="movieId",
    validate="1:1",
)

print(movie_ratings.head())

   movieId                               title  age   average
0        1                    Toy Story (1995)  new  3.921240
1        2                      Jumanji (1995)  new  3.211977
2        3             Grumpier Old Men (1995)  new  3.151040
3        4            Waiting to Exhale (1995)  new  2.861393
4        5  Father of the Bride Part II (1995)  new  3.064592


In [112]:
# Order movies from highest to lowest average rating, so head() holds the
# best-rated rows and tail() the worst-rated ones.

movie_ratings = movie_ratings.sort_values('average', ascending=False)

lowest_rated = movie_ratings.tail()
highest_rated = movie_ratings.head()

print("Lowest by Rating\n", lowest_rated)
print("\nHighest by Rating\n", highest_rated)

Lowest by Rating
        movieId                         title     age  average
19755    97915     Lookin' to Get Out (1982)  medium      0.5
23186   111046  Golden Christmas 3, A (2012)     new      0.5
23185   111044    Golden Christmas, A (2009)     new      0.5
24207   115631    Alone for Christmas (2013)     new      0.5
24647   117630         Double Trouble (1992)     new      0.5

Highest by Rating
        movieId                            title  age  average
24643   117606                   Divorce (1945)  old      5.0
19258    95977               Junior Prom (1946)  old      5.0
12005    54326                Sierra, La (2005)  new      5.0
25239   121039  A Night for Dying Tigers (2010)  new      5.0
25240   121063  The House on 56th Street (1933)  old      5.0


In [113]:
def _percent_band(frame, start_pct, stop_pct):
    """Return the rows of `frame` lying between two percentage positions.

    Parameters:
        frame: DataFrame already sorted in the desired order.
        start_pct: integer percentage (0-100) where the band starts.
        stop_pct: integer percentage (0-100) where the band stops.

    Returns:
        frame.iloc[start:stop, :], where start is start_pct percent of the row
        count rounded down and stop is stop_pct percent rounded up.
    """
    total = len(frame)
    start = total * start_pct // 100
    stop = -(-total * stop_pct // 100)  # ceiling division using integers only
    return frame.iloc[start:stop, :]


print('Length of Total Movie Ratings List\n', movie_ratings.shape)

# Band boundaries are derived from the frame's length instead of hard-coded
# row offsets, so they stay correct if the number of rated movies changes.

# Top 20 percent
best = _percent_band(movie_ratings, 0, 20)
best_median = best["average"].median()
print('\nBest 20 percent (Median:', round(best_median, 2), ')\n', best.shape, '\n', best.head())

# Middle 20 percent
ok = _percent_band(movie_ratings, 40, 60)
ok_median = ok["average"].median()
print('\nMiddle 20 percent (Median:', round(ok_median, 2), ')\n', ok.shape, '\n', ok.tail())

# Worst 20 percent
worst = _percent_band(movie_ratings, 80, 100)
worst_median = worst["average"].median()
print('\nWorst 20 percent (Median:', round(worst_median, 2), ')\n', worst.shape, '\n', worst.tail())

Length of Total Movie Ratings List
 (26744, 4)

Best 20 percent (Median: 3.85 )
 (5349, 4) 
        movieId                            title  age  average
24643   117606                   Divorce (1945)  old      5.0
19258    95977               Junior Prom (1946)  old      5.0
12005    54326                Sierra, La (2005)  new      5.0
25239   121039  A Night for Dying Tigers (2010)  new      5.0
25240   121063  The House on 56th Street (1933)  old      5.0

Middle 20 percent (Median: 3.24 )
 (5350, 4) 
        movieId                                              title  age  \
248        251                                 Hunted, The (1995)  new   
2167      2252                                        Hero (1992)  new   
13767    69061                            Personal Effects (2009)  new   
11321    48214  Land of Plenty (Angst and Alienation in Americ...  new   
15404    78635  Under the North Star (Täällä Pohjantähden alla...  new   

        average  
248    3.050258  
2167  

In [132]:
# Extract 55 percent of our dataset randomly, from which to predict outcomes of remaining 45 percent.
# The shuffle uses a fixed seed so the split (and every result computed from
# it) is the same on each run.

TRAIN_FRACTION = 0.55
SPLIT_SEED = 42

data_array = movie_ratings.values
np.random.RandomState(SPLIT_SEED).shuffle(data_array)

# Number of rows used for learning, derived from the data instead of hard-coded
n_learning = int(len(data_array) * TRAIN_FRACTION)

# Column positions follow movie_ratings' column order: 3 is 'average', 2 is 'age'
X_learning = data_array[:n_learning, 3]
Y_learning = data_array[:n_learning, 2]

In [146]:
# "Teach" our machine about the relationships between 'average' and 'age'
# (but first we need to reshape our 1-D X np array into a 2-D array with a single column)

# Fit a support vector classifier that predicts 'age' from 'average'.
# reshape(-1, 1) turns the 1-D array of averages into a single-column 2-D array.
svc = SVC()
svc.fit(X_learning.reshape(-1, 1), Y_learning)

# Use every row that was not used for learning as test data. The boundary is
# taken from the size of the learning set so the two sets can never overlap
# or leave rows out if the split size changes.
n_learning = len(X_learning)
X = data_array[n_learning:, 3]
Y = data_array[n_learning:, 2]

predictions = svc.predict(X.reshape(-1, 1))

In [147]:
# Show predicted vs. actual age classes and the fraction predicted correctly
accuracy = accuracy_score(Y, predictions)

print("Predicted Results:", predictions)
print("\nActual Results:", Y)
print("\nAccuracy rating:  %f" % accuracy)

Predicted Results: ['new' 'new' 'new' ... 'new' 'new' 'new']

Actual Results: ['new' 'new' 'medium' ... 'new' 'new' 'medium']

Accuracy rating:  0.638222


In [155]:
# Per-class breakdown of the predictions: confusion matrix first, then the
# precision / recall / f1 report.
matrix = confusion_matrix(Y, predictions)
print(matrix)

report = classification_report(Y, predictions)
print(report)

[[   0 2056    0    0]
 [   0 7681    0    0]
 [   0 2292    0    0]
 [   0    6    0    0]]
             precision    recall  f1-score   support

     medium       0.00      0.00      0.00      2056
        new       0.64      1.00      0.78      7681
        old       0.00      0.00      0.00      2292
    unknown       0.00      0.00      0.00         6

avg / total       0.41      0.64      0.50     12035



  'precision', 'predicted', average, warn_for)


In [154]:
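# Warning emitted while computing the classification report:
# /anaconda3/lib/python3.6/site-packages/sklearn/metrics/classification.py:1135: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
#   'precision', 'predicted', average, warn_for)
# Cause: the classifier predicted 'new' for every test row (see the confusion matrix), so 'medium', 'old' and 'unknown' have no predicted samples.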