# CS 406 Movie Recommender

### Ryder McDowell

#### OSU Cascades

...

...

# Fetch Data

In [97]:
!curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip
!unzip -o ml-100k.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4808k  100 4808k    0     0  2526k      0  0:00:01  0:00:01 --:--:-- 2525k
Archive:  ml-100k.zip
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100

In [273]:
print("Training Data:")
!head -10 ./ml-100k/ua.base

print("\nTesting Data:")
!head -10 ./ml-100k/ua.test

Training Data:
1	1	5	874965758
1	2	3	876893171
1	3	4	878542960
1	4	3	876893119
1	5	3	889751712
1	6	5	887431973
1	7	4	875071561
1	8	1	875072484
1	9	5	878543541
1	10	3	875693118

Testing Data:
1	20	4	887431883
1	33	4	878542699
1	61	4	878542420
1	117	3	874965739
1	155	2	878542201
1	160	4	875072547
1	171	5	889751711
1	189	3	888732928
1	202	5	875072442
1	265	4	878542441


# Import Libraries

In [99]:
import csv, io, json
import numpy as np
from scipy.sparse import lil_matrix

# Load Data

In [280]:
def get_samples(csv_reader):
    samples = []
    for userId,movieId,rating,timestamp in csv_reader:
        samples.append({
            'userId': userId,
            'movieId': movieId,
            'rating': rating,
            'timestamp': timestamp
        })
        
    return samples
    
    
def get_maximums(samples):
    users = []
    movies = []
    for sample in samples:
        users.append(int(sample['userId']))
        movies.append(int(sample['movieId']))

    max_user_id = max(users)
    max_movie_id = max(movies)
    
    return max_user_id, max_movie_id


def get_matrix_shape(max_user_id, max_movie_id, samples):
    total_samples = len(samples)
    total_features = max_user_id + max_movie_id

    return total_samples, total_features


def fill_data(data, labels, samples):
    row = 0
        
    # Build matrix and labels
    for sample in samples:

        # One hot-encode userId and movieId at row
        user_index = int(sample['userId']) - 1
        movie_index = 943 + int(sample['movieId']) - 1    #!!

        data[row, user_index] = 1
        data[row, movie_index] = 1

        # Append binary to labels for whether user "enjoyed" movie
        if int(sample['rating']) >= 4:
            labels.append(1)
        else:
            labels.append(0)

        row = row + 1

    # Convert labels list to float 32
    labels = np.array(labels).astype('float32')
    
    return data, labels
    

def load_dataset(training_data_file_path, testing_data_file_path):
    # Training Data
    with open(training_data_file_path, 'r') as file:
        csv_reader = csv.reader(file, delimiter='\t')
        
        # Get all training samples in form of [{}, {}, ...]
        training_samples = get_samples(csv_reader)
        
        # Get maximum number of users and movies
        max_user_id, max_movie_id = get_maximums(training_samples)
        
        # Get shape of training matrix
        training_matrix_shape = get_matrix_shape(max_user_id, max_movie_id, training_samples)
        
        # Initialize training data and labels structures
        training_data = lil_matrix(training_matrix_shape).astype('float32')
        training_labels = []

        # Fill training data and labels structures with sample training data 
        training_data, training_labels = fill_data(training_data, training_labels, training_samples)
        
    # Testing Data
    with open(testing_data_file_path, 'r') as file:
        csv_reader = csv.reader(file, delimiter='\t')
        
        # Get all testing samples in form of [{}, {}, ...]
        testing_samples = get_samples(csv_reader)
        
        #Get shape of testing matrix
        testing_matrix_shape = get_matrix_shape(max_user_id, max_movie_id, testing_samples)
        
        # Initialize testing data and labels structures
        testing_data = lil_matrix(testing_matrix_shape).astype('float32')
        testing_labels = []
        
        # Fill testing data and labels structurs with sample testing data
        testing_data, testing_labels = fill_data(testing_data, testing_labels, testing_samples)
        
    
    
    return (training_data, training_labels), (testing_data, testing_labels)

In [275]:
training_data_file_path = './ml-100k/ua.base'
testing_data_file_path = './ml-100k/ua.test'

(training_data, training_labels), (testing_data, testing_labels) = load_dataset(training_data_file_path, testing_data_file_path)

# Summary Statistics

### Shapes

In [276]:
print("(Ratings, Features)")
print(training_data.shape)
print(training_labels.shape)

print(testing_data.shape)
print(testing_labels.shape)

(Ratings, Features)
(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


### Insight

In [277]:
print(training_data[1000:1005])
print(training_labels[1000:1005])

print(training_data[1000:1005])
print(testing_labels[1000:1005])

  (0, 6)	1.0
  (0, 1493)	1.0
  (1, 6)	1.0
  (1, 1494)	1.0
  (2, 6)	1.0
  (2, 1495)	1.0
  (3, 6)	1.0
  (3, 1496)	1.0
  (4, 6)	1.0
  (4, 1497)	1.0
[0. 1. 0. 0. 1.]
  (0, 6)	1.0
  (0, 1493)	1.0
  (1, 6)	1.0
  (1, 1494)	1.0
  (2, 6)	1.0
  (2, 1495)	1.0
  (3, 6)	1.0
  (3, 1496)	1.0
  (4, 6)	1.0
  (4, 1497)	1.0
[0. 0. 0. 0. 0.]


### Label Balance

In [278]:
print("{:0.2f}% Movies Rated Above 3 in Training Data".format(np.count_nonzero(training_labels) / training_data.shape[0] * 100))
print("{:0.2f}% Movies Rated Above 3 in Testing Data".format(np.count_nonzero(testing_labels) / testing_data.shape[0] * 100))

55.10% Movies Rated Above 3 in Training Data
58.00% Movies Rated Above 3 in Testing Data


### Sparcity

In [279]:
encoded_values = training_data.shape[0] * 2
total_values = training_data.shape[0] * training_data.shape[1]

print("{:0.5f}% Sparse".format(100 - (encoded_values / total_values)))

99.99924% Sparse
