# Preprocess the data

Create the following user–movie rating matrix:

*   Rows: user IDs
*   Columns: movie IDs
*   Cell: the rating the user gave the movie (0 if the user has not rated it)


In [1]:
# Install PySpark and a Java 8 JDK via shell commands (Colab-style environment).
# The recorded output of this cell shows pyspark 3.1.1 / py4j 0.10.9 were installed;
# the version is not pinned here, so a fresh run may pull a newer release.
!pip install pyspark
!apt install openjdk-8-jdk-headless -qq

import os
# Point JAVA_HOME at the JDK installed above so Spark can locate the JVM.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 18.5MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=bcd2f36f583c7bb48bb8a60a5e919b4bea2d03d2f2c51717c274e8c1b5f25acd
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1
The 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.decomposition import SparsePCA

import scipy.sparse as sparse

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from datetime import datetime
import os

import json

In [3]:
# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the live one when this cell is re-run.
# Calling the SparkContext constructor a second time in the same kernel raises
# because only one context may be active at a time.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create (or fetch) the session; it attaches to the context created above.
spark = SparkSession.builder.getOrCreate()

In [4]:
# Mount Google Drive at /content/drive (Colab only) so the dataset files below are readable.
# force_remount=True presumably re-mounts even if a mount already exists - confirm in the Colab docs.
from google.colab import drive
drive.mount("/content/drive",force_remount=True)

Mounted at /content/drive


In [5]:
def dateparse(x):
    """Format a Unix timestamp (seconds since epoch, UTC) as 'YYYY-MM-DD HH:MM:SS'."""
    return pd.to_datetime(int(x), unit='s').strftime('%Y-%m-%d %H:%M:%S')


# Input files and output artefacts all live in one Drive folder.
files_path = '/content/drive/MyDrive/CSE547_Final_Project/ml-100k'
ratings_file = os.path.join(files_path, "ratings.csv")
movies_file = os.path.join(files_path, "movies.csv")
user_movie_ratings_matrix = os.path.join(files_path, "user_movie_ratings_matrix_2.csv")
user_to_idx_file = os.path.join(files_path, "user_to_idx.json")
movie_to_idx_file = os.path.join(files_path, "movie_to_idx.json")

# `read_csv(date_parser=...)` is deprecated in recent pandas, so the epoch-second
# column is converted after loading; the result is the same naive-UTC datetime column.
ratings_df = pd.read_csv(ratings_file)
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')
movies_df = pd.read_csv(movies_file)

In [6]:
# Preview the first rows of the loaded ratings (timestamp already parsed to a datetime).
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30 18:45:03
1,1,3,4.0,2000-07-30 18:20:47
2,1,6,4.0,2000-07-30 18:37:04
3,1,47,5.0,2000-07-30 19:03:35
4,1,50,5.0,2000-07-30 18:48:51


In [7]:
# The timestamp is not used for the utility matrix, so drop it.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
ratings_df = ratings_df.drop(columns=['timestamp'], errors='ignore')

In [8]:
# Confirm the timestamp column is gone: only userId, movieId and rating remain.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [9]:
# Select features and target by name rather than by position, so the split does not
# silently pick the wrong columns if the column order changes or the timestamp
# column is still present. create_utility_matrix's default `formatizer` expects
# userId at position 0 and movieId at position 1 of X.
X = ratings_df[['userId', 'movieId']]
y = ratings_df['rating']

In [10]:
# Hold out 20% of the ratings as the test set (fixed seed for reproducibility).
# NOTE: X and y are rebound to the remaining 80%, so re-running this cell on its own
# shrinks them again - re-run from the cell that defines X and y instead.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

In [11]:
# Split the remaining 80% in half: train and validation each end up with ~40% of all ratings.
X_train, X_validate, y_train, y_validate = train_test_split(X, y, test_size=0.5, random_state=72)

In [12]:
# Report the train / validate / test sizes: features on the first line, targets on the second.
for parts in ((X_train, X_validate, X_test), (y_train, y_validate, y_test)):
    print(*(part.shape for part in parts))

(40334, 2) (40334, 2) (20168, 2)
(40334,) (40334,) (20168,)


In [13]:
# Peek at the training features; the index still carries the original row labels of ratings_df.
X_train.head()

Unnamed: 0,userId,movieId
40534,274,74532
97858,606,3148
68593,447,257
56612,376,2571
36416,249,522


In [14]:
# Peek at the training targets; the index labels match those of X_train.
y_train.head()

40534    2.5
97858    4.0
68593    3.0
56612    3.5
36416    4.0
Name: rating, dtype: float64

In [15]:
def create_utility_matrix(X, y, formatizer = {'userId':0, 'movieId': 1}):
    """Build a dense user x movie rating matrix from (user, movie) pairs.

    Parameters
    ----------
    X : pandas.DataFrame
        One row per rating. The user id and movie id are read from the
        column *positions* given by ``formatizer``.
    y : pandas.Series or array-like
        Ratings for the rows of ``X``. A Series whose index differs from
        ``X.index`` is aligned to ``X`` by label; anything else is taken
        positionally.
    formatizer : dict
        Maps ``'userId'`` and ``'movieId'`` to their column positions in
        ``X``. Only read, never modified.

    Returns
    -------
    matrix : numpy.ndarray, shape (n_users, n_items + 1)
        Column 0 holds the original user id of each row. Column ``j + 1``
        holds the rating for the item with index ``j`` (0.0 where the user
        has no rating for it in ``X``).
    users_index : dict
        user id -> row position in ``matrix``.
    items_index : dict
        movie id -> item index (its column in ``matrix`` is index + 1).

    Raises
    ------
    ValueError
        If ``y`` does not contain exactly one rating per row of ``X``.

    Notes
    -----
    Ids are numbered in ascending order, so the mappings are deterministic
    and do not depend on set iteration order.
    """
    user_col = formatizer['userId']
    item_col = formatizer['movieId']

    # .tolist() yields plain Python scalars, so the mapping keys stay JSON-serialisable.
    user_ids = X.iloc[:, user_col].tolist()
    item_ids = X.iloc[:, item_col].tolist()

    # Line the ratings up with the rows of X (by label when y is a differently ordered Series).
    if isinstance(y, pd.Series) and not y.index.equals(X.index):
        y = y.reindex(X.index)
    ratings = np.asarray(y, dtype=float)
    if len(ratings) != len(user_ids):
        raise ValueError(
            "y must contain exactly one rating per row of X "
            f"(got {len(ratings)} ratings for {len(user_ids)} rows)"
        )

    users_index = {user: pos for pos, user in enumerate(sorted(set(user_ids)))}
    items_index = {item: pos for pos, item in enumerate(sorted(set(item_ids)))}

    matrix = np.zeros((len(users_index), len(items_index) + 1))

    # Translate every rating row into its (row, column) target once, then fill
    # the matrix with two vectorised assignments instead of a Python-level loop.
    row_pos = np.fromiter((users_index[u] for u in user_ids),
                          dtype=np.intp, count=len(user_ids))
    col_pos = np.fromiter((items_index[m] + 1 for m in item_ids),
                          dtype=np.intp, count=len(item_ids))

    matrix[row_pos, 0] = user_ids        # column 0 carries the original user id
    matrix[row_pos, col_pos] = ratings   # item columns are shifted right by one

    return matrix, users_index, items_index

In [16]:
# Build the training utility matrix together with the id -> position lookups.
X_train_u_matrix, X_train_user_idx, X_train_item_idx = create_utility_matrix(X_train, y_train)

# Wrap it in a DataFrame, label column 0 (the original user id) and write it to Drive.
X_train_u_matrix_df = pd.DataFrame(X_train_u_matrix).rename(columns={0: 'userId'})
X_train_u_matrix_df.to_csv(user_movie_ratings_matrix, index=False)

In [18]:
# Persist the userId -> row-position lookup.
# JSON object keys are always strings, so the ids come back as str when this file is loaded.
with open(user_to_idx_file, "w") as f1:
    f1.write(json.dumps(X_train_user_idx, indent=4))

In [19]:
# Persist the movieId -> item-index lookup.
# JSON object keys are always strings, so the ids come back as str when this file is loaded.
with open(movie_to_idx_file, "w") as f2:
    f2.write(json.dumps(X_train_item_idx, indent=4))

In [20]:
# Spot check: locate user 274 / movie 74532 through the lookups and show the stored rating.
# Item columns sit one to the right of their index because column 0 is the userId.
user_row = X_train_user_idx[274]
movie_col = X_train_item_idx[74532] + 1
X_train_u_matrix_df.iloc[user_row, movie_col]

2.5

In [21]:
# Preview the utility matrix. 'userId' is the original user id; the numbered columns are
# item positions (item index + 1 from X_train_item_idx), not movie ids.
X_train_u_matrix_df.head()

Unnamed: 0,userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,6853,6854,6855,6856,6857,6858,6859,6860,6861,6862,6863,6864,6865,6866,6867,6868,6869,6870,6871,6872,6873,6874,6875,6876,6877,6878,6879,6880,6881,6882,6883,6884,6885,6886,6887,6888,6889,6890,6891,6892
0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# transformer = SparsePCA(n_components=5, random_state=0)
# transformer.fit(X_train_u_matrix)
# X_transformed = transformer.transform(X_train_u_matrix)
# X_transformed.shape

In [None]:
# X_train_u_matrix.iloc[X_train_user_idx[606], X_train_item_idx[3148]]
# X_train_u_matrix
# X_train_u_matrix_sdf = spark.createDataFrame(X_train_u_matrix)

In [None]:
# pc = mat.computePrincipalComponents(20)
# projected = mat.multiply(pc)

In [None]:
# ratings_df['userId'] = ratings_df['userId'].astype('str')
# ratings_df['movieId'] = ratings_df['movieId'].astype('str')

# users = ratings_df['userId'].unique() #list of all users
# movies = ratings_df['movieId'].unique() #list of all movies

# test = pd.DataFrame(columns=ratings_df.columns)
# train = pd.DataFrame(columns=ratings_df.columns)

# test_ratio = 0.2 #fraction of data to be used as test set.

# for u in users:
#     temp = ratings_df[ratings_df['userId'] == u]
#     n = len(temp)
#     test_size = int(test_ratio*n)

# temp = temp.sort_values('timestamp').reset_index()
# temp.drop('index', axis=1, inplace=True)
    
# dummy_test = temp.iloc[n-1-test_size :]
# dummy_train = temp.iloc[: n-2-test_size]
    
# test = pd.concat([test, dummy_test])
# train = pd.concat([train, dummy_train])

In [None]:
# def rmse(true, pred):
#     # this will be used towards the end
#     x = true - pred
#     return sum([xi*xi for xi in x])/len(x)

In [None]:
# # to test the performance over a different number of features
# no_of_features = [100, 200, 300, 400, 500]

# utilMat, users_index, items_index = create_utility_matrix(train)
# print(f'Number of features: {len(items_index)}')
# print(f'Number of users: {len(users_index)}')

# for f in no_of_features: 
#     svdout = svd(utilMat, k=f)
#     pred = [] #to store the predicted ratings
    
#     for _,row in test.iterrows():
#         user = row['userId']
#         item = row['movieId']
#         u_index = users_index[user]
        
#         if item in items_index:
#             i_index = items_index[item]
#             pred_rating = svdout[u_index, i_index]
#         else:
#             pred_rating = np.mean(svdout[u_index, :])
        
#         pred.append(pred_rating)

# print(rmse(test['rating'], pred))

Reference (PCA as an unsupervised learning model): https://medium.com/hackernoon/principal-component-analysis-unsupervised-learning-model-8f18c7683262