# User Info Matrix Preprocessing

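To represent each user's demographic data together with the genre information of the movies the user likes, I will build a matrix with one row per user and the following columns:
First column is the age: normalized (zero mean, unit standard deviation)
Second column is the gender: 1 for male, 0 for female
Remaining columns are the different genres. Ratings are first centered by subtracting 3, so both unrated movies and movies rated 3 have the value 0. The value of each cell is the sum of the user's centered ratings over the movies of that genre, divided by the number of non-zero ratings of that user (0 values are excluded from the count); the resulting matrix is then normalized.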

In [6]:
import pandas as pd
import numpy as np
import os
# Sets the KMP_DUPLICATE_LIB_OK environment variable for this kernel process.
# NOTE(review): presumably a workaround for the OpenMP "duplicate runtime library"
# abort seen with some numpy/MKL installs -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# Dimensions of the dense user x movie rating matrix built below.
N_USERS, N_MOVIES = 943, 1682

# MovieLens 100K's u.data is tab-separated. Passing the separator explicitly avoids
# the delimiter sniffing (and the python-engine fallback warning) caused by sep=None.
raw_data = pd.read_csv("./data/Movielens100/u.data", sep="\t", names=["userId", "movieId", "rating", "timestamp"])
raw_data = raw_data.loc[:, raw_data.columns != "timestamp"]
#make indices start at 0
raw_data["userId"] -= 1
raw_data["movieId"] -= 1
#make ratings center around 0
# (a rating of 3 therefore becomes 0, the same value used for "not rated")
raw_data["rating"] -= 3

# create (943, 1682) matrix of user ratings per movie
# A single fancy-indexed assignment writes every rating at [userId, movieId];
# this replaces the row-by-row loop that used chained indexing on a DataFrame.
user_ratings = np.zeros((N_USERS, N_MOVIES))
user_ratings[raw_data["userId"].to_numpy(), raw_data["movieId"].to_numpy()] = raw_data["rating"].to_numpy()
user_ratings

  raw_data = pd.read_csv("./data/Movielens100/u.data", sep = None, names=["userId", "movieId", "rating", "timestamp"])


array([[2., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 2., 0., ..., 0., 0., 0.]])

## User Taste matrix

The first step is to get the average rating per genre of each user and then normalize that.

In [7]:
# Load the movie metadata file and keep only the genre indicator columns.
# The result is a 2-D integer array with one row per line of u.item and one
# column per genre, in the order given by `genre_names`.
genre_names = ["unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy", "Crime",
               "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery",
               "Romance", "Sci-Fi", "Thriller", "War", "Western"]
item_columns = ["movie id", "movie title", "release date", "video release date", "imdb_url"] + genre_names

movie_genres = pd.read_csv("./data/Movielens100/u.item", sep="|", encoding='latin-1', names=item_columns)
# The id/title/date/url columns are not needed downstream, so only the genre
# flags are selected. (The previous in-place shift of "movie id" was dead code:
# that column was dropped immediately afterwards.)
# NOTE(review): row i is assumed to describe zero-based movie index i, i.e. the
# file is assumed to be sorted by movie id -- confirm.
movie_genres = movie_genres[genre_names].to_numpy()
movie_genres

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [94]:
# Build the user taste matrix: one row per user, one column per genre.

# Number of non-zero (centered) ratings per user, kept as a column vector so it
# broadcasts across the genre columns.
num_ratings = (user_ratings != 0).sum(axis=1)[:, None]

# For every user, the sum of their centered ratings over the movies flagged with
# each genre (same values as np.dot(movie_genres.T, user_ratings[u]) per user).
genre_rating_sums = user_ratings @ movie_genres

# NOTE(review): the divisor is the user's total count of non-zero ratings, not the
# count of ratings within each genre -- confirm this is the intended "average".
# Users with no non-zero ratings keep a row of zeros instead of producing 0/0 = NaN,
# which would otherwise propagate into the global mean/std below.
user_tastes = np.zeros(genre_rating_sums.shape)
np.divide(genre_rating_sums, num_ratings, out=user_tastes, where=num_ratings > 0)

# Standardize over the whole matrix (single global mean and standard deviation).
user_tastes -= user_tastes.mean()
user_tastes /= user_tastes.std()
user_tastes

array([[-0.5285767 ,  0.16328159, -0.64388641, ...,  0.36507359,
        -0.06733784, -0.44209441],
       [-0.55740413,  0.54956914, -0.0039175 , ...,  0.41119748,
        -0.28066081, -0.55740413],
       [-0.55740413, -1.03638294, -0.23808492, ..., -2.15400018,
        -0.71706373, -0.55740413],
       ...,
       [-0.55740413,  1.93328572,  1.31061326, ...,  1.62194949,
         0.06526833, -0.55740413],
       [-0.55740413,  1.30131964,  1.20838345, ...,  1.30131964,
         1.02251107, -0.09272319],
       [-0.55740413,  0.99927702, -0.15711469, ...,  0.7324174 ,
        -0.20159129, -0.73531055]])

## User information matrix 
Concatenate user taste matrix with age and gender, forming the user information matrix

First column is the normalized age, second column is the gender (1 for male, 0 for female), and the rest are the normalized average ratings per genre. The genres are listed in the u.genre file.


In [88]:
# Build the demographic part of the user matrix: column 0 is the standardized age,
# column 1 is the gender flag (1.0 where gender == "M", 0.0 otherwise).
user_columns = ["user id", "age" , "gender" , "occupation", "zip code"]
users_raw = pd.read_csv("./data/Movielens100/u.user", sep="|", encoding='latin-1', names=user_columns)

# Center the age first, then scale by the standard deviation of the centered
# series (pandas' default .std()).
age_centered = users_raw["age"] - users_raw["age"].mean()
age_standardized = age_centered / age_centered.std()

is_male = (users_raw["gender"] == "M").astype(np.float32)

# Column order (age, gender) matches the original frame after dropping the
# id, occupation and zip code columns.
user_data = pd.DataFrame({"age": age_standardized, "gender": is_male}).to_numpy()

array([[-0.82442191,  1.        ],
       [ 1.5540427 ,  0.        ],
       [-0.90643793,  1.        ],
       ...,
       [-1.152486  ,  1.        ],
       [ 1.14396259,  0.        ],
       [-0.98845396,  1.        ]])

In [100]:
# Place the demographic columns and the genre taste columns side by side
# (same rows, columns appended), then write the combined matrix to CSV using
# pandas' default to_csv settings.
user_info = np.concatenate([user_data, user_tastes], axis=1)
user_info_frame = pd.DataFrame(data=user_info)
user_info_frame.to_csv("./data/user_info.csv")

In [4]:
# Standard deviation of the raw (un-normalized) ages, i.e. the divisor that the
# age normalization applies after centering.
# A dedicated variable is used so that the processed `user_data` array is not
# replaced by a raw DataFrame, which would break a re-run of the concatenation.
raw_user_ages = pd.read_csv("./data/Movielens100/u.user", sep = "|", encoding='latin-1', names= ["user id", "age" , "gender" , "occupation", "zip code"])["age"]
raw_user_ages.std()

12.192739733059044