In [2]:
from datetime import datetime
from data_loader import dataloader
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
import pickle
from sklearn.metrics import mean_squared_error
import pandas as pd
import csv

In [3]:
user,ratings,movies = dataloader()
print(user.head())
print(movies.tail())
print(ratings.head())

   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455
      movie_id                       title          genres
3878      3948     Meet the Parents (2000)          Comedy
3879      3949  Requiem for a Dream (2000)           Drama
3880      3950            Tigerland (2000)           Drama
3881      3951     Two Family House (2000)           Drama
3882      3952       Contender, The (2000)  Drama|Thriller
   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


In [4]:
movie_index_by_id = {id: i for i, id in enumerate(movies["movie_id"])}
print(movie_index_by_id)
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
genre_index_by_name = {name:i for i, name in enumerate(genres)}
print(genre_index_by_name)

{1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7, 9: 8, 10: 9, 11: 10, 12: 11, 13: 12, 14: 13, 15: 14, 16: 15, 17: 16, 18: 17, 19: 18, 20: 19, 21: 20, 22: 21, 23: 22, 24: 23, 25: 24, 26: 25, 27: 26, 28: 27, 29: 28, 30: 29, 31: 30, 32: 31, 33: 32, 34: 33, 35: 34, 36: 35, 37: 36, 38: 37, 39: 38, 40: 39, 41: 40, 42: 41, 43: 42, 44: 43, 45: 44, 46: 45, 47: 46, 48: 47, 49: 48, 50: 49, 51: 50, 52: 51, 53: 52, 54: 53, 55: 54, 56: 55, 57: 56, 58: 57, 59: 58, 60: 59, 61: 60, 62: 61, 63: 62, 64: 63, 65: 64, 66: 65, 67: 66, 68: 67, 69: 68, 70: 69, 71: 70, 72: 71, 73: 72, 74: 73, 75: 74, 76: 75, 77: 76, 78: 77, 79: 78, 80: 79, 81: 80, 82: 81, 83: 82, 84: 83, 85: 84, 86: 85, 87: 86, 88: 87, 89: 88, 90: 89, 92: 90, 93: 91, 94: 92, 95: 93, 96: 94, 97: 95, 98: 96, 99: 97, 100: 98, 101: 99, 102: 100, 103: 101, 104: 102, 105: 103, 106: 104, 107: 105, 108: 106, 109: 107, 110: 108, 111: 109, 112: 110, 113: 111, 114: 112, 115: 113, 116: 114, 117: 115, 118: 116, 119: 117, 120: 118, 121: 119, 122: 120, 123: 1

In [5]:
import numpy as np
# build binary array for movie genres
movie_features = np.zeros((len(movies), len(genres)))
for i, movie_genres in enumerate(movies["genres"]):
    for genre in movie_genres.split("|"):        
        genre_index = genre_index_by_name[genre]
        movie_features[i, genre_index] = 1
print(movie_features)

[[0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [6]:
def get_weight(user_id):
    user_ratings = ratings[ratings['user_id']==user_id]
    rating = user_ratings['rating']
    movie_indexes = [movie_index_by_id[movie_id] for movie_id in user_ratings["movie_id"]]
    movie_data = movie_features[movie_indexes]
    movie_data = np.transpose(movie_data)
    c = movie_data @ rating
    for i in range(len(c)):
        c[i] = round(c[i]/sum(movie_data[i]),2)
    return c
user_model_dict = {}
for user_id in user["user_id"].unique():
    user_model_dict[user_id] = get_weight(user_id)


  c[i] = round(c[i]/sum(movie_data[i]),2)


In [7]:
print(user_model_dict)

{1: array([4.2 , 4.  , 4.11, 4.25, 4.14, 4.  ,  nan, 4.43, 4.  ,  nan,  nan,
       4.29,  nan, 3.67, 4.33, 3.67, 5.  ,  nan]), 2: array([3.5 , 3.74,  nan,  nan, 3.56, 3.58,  nan, 3.9 , 3.  , 4.  , 3.  ,
        nan, 3.33, 3.71, 3.59, 3.48, 3.73, 4.33]), 3: array([3.96, 4.  , 4.  , 4.  , 3.77,  nan,  nan, 4.  , 4.5 ,  nan, 2.67,
       4.  , 3.  , 3.8 , 3.83, 3.8 , 4.  , 4.67]), 4: array([4.16, 3.83,  nan, 4.  ,  nan, 5.  ,  nan, 4.17, 4.5 ,  nan, 4.33,
        nan,  nan, 4.  , 3.56, 3.5 , 3.33, 4.5 ]), 5: array([2.61, 3.  , 4.  , 3.83, 3.41, 3.29, 3.67, 3.1 ,  nan, 4.  , 2.8 ,
       3.33, 3.12, 3.1 , 3.07, 2.85, 3.5 , 4.  ]), 6: array([4.  , 3.75, 4.22, 3.88, 3.75, 3.  ,  nan, 3.73, 3.  ,  nan,  nan,
       4.32,  nan, 4.  , 3.5 , 3.67, 4.  , 3.62]), 7: array([4.31, 4.62,  nan,  nan, 4.33, 4.  ,  nan, 4.  ,  nan,  nan, 5.  ,
        nan, 4.  , 4.  , 4.3 , 4.6 , 4.75,  nan]), 8: array([4.03, 3.89, 4.25, 3.  , 3.89, 4.  ,  nan, 3.81,  nan,  nan, 5.  ,
       4.  , 3.5 , 4.07, 4.46, 4.0

In [8]:
import math
dict_u  = []
for i in range(len(user_model_dict)):
    for j in range(len(user_model_dict[i+1])):
        if(not(math.isnan(user_model_dict[i+1][j]))):
            dict_u.append({'user_id':i+1, 'genres':j+3953,'weight':user_model_dict[i+1][j] })
print(dict_u[0])

{'user_id': 1, 'genres': 3953, 'weight': 4.2}


In [9]:
dict = ["user_id", "genres", "weight"]
p = pd.DataFrame(columns=dict, data=dict_u )
print(p)

       user_id  genres  weight
0            1    3953    4.20
1            1    3954    4.00
2            1    3955    4.11
3            1    3956    4.25
4            1    3957    4.14
...        ...     ...     ...
93882     6040    3966    3.49
93883     6040    3967    3.47
93884     6040    3968    3.93
93885     6040    3969    3.70
93886     6040    3970    4.00

[93887 rows x 3 columns]


In [10]:
p.to_csv(r'u_dict.csv',index= True)