In [1]:
# File names used as on-disk caches / outputs by the cells below.
# `carpeta` is a path prefix that is concatenated in front of a file name
# (plain string concatenation, so a non-empty value needs a trailing separator).
json_name = 'data.json'  # not referenced by any other cell visible in this notebook
pkl_name = 'casos.pkl'  # pickled per-user cases table
csv_name = 'casos.csv'  # CSV copy of the per-user cases table
carpeta = ''
pkl_name_ll = 'llibres.pkl'  # pickled book table
csv_name_ll = 'llibres.csv'  # CSV copy of the book table

In [51]:
import requests
import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler
# Seed NumPy's global RNG so the np.random draws made further down are repeatable.
np.random.seed(0)

In [3]:
# Reuse the cached cases table when it can be loaded; otherwise set `get = True`
# so the following cells download and rebuild it.
try:
    casos = pd.read_pickle(carpeta + pkl_name)
    get = False
except Exception:
    # Missing or unreadable cache -> rebuild. Catching `Exception` instead of a
    # bare `except:` lets KeyboardInterrupt / SystemExit propagate.
    get = True

In [4]:
if get:
    # URL of the gzip-compressed JSON-lines file with the reviews
    url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_reviews_dedup.json.gz'

    # stream=True lets the body be decompressed on the fly instead of being
    # downloaded in full; the context manager releases the connection afterwards.
    with requests.get(url, stream=True) as response:
        # Only proceed on HTTP 200
        if response.status_code == 200:
            with gzip.GzipFile(fileobj=response.raw) as f:
                # Read at most the first 500000 lines. zip(range(...), f) stops
                # cleanly if the stream is shorter, and json.loads ignores the
                # trailing newline, so no manual slicing of the line is needed.
                primeras_500_filas = [
                    json.loads(line.decode('utf-8'))
                    for _, line in zip(range(500000), f)
                ]

            print("JSON creat.")
        else:
            print(f"Error al descargar el archivo. Código de estado: {response.status_code}")

In [5]:
if get:
    # Turn the downloaded review records into a frame and keep only the
    # user, book and rating columns.
    df = pd.DataFrame(primeras_500_filas).loc[:, ['user_id', 'book_id', 'rating']]

In [6]:
if get:
    # Count plot of the rating column, saved as rating_distribution.png
    # under the `carpeta` prefix.
    sns.set_style('darkgrid')
    plt.figure(figsize=(10, 6))
    sns.countplot(data=df, x='rating')
    # Labels are set after drawing so they replace the ones seaborn applies.
    plt.title('Rating Distribution')
    plt.ylabel('Count')
    plt.xlabel('Rating')
    plt.savefig(f'{carpeta}rating_distribution.png')

In [7]:
if get:
    # Distinct user ids, in order of first appearance
    unique_users = pd.unique(df['user_id'])

In [8]:
if get:
    # One row per user holding the list of books they rated and the list of
    # the corresponding ratings (both in the order the reviews appear in `df`).
    #
    # A single groupby replaces the previous per-user filter + pd.concat loop,
    # which rescanned the whole frame for every user and copied the growing
    # result on every iteration. sort=False keeps users in order of first
    # appearance, matching df['user_id'].unique().
    # Note: groupby skips rows whose user_id is missing.
    df_aux = (
        df.groupby('user_id', sort=False)
          .agg(books=('book_id', list), ratings=('rating', list))
          .reset_index()
    )

    print("Dataset joined. Unique users:", len(df_aux))

In [9]:
if get:
    # Line plot of how many books each user has rated (x: user, y: count),
    # saved as books_rated_before.png under the `carpeta` prefix.
    plt.figure(figsize=(10, 6))
    plt.plot(df_aux['user_id'], df_aux['books'].apply(len))
    plt.title('Number of books rated by each user')
    plt.xlabel('user_id')
    plt.ylabel('Number of books rated')
    plt.savefig(f'{carpeta}books_rated_before.png')

In [10]:
if get:
    min_books = 10
    max_books = 20

    # Keep only the users whose number of rated books lies between min_books
    # and max_books, both ends included, then renumber the rows.
    df_aux = df_aux[df_aux['books'].apply(len).between(min_books, max_books)].reset_index(drop=True)

    print(f"Dataset filtered with users with more than {min_books} and less than {max_books} books reviewed. Unique users:", len(df_aux))

In [11]:
if get:
    # Same per-user count plot as before, drawn on the filtered table and
    # saved as books_rated_after.png under the `carpeta` prefix.
    plt.figure(figsize=(10, 6))
    plt.plot(df_aux['user_id'], df_aux['books'].apply(len))
    plt.title('Number of books rated by each user')
    plt.xlabel('user_id')
    plt.ylabel('Number of books rated')
    plt.savefig(f'{carpeta}books_rated_after.png')

In [12]:
if get:
    # Hold out each user's last three books and their ratings in the new
    # columns "llibres_recomanats" / "puntuacions_llibres", and drop those
    # three entries from the user's own lists. assign() evaluates its
    # arguments in order, so the held-out columns are taken before truncation.
    df_aux = df_aux.assign(
        llibres_recomanats=lambda taula: taula['books'].map(lambda valors: valors[-3:]),
        puntuacions_llibres=lambda taula: taula['ratings'].map(lambda valors: valors[-3:]),
        books=lambda taula: taula['books'].map(lambda valors: valors[:-3]),
        ratings=lambda taula: taula['ratings'].map(lambda valors: valors[:-3]),
    )

    print("Done creating new columns.")

    # Final column names: "books" -> "llibres_usuari", "ratings" -> "val_llibres"
    df_aux = df_aux.rename(columns={'books': 'llibres_usuari', 'ratings': 'val_llibres'})

In [13]:
if get:
    # Write the cache to the same location it is read from (carpeta + name);
    # previously the files were written without the `carpeta` prefix, so a
    # non-empty prefix would make the read below miss the fresh files.
    df_aux.to_pickle(carpeta + pkl_name)
    df_aux.to_csv(carpeta + csv_name, index=False)
# Continue from the on-disk copy whether it was just written or already existed.
casos = pd.read_pickle(carpeta + pkl_name)

In [14]:
# Reuse the cached book table when it can be loaded; otherwise set `get = True`
# so the following cells rebuild it.
try:
    llibres = pd.read_pickle(carpeta + pkl_name_ll)
    get = False
except Exception:
    # Missing or unreadable cache -> rebuild. Catching `Exception` instead of a
    # bare `except:` lets KeyboardInterrupt / SystemExit propagate.
    get = True

In [15]:
if get:
    # Gather every book id that appears in any user's "llibres_usuari" or
    # "llibres_recomanats" list, without duplicates.
    set_llibres = set()
    for llibres_usuari, llibres_recomanats in zip(casos['llibres_usuari'], casos['llibres_recomanats']):
        set_llibres.update(llibres_usuari)
        set_llibres.update(llibres_recomanats)

    # Later cells receive the ids as a list
    set_llibres = list(set_llibres)
    print(len(set_llibres))

In [16]:
if get:
    # NOTE(review): absolute, machine-specific path — consider moving it to the
    # configuration cell.
    fitxer = "/Users/ucemarc/Downloads/goodreads_books.json"
    # Fields kept for every matching book, in output column order
    columnes = ['isbn', 'book_id', 'similar_books', 'average_rating', 'description', 'authors', 'isbn13', 'num_pages', 'publication_year', 'title', 'language_code']

    # Set membership is O(1); `set_llibres` is a list, which would make the
    # test below a linear scan for every line of the file.
    ids_buscats = set(set_llibres)

    # Collect plain dicts and build the frame once at the end; concatenating
    # inside the loop copies the whole frame on every match and leaves every
    # row with index label 0.
    registres = []
    with open(fitxer, 'r', encoding='utf-8') as file:
        # The file holds one JSON object per line
        for line in file:
            book = json.loads(line)
            if book['book_id'] in ids_buscats:
                registre = {k: book[k] for k in columnes}
                # Keep only the author ids
                registre['authors'] = [author['author_id'] for author in book['authors']]
                registres.append(registre)

    # The frame gets a 0..n-1 index, so index labels equal row positions
    df_llibres = pd.DataFrame(registres, columns=columnes)
    # Write to the same locations the cache-loading cells read from
    df_llibres.to_csv(carpeta + csv_name_ll, index=False)
    df_llibres.to_pickle(carpeta + pkl_name_ll)

In [20]:
# The genre cells below only need to run when the pickled book table is
# missing, unreadable, or has no "genres" column.
try:
    llibres = pd.read_pickle(carpeta + pkl_name_ll)
    get = 'genres' not in llibres.columns
except Exception:
    # Catching `Exception` instead of a bare `except:` lets
    # KeyboardInterrupt / SystemExit propagate.
    get = True

if get:
    # Work from the CSV copy of the book table
    df_llibres = pd.read_csv(carpeta + csv_name_ll)

In [21]:
if get:
    # NOTE(review): absolute, machine-specific path — consider moving it to the
    # configuration cell.
    fitxer = "/Users/ucemarc/Downloads/goodreads_book_genres_initial.json"

    # NOTE(review): `set_llibres` is only assigned by the earlier cell that runs
    # when the book pickle could not be loaded; on a fresh kernel with an
    # existing pickle this name may be undefined here — confirm.
    # Set membership is O(1); the list would be scanned for every line.
    ids_buscats = set(set_llibres)

    # Collect records and build the frame once, instead of concatenating (and
    # rewriting genres.csv) on every matching line.
    registres_generes = []
    with open(fitxer, 'r', encoding='utf-8') as file:
        # The file holds one JSON object per line
        for line in file:
            book = json.loads(line)
            if book['book_id'] in ids_buscats:
                # Keep the book id and only the names (keys) of its genres
                registres_generes.append({'book_id': book['book_id'], 'genres': list(book['genres'].keys())})

    df_genres = pd.DataFrame(registres_generes, columns=['book_id', 'genres'])
    # As before, the file is only written when at least one book matched
    if registres_generes:
        df_genres.to_csv("genres.csv", index=False)

In [22]:
if get:
    # Both tables must hold book_id as integers before they can be joined
    df_llibres['book_id'] = df_llibres['book_id'].astype(int)
    df_genres['book_id'] = df_genres['book_id'].astype(int)
    # Inner join: only books present in both tables are kept
    df_llibres = df_llibres.merge(df_genres, how='inner', on='book_id')
    df_llibres.to_csv("llibres.csv", index=False)

In [23]:
if get:
    # Distinct genre labels across all books: show how many and which ones
    unique_genres = {genre for genres in df_llibres['genres'] for genre in genres}
    print(len(unique_genres))
    print(unique_genres)

In [24]:
if get:
    # Compound genre labels and the short name that replaces each of them;
    # any other label is left as it is.
    equivalencies = {
        'history, historical fiction, biography': 'history',
        'fantasy, paranormal': 'fantasy',
        'mystery, thriller, crime': 'mystery',
        'comics, graphic': 'comics',
    }
    df_llibres['genres'] = df_llibres['genres'].apply(
        lambda genres: [equivalencies.get(genre, genre) for genre in genres]
    )
    # NOTE(review): only the CSV is refreshed here; this cell does not rewrite
    # the pickle that the earlier "genres" column check loads.
    df_llibres.to_csv("llibres.csv", index=False)

In [25]:
if get:
    # Recount the distinct genre labels to see the effect of the replacement
    unique_genres = {genre for genres in df_llibres['genres'] for genre in genres}
    print(len(unique_genres))
    print(unique_genres)

In [83]:
# Reload the book table and the cases table from their pickles so the cells
# below start from the on-disk versions.
llibres = pd.read_pickle(carpeta+pkl_name_ll)
casos = pd.read_pickle(carpeta+pkl_name)

In [84]:
# Attribute name -> list of labels that attribute may take.
# make_vector() looks an attribute up here by key and samples from its list.
categories = {
    "estil_literari": ["realisme", "romanticisme", "naturalisme", "simbolisme", "modernisme", "realisme magico", "postmodernisme"],
    "complexitat": ["baixa", "mitjana", "alta"],
    "caracteristiques": ["simples", "complexes"],
    "desenvolupament_del_personatge": ["baix", "mitja", "alt"],
    "accio_o_reflexio": ["accio", "reflexio"],
    "epoca": ["actual", "passada", "futura"],
    # NOTE(review): "alta" here vs "alt" in desenvolupament_del_personatge looks
    # inconsistent; these labels end up in the saved book table, so confirm
    # with downstream users before changing it.
    "detall_cientific": ["baix", "mitja", "alta"]
}

In [85]:
def make_vector(length1, length2, unique_min, unique_max, categorie):
    """Draw two random label vectors that share one random subset of labels.

    A subset size is drawn with np.random.randint(unique_min, unique_max)
    (the upper bound is exclusive), that many distinct labels are picked
    from categories[categorie], and both vectors are then filled by sampling
    from that subset with replacement.

    Parameters:
        length1: number of entries in the first vector.
        length2: number of entries in the second vector.
        unique_min: lowest possible subset size (inclusive).
        unique_max: upper limit of the subset size (exclusive).
        categorie: key of the module-level `categories` dict to sample from.

    Returns:
        (vector1, vector2): two lists of labels with length1 and length2 items.
    """
    # How many distinct labels this call may use
    n_etiquetes = np.random.randint(unique_min, unique_max)

    # Which labels: a random subset without repetition
    etiquetes = np.random.choice(categories[categorie], size=n_etiquetes, replace=False)

    # One draw per position, first vector first, so the sequence of RNG calls
    # is always the same.
    vector1 = []
    for _ in range(length1):
        vector1.append(np.random.choice(etiquetes))

    vector2 = []
    for _ in range(length2):
        vector2.append(np.random.choice(etiquetes))

    return vector1, vector2

In [86]:
# For every book (one entry per row of `llibres`) collect the set of labels
# it receives for each synthetic attribute across all users.
estil_literari = [set() for _ in range(len(llibres))]
complexitat = [set() for _ in range(len(llibres))]
caracteristiques = [set() for _ in range(len(llibres))]
desenvolupament_del_personatge = [set() for _ in range(len(llibres))]
accio_o_reflexio = [set() for _ in range(len(llibres))]
epoca = [set() for _ in range(len(llibres))]
detall_cientific = [set() for _ in range(len(llibres))]

# attribute -> (per-book accumulator, unique_min, unique_max), listed in the
# order in which the random draws are made for each user.
atributs = {
    "estil_literari": (estil_literari, 2, 4),
    "complexitat": (complexitat, 1, 3),
    "caracteristiques": (caracteristiques, 1, 3),
    "desenvolupament_del_personatge": (desenvolupament_del_personatge, 1, 3),
    "accio_o_reflexio": (accio_o_reflexio, 1, 2),
    "epoca": (epoca, 1, 3),
    "detall_cientific": (detall_cientific, 1, 3),
}

# book_id -> row position in `llibres` (first occurrence wins). One dictionary
# lookup replaces a boolean scan of the whole table for every book and every
# attribute, and row positions (not index labels) address the lists above.
posicio_llibre = {}
for posicio, book_id in enumerate(llibres["book_id"].tolist()):
    posicio_llibre.setdefault(book_id, posicio)

for index, row in casos.iterrows():
    len_llibres_usuari = len(row['llibres_usuari'])
    len_llibres_recomanats = len(row['llibres_recomanats'])
    # Progress indicator: one line per user
    print(index)
    # Resolve each of the user's books to its row position once
    # (raises KeyError if a book is missing from `llibres`).
    posicions_usuari = [posicio_llibre[int(book_id)] for book_id in row['llibres_usuari']]
    posicions_recomanats = [posicio_llibre[int(book_id)] for book_id in row['llibres_recomanats']]
    for nom, (acumulador, unique_min, unique_max) in atributs.items():
        # One label per book of the user, and one per held-out book, drawn
        # from the same random subset of the attribute's labels.
        valors_usuari, valors_recomanats = make_vector(len_llibres_usuari, len_llibres_recomanats, unique_min, unique_max, nom)
        for posicio, valor in zip(posicions_usuari, valors_usuari):
            acumulador[posicio].add(valor)
        for posicio, valor in zip(posicions_recomanats, valors_recomanats):
            acumulador[posicio].add(valor)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [87]:
# Sanity check: the accumulator list was created with one set per row of
# `llibres`, so both lengths should be equal.
print(len(estil_literari))
print(len(llibres))

15666
15666


In [88]:
# Afegir les noves columnes al DataFrame
llibres["estil_literari"] = estil_literari
llibres["complexitat"] = complexitat
llibres["caracteristiques"] = caracteristiques
llibres["desenvolupament_del_personatge"] = desenvolupament_del_personatge
llibres["accio_o_reflexio"] = accio_o_reflexio
llibres["epoca"] = epoca
llibres["detall_cientific"] = detall_cientific

In [89]:
# Save the enriched book table under a "2"-prefixed file name so the original
# pickle/CSV stay untouched. The `carpeta` prefix is applied as in the cells
# that read the tables, so all files live in the same place.
llibres.to_pickle(carpeta + "2" + pkl_name_ll)
llibres.to_csv(carpeta + "2" + csv_name_ll, index=False)