In [1]:
# Notebook configuration: names of the cache files read and written below.
# `carpeta` is the folder prefix that gets concatenated in front of a file name
# (e.g. carpeta + pkl_name); it is empty here.
carpeta = ''
json_name = 'data.json'
# Per-user cases (pickle + CSV export)
pkl_name, csv_name = 'casos.pkl', 'casos.csv'
# Book metadata (pickle + CSV export)
pkl_name_ll, csv_name_ll = 'llibres.pkl', 'llibres.csv'

In [2]:
import requests
import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Reuse the cached cases when the pickle can be loaded; otherwise set `get`
# so the following cells download the reviews and rebuild the cache.
try:
    casos = pd.read_pickle(carpeta + pkl_name)
    get = False
except Exception as exc:
    # Any load failure (missing or unreadable file) still triggers a rebuild,
    # but unlike a bare `except:` this does not swallow KeyboardInterrupt or
    # SystemExit, and the reason is reported instead of being silently dropped.
    print(f"Could not load {carpeta + pkl_name} ({exc!r}); it will be rebuilt.")
    get = True

In [4]:
if get:
    # URL of the gzip-compressed JSON-lines file with the reviews
    url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/goodreads_reviews_dedup.json.gz'
    # Number of leading reviews to keep (the variable name below says 500, the
    # value actually used has always been 500000; the name is kept because the
    # next cell reads it).
    n_reviews = 500000

    # Stream the response so the archive is decompressed on the fly; the
    # context manager releases the connection once the rows have been read.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            # Fail here with the real cause instead of printing and letting the
            # next cell crash on an undefined `primeras_500_filas`.
            raise RuntimeError(f"Error al descargar el archivo. Código de estado: {response.status_code}")

        with gzip.GzipFile(fileobj=response.raw) as f:
            # zip(range(n), f) stops at n lines or at end of stream, whichever
            # comes first. json.loads ignores the trailing newline, so the line
            # is not sliced (slicing would cut the closing brace of a final
            # line that has no newline).
            primeras_500_filas = [
                json.loads(line.decode('utf-8'))
                for _, line in zip(range(n_reviews), f)
            ]

    print("JSON creat.")

In [5]:
if get:
    # Turn the downloaded reviews into a frame and keep only the three fields
    # used by the rest of the notebook.
    columnes_review = ['user_id', 'book_id', 'rating']
    df = pd.DataFrame(primeras_500_filas)[columnes_review]

In [6]:
if get:
    # Bar chart with the number of reviews per rating value, saved as
    # rating_distribution.png under `carpeta`.
    sns.set_style('darkgrid')
    plt.figure(figsize=(10, 6))
    eixos = sns.countplot(data=df, x='rating')
    eixos.set_xlabel('Rating')
    eixos.set_ylabel('Count')
    eixos.set_title('Rating Distribution')
    plt.savefig(f'{carpeta}rating_distribution.png')

In [7]:
if get:
    # Distinct user ids found in the reviews
    unique_users = pd.unique(df['user_id'])

In [8]:
if get:
    # One row per user: 'books' is the list of book ids the user rated and
    # 'ratings' is the parallel list of ratings, both in review order.
    #
    # A single groupby replaces the previous per-user loop, which filtered the
    # whole frame once per user and grew df_aux with pd.concat on every
    # iteration. sort=False keeps users in order of first appearance, the same
    # order the loop over the unique user ids produced.
    df_aux = (
        df.groupby('user_id', sort=False)[['book_id', 'rating']]
        .agg(list)
        .rename(columns={'book_id': 'books', 'rating': 'ratings'})
        .reset_index()
    )

    print("Dataset joined. Unique users:", len(df_aux))

In [9]:
if get:
    # Line plot of the number of books rated (y) for each user (x), drawn from
    # the unfiltered df_aux and saved as books_rated_before.png under `carpeta`.
    llibres_per_usuari = df_aux['books'].map(len)
    plt.figure(figsize=(10, 6))
    plt.plot(df_aux['user_id'], llibres_per_usuari)
    plt.title('Number of books rated by each user')
    plt.xlabel('user_id')
    plt.ylabel('Number of books rated')
    plt.savefig(f'{carpeta}books_rated_before.png')

In [10]:
if get:
    min_books = 10
    max_books = 20

    # Keep only the users whose number of rated books lies between min_books
    # and max_books, both bounds included.
    n_books = df_aux['books'].map(len)
    df_aux = df_aux[n_books.between(min_books, max_books)].reset_index(drop=True)

    # The message now states the inclusive bounds that the filter really applies.
    print(f"Dataset filtered to users with between {min_books} and {max_books} books reviewed (inclusive). Unique users:", len(df_aux))

In [11]:
if get:
    # Same plot as before the filter: number of books rated (y) per user (x),
    # now on the filtered df_aux; saved as books_rated_after.png under `carpeta`.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df_aux['user_id'], df_aux['books'].map(len))
    ax.set(
        xlabel='user_id',
        ylabel='Number of books rated',
        title='Number of books rated by each user',
    )
    fig.savefig(f'{carpeta}books_rated_after.png')

In [12]:
if get:
    # Hold out each user's last 3 books and their ratings in the new columns
    # "llibres_recomanats" and "puntuacions_llibres", and remove those 3
    # entries from the user's own lists.
    n_reservats = 3
    parelles = (('books', 'llibres_recomanats'), ('ratings', 'puntuacions_llibres'))

    # First pass creates the two held-out columns (in this order) ...
    for origen, desti in parelles:
        df_aux[desti] = df_aux[origen].map(lambda valors: valors[-n_reservats:])
    # ... second pass trims the source lists.
    for origen, _ in parelles:
        df_aux[origen] = df_aux[origen].map(lambda valors: valors[:-n_reservats])

    print("Done creating new columns.")

    # The remaining history columns get their final names
    nous_noms = {'books': 'llibres_usuari', 'ratings': 'val_llibres'}
    df_aux = df_aux.rename(columns=nous_noms)

In [13]:
if get:
    # Write the cache under the same `carpeta` prefix it is read from, so the
    # reload below (and the cache check at the top of the notebook) find the
    # file even when `carpeta` is not empty.
    df_aux.to_pickle(carpeta + pkl_name)
    df_aux.to_csv(carpeta + csv_name, index=False)
# Always reload from disk so `casos` comes from the same file whether it was
# just built or was already cached.
casos = pd.read_pickle(carpeta + pkl_name)

In [14]:
# Reuse the cached book metadata when the pickle can be loaded; otherwise set
# `get` so the following cells rebuild it.
try:
    llibres = pd.read_pickle(carpeta + pkl_name_ll)
    get = False
except Exception as exc:
    # Any load failure still triggers a rebuild, but KeyboardInterrupt and
    # SystemExit are no longer swallowed and the reason is reported.
    print(f"Could not load {carpeta + pkl_name_ll} ({exc!r}); it will be rebuilt.")
    get = True

In [15]:
# Collect every book id that appears in any user's history ("llibres_usuari")
# or held-out list ("llibres_recomanats").
#
# This runs unconditionally: it only needs `casos`, and the cell that reads the
# genres dump further down also uses set_llibres. That cell can run when
# llibres.pkl is already cached (no 'genres' column yet), in which case a
# `get`-guarded version of this cell would have been skipped and set_llibres
# would be undefined.
set_llibres = {
    llibre
    for columna in ('llibres_usuari', 'llibres_recomanats')
    for llibres_fila in casos[columna]
    for llibre in llibres_fila
}

# Exposed as a list, as before, for the cells that consume it
set_llibres = list(set_llibres)
print(len(set_llibres))

In [16]:
if get:
    # NOTE(review): machine-specific absolute path to the books dump (one JSON
    # object per line); it has to exist locally for this cell to run.
    fitxer = "/Users/ucemarc/Downloads/goodreads_books.json"
    # Fields kept for each book, in output column order
    columnes = ['isbn', 'book_id', 'similar_books', 'average_rating', 'description', 'authors', 'isbn13', 'num_pages', 'publication_year', 'title', 'language_code']
    # set_llibres is a list; a set makes the per-line membership test O(1)
    ids_volguts = set(set_llibres)

    # Matching books are accumulated as plain dicts and turned into a frame
    # once, instead of concatenating a one-row frame per book.
    registres = []
    with open(fitxer, 'r', encoding='utf-8') as file:
        for line in file:
            book = json.loads(line)
            if book['book_id'] not in ids_volguts:
                continue
            registre = {k: book[k] for k in columnes}
            # Keep only the author ids
            registre['authors'] = [author['author_id'] for author in book['authors']]
            registres.append(registre)

    # Built in one go, so the frame gets a regular 0..n-1 index
    df_llibres = pd.DataFrame(registres, columns=columnes)
    # Written through the configured names so the files land where the cache
    # checks look for them
    df_llibres.to_csv(carpeta + csv_name_ll, index=False)
    df_llibres.to_pickle(carpeta + pkl_name_ll)

In [17]:
# Possible labels for each synthetic book attribute; make_vector samples from
# these lists. Every label appears once per list: "temes_especifics" used to
# repeat its last six labels, which made them twice as likely to be drawn and
# allowed a "without replacement" draw to return the same label twice.
categories = {
    "estil_literari": ["realisme", "romanticisme", "naturalisme", "simbolisme", "modernisme", "realisme magico", "postmodernisme"],
    "temes_especifics": [
        "amor", "aventura", "terror", "fantasia", "ciencia ficcio", "historica", "filosofica",
        "psicologica", "social", "politica", "religiosa", "erotica", "humoristica", "costumista",
        "negra", "realista", "fantastica", "mitologica", "poetica", "satirica", "biografica",
        "epica", "didactica", "teatral", "lirica", "epistolar", "dramatica",
    ],
    "complexitat": ["baixa", "mitjana", "alta"],
    "caracteristiques": ["simples", "complexes"],
    "desenvolupament_del_personatge": ["baix", "mitja", "alt"],
    "accio_o_reflexio": ["accio", "reflexio"],
    "longitud": ["curta", "mitjana", "llarga"],
    "epoca": ["actual", "passada", "futura"],
    "detall_cientific": ["baix", "mitja", "alta"]
}

In [22]:
def make_vector(length, unique_min, unique_max, categorie, rng=None):
    """Build a random list of `length` labels drawn from a small subset of a category.

    A subset of distinct labels is picked first, then each of the `length`
    positions is filled with a label chosen uniformly from that subset.

    Parameters
    ----------
    length : int
        Number of labels in the returned list.
    unique_min, unique_max : int
        The subset size is drawn from ``randint(unique_min, unique_max)``, so
        `unique_max` is EXCLUSIVE (2, 4 gives a subset of 2 or 3 labels).
    categorie : str or sequence
        Key of the module-level `categories` dict, or an explicit sequence of
        labels to sample from.
    rng : numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, exactly as before; pass a seeded RandomState for reproducible
        output.

    Returns
    -------
    list
        `length` labels taken from the chosen subset.
    """
    values = categories[categorie] if isinstance(categorie, str) else list(categorie)
    source = np.random if rng is None else rng

    # Subset size in [unique_min, unique_max); capped at the number of labels
    # available so sampling without replacement cannot fail.
    num_unique_values = min(source.randint(unique_min, unique_max), len(values))

    # Distinct labels this vector is allowed to use
    unique_values = source.choice(values, size=num_unique_values, replace=False)

    # Fill every position from that subset
    return [source.choice(unique_values) for _ in range(length)]

In [23]:
# For each user, draw one random label per book (history plus held-out books)
# for every category.
# NOTE(review): the vectors are only bound to local names here and overwritten
# on the next iteration; nothing in this cell stores them.
for index, row in casos.iterrows():
    len_llibres_usuari = len(row['llibres_usuari'])
    len_llibres_recomanats = len(row['llibres_recomanats'])
    total_llibres = len_llibres_usuari + len_llibres_recomanats

    estil_literari = make_vector(total_llibres, 2, 4, "estil_literari")
    temes_especifics = make_vector(total_llibres, 2, 4, "temes_especifics")
    complexitat = make_vector(total_llibres, 2, 4, "complexitat")
    # Two-label categories use the (1, 3) range
    caracteristiques = make_vector(total_llibres, 1, 3, "caracteristiques")
    desenvolupament_del_personatge = make_vector(total_llibres, 2, 4, "desenvolupament_del_personatge")
    accio_o_reflexio = make_vector(total_llibres, 1, 3, "accio_o_reflexio")
    longitud = make_vector(total_llibres, 2, 4, "longitud")
    epoca = make_vector(total_llibres, 2, 4, "epoca")
    detall_cientific = make_vector(total_llibres, 2, 4, "detall_cientific")
    

In [31]:
# Genre data counts as present only when llibres.pkl loads AND already has a
# 'genres' column; in every other case `get` is set so the genres are rebuilt.
try:
    llibres = pd.read_pickle(carpeta + pkl_name_ll)
    # Explicit column test instead of indexing the frame just to provoke a KeyError
    get = 'genres' not in llibres.columns
except Exception:
    # Pickle missing or unreadable; KeyboardInterrupt/SystemExit are not caught
    get = True

if get:
    # The rebuild starts from the CSV export of the book metadata
    df_llibres = pd.read_csv(carpeta + csv_name_ll)

In [32]:
if get:
    # NOTE(review): machine-specific absolute path to the genres dump (one JSON
    # object per line); it has to exist locally for this cell to run.
    fitxer = "/Users/ucemarc/Downloads/goodreads_book_genres_initial.json"
    # set_llibres is a list; a set makes the per-line membership test O(1)
    ids_volguts = set(set_llibres)

    # Matching books are accumulated as plain dicts and turned into a frame once
    registres = []
    with open(fitxer, 'r', encoding='utf-8') as file:
        for line in file:
            book = json.loads(line)
            if book['book_id'] in ids_volguts:
                # 'genres' is a dict in the dump; only its keys (the genre
                # names) are kept
                registres.append({'book_id': book['book_id'], 'genres': list(book['genres'].keys())})

    df_genres = pd.DataFrame(registres, columns=['book_id', 'genres'])
    # Written once after the scan (previously the whole file was rewritten for
    # every matching book). The file is now also written, header only, when no
    # book matches.
    df_genres.to_csv("genres.csv", index=False)

In [33]:
if get:
    # Merge df_llibres and df_genres on book_id.
    # Both keys are cast to int so they compare equal regardless of how each
    # frame was loaded.
    df_llibres['book_id'] = df_llibres['book_id'].astype(int)
    df_genres['book_id'] = df_genres['book_id'].astype(int)

    # llibres.csv is rewritten with a 'genres' column at the end of this cell,
    # so a frame reloaded from it may already carry one. Dropping it first
    # keeps the merge from producing 'genres_x'/'genres_y', which would break
    # the cells that read df_llibres['genres'].
    df_llibres = df_llibres.drop(columns=['genres'], errors='ignore')

    # Inner join: only books that have a genres record are kept
    df_llibres = pd.merge(df_llibres, df_genres, on='book_id', how='inner')
    # Same location the CSV is read back from
    df_llibres.to_csv(carpeta + csv_name_ll, index=False)

In [34]:
if get:
    # Distinct genre labels across all books, before normalising the names
    unique_genres = {
        genere
        for generes_llibre in df_llibres['genres']
        for genere in generes_llibre
    }
    print(len(unique_genres))
    print(unique_genres)

In [35]:
if get:
    # Compound genre labels are shortened to a single word; any label not
    # listed here is left unchanged.
    reemplacaments = {
        'history, historical fiction, biography': 'history',
        'fantasy, paranormal': 'fantasy',
        'mystery, thriller, crime': 'mystery',
        'comics, graphic': 'comics',
    }
    # One pass over the column applies all four replacements (no replacement
    # target is itself a key, so the result matches applying them one by one).
    df_llibres['genres'] = df_llibres['genres'].apply(
        lambda generes: [reemplacaments.get(g, g) for g in generes]
    )
    # Same location the CSV is read back from.
    # NOTE(review): only the CSV is updated here; nothing in the visible cells
    # writes the 'genres' column back to llibres.pkl.
    df_llibres.to_csv(carpeta + csv_name_ll, index=False)

In [36]:
if get:
    # Distinct genre labels again, now that the compound names were shortened
    unique_genres = set().union(*df_llibres['genres'])
    print(len(unique_genres))
    print(unique_genres)

In [37]:
# Reload both cached frames from disk: book metadata first, then the user cases
llibres, casos = (
    pd.read_pickle(carpeta + nom_fitxer)
    for nom_fitxer in (pkl_name_ll, pkl_name)
)