# Dask


In [1]:
import os
import dask.dataframe as ddf

from dask.distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster with default settings and attach a client to it.
cluster = LocalCluster()
client = Client(cluster)

In [3]:
# Collect the path of every regular ``.csv`` file directly inside ../dossier/.
# os.scandir yields entries in arbitrary order, so the paths are sorted to keep
# the row order of the combined frame reproducible across runs; the ``with``
# block closes the directory iterator explicitly.
with os.scandir("../dossier/") as dir_entries:
    file_lst = sorted(
        entry.path
        for entry in dir_entries
        if entry.is_file() and entry.path.endswith(".csv")
    )

In [4]:
def read_files(files_list: list, **kwargs) -> ddf.DataFrame:
    """Lazily read several CSV files into a single Dask DataFrame.

    Args:
        files_list (list): Paths of the CSV files to read.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``dask.dataframe.read_csv`` (for example ``usecols`` or ``dtype``).

    Returns:
        ddf.DataFrame: Lazy Dask DataFrame; call ``.compute()`` on it to
        obtain the data as a pandas DataFrame.
    """
    return ddf.read_csv(files_list, **kwargs)

In [5]:
# Only these columns are needed downstream; ``isbn`` is kept as text.
book_columns = ["isbn", "isbn13", "title", "authors", "language", "publisher", "image"]
book_dtypes = {"isbn": "object"}

delayed_obj = read_files(files_list=file_lst, usecols=book_columns, dtype=book_dtypes)

In [6]:
# Run the lazy read and collect the result into memory.
books = delayed_obj.compute()

In [7]:
# Summarise column dtypes, non-null counts and memory usage.
books.info()

<class 'pandas.core.frame.DataFrame'>
Index: 271197 entries, 0 to 49967
Data columns (total 7 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   publisher  270950 non-null  object
 1   language   271190 non-null  object
 2   image      268139 non-null  object
 3   authors    271175 non-null  object
 4   title      271195 non-null  object
 5   isbn13     271197 non-null  int64 
 6   isbn       271197 non-null  object
dtypes: int64(1), object(6)
memory usage: 16.6+ MB


In [8]:
# Preview the first two rows.
books.head(2)

Unnamed: 0,publisher,language,image,authors,title,isbn13,isbn
0,"Fischer (Tb.), Frankfurt",de,https://images.isbndb.com/covers/72/25/9783596...,"['Gordimer, Nadine']",Clowns im Glück. Erzählungen.,9783596257225,3596257220
1,Doubleday,en,https://images.isbndb.com/covers/50/20/9780385...,"['Liddell, Felix H.']",I Hear a Symphony,9780385475020,385475020


In [9]:
# Number of missing values in each column.
books.isna().sum()

publisher     247
language        7
image        3058
authors        22
title           2
isbn13          0
isbn            0
dtype: int64

In [10]:
# Drop rows missing any descriptive field; a missing ``image`` is tolerated.
required_cols = ["publisher", "language", "authors", "title"]
books = books.dropna(subset=required_cols)

In [11]:
# ISBNs are identifiers, not numbers: they can carry leading zeros and a
# trailing "X". Pin the column to ``object`` instead of relying on Dask's
# sample-based dtype inference, matching how ``isbn`` is read for the books.
ratings = ddf.read_csv("../data/raw/Ratings.csv", dtype={"ISBN": "object"}).compute()

In [12]:
# Summarise column dtypes, non-null counts and memory usage.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149780 non-null  int64 
 1   ISBN         1149780 non-null  object
 2   Book-Rating  1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [13]:
# Normalise the headers to snake_case (e.g. "User-ID" -> "user_id").
ratings = ratings.rename(columns=lambda name: name.replace("-", "_").lower())

In [14]:
# Preview the first two rows with the renamed columns.
ratings.head(2)

Unnamed: 0,user_id,isbn,book_rating
0,276725,034545104X,0
1,276726,0155061224,5


### Most Popular items


In [15]:
# Count how many catalogue rows share each title, most frequent first.
books[["title"]].value_counts()

title                                                                              
The Secret Garden                                                                      22
Little Women                                                                           19
Dracula                                                                                18
The Night Before Christmas                                                             18
The Hobbit                                                                             17
                                                                                       ..
History Channel Presents The Real Scorpion King (The History Channel Presents)          1
History Channel: Haunted History (The History Channel Presents)                         1
History Channel: History's Mysteries: Dead, Doomed And Buried                           1
History Channel: Modern Marvels: Inventions That Rocked The World (History Channel)     1
万历十五年           

In [16]:
# Keep only rows whose rating is non-zero.
explicit_rating = ratings.loc[ratings["book_rating"].ne(0)]

#### Merge into books dataframe


In [17]:
# Inner join: only ratings whose ISBN appears in the book catalogue survive.
ratings_books = explicit_rating.merge(right=books, how="inner", on="isbn")

In [18]:
# Preview the first two rows of the merged frame.
ratings_books.head(2)

Unnamed: 0,user_id,isbn,book_rating,publisher,language,image,authors,title,isbn13
0,276726,0155061224,5,Wadsworth Publishing,en,https://images.isbndb.com/covers/12/24/9780155...,"['Rae, Judith']",Rites of Passage,9780155061224
1,276729,052165615X,3,Cambridge University Press,en,https://images.isbndb.com/covers/61/53/9780521...,"['Prowse, Philip']",Help! Level 1 (Cambridge English Readers),9780521656153


Get rid of the `ISBN` column, and keep the `ISBN13` column in its stead


In [19]:
# Select and reorder the columns to keep; ``isbn`` is left out on purpose.
kept_columns = [
    "user_id",
    "isbn13",
    "title",
    "authors",
    "publisher",
    "language",
    "image",
    "book_rating",
]
ratings_books = ratings_books[kept_columns]

### Most liked items

Find the most popular books ranked by their average rating


In [20]:
# Mean rating per title, highest first.
avg_rating = (
    ratings_books.groupby("title")[["book_rating"]]
    .mean()
    .sort_values("book_rating", ascending=False)
)
avg_rating.reset_index()

Unnamed: 0,title,book_rating
0,"Molly's Surprise: A Christmas Story, Book Thre...",10.0
1,The Art Of Playing Mythos The Cthulhu Collecta...,10.0
2,The Archaic Revival: Speculations on Psychedel...,10.0
3,The Archer King,10.0
4,The Architects of Hyperspace,10.0
...,...,...
135999,The New Beverly Hills Diet: The latest weight-...,1.0
136000,Let's Go 2000: Europe: The World's Bestselling...,1.0
136001,Rencontre au bord du fleuve,1.0
136002,Honey for a Child's Heart: The Imaginative Use...,1.0


Books with very few ratings can skew the average at both ends of the ranking: a single 10 or a single 1 is enough to put a title at the very top or the very bottom.


In [21]:
# Number of ratings recorded for this one title.
ratings_books["title"].eq(
    "I Hate the Dallas Cowboys: And Who Elected Them America's Team Anyway?"
).sum()

1

In [22]:
# Number of ratings recorded for this one title.
ratings_books["title"].eq(
    "Molly's Surprise: A Christmas Story, Book Three (The American Girls Collection)"
).sum()

1

### Filtering

1. Filter out books with 10 or fewer reviews
2. Filter out books with under-represented languages
3. Filter out users with 5 or fewer reviews


In [23]:
# Ratings per title, then the titles with strictly more than 10 of them.
freq = ratings_books["title"].value_counts()
frequently_reviewed = freq.loc[freq.gt(10)].index

In [24]:
# Restrict the ratings to the frequently reviewed titles.
is_frequent_title = ratings_books["title"].isin(frequently_reviewed)
books_df = ratings_books.loc[is_frequent_title]

In [25]:
# Ratings per remaining title, most frequent first.
books_df[["title"]].value_counts()

title                                                                    
The Lovely Bones                                                             754
Harry Potter and the Sorcerer's Stone                                        629
Wild Animus: A Novel                                                         581
A Painted House                                                              562
Snow Falling on Cedars                                                       532
                                                                            ... 
Gone With the Wind                                                            11
The Blue Flower                                                               11
The Blue Day Book: A Lesson in Cheering Yourself Up                           11
Chicks in Chainmail                                                           11
Stupid White Men: ...And Other Sorry Excuses for the State of the Nation!     11
Name: count, Length: 5128, dtype: i

In [26]:
# Rows per language code before any normalisation.
books_df["language"].value_counts()

language
en       154856
de         1223
es          361
fr          262
it          195
en_US         8
ru            2
hi            1
Name: count, dtype: int64

In [27]:
# Fold the regional variant "en_US" into "en". ``books_df`` is a filtered
# slice of ``ratings_books``, so the column is rebuilt with ``assign`` (which
# returns a new frame) instead of writing through ``.loc``; this avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
books_df = books_df.assign(language=books_df["language"].replace({"en_US": "en"}))

In [28]:
# Rows per language code after merging "en_US" into "en".
books_df["language"].value_counts()

language
en    154864
de      1223
es       361
fr       262
it       195
ru         2
hi         1
Name: count, dtype: int64

In [29]:
# Index labels of the rows written in "ru" or "hi".
minor_langs = books_df.loc[books_df["language"].isin(["ru", "hi"])].index

In [30]:
# Remove the rows of the under-represented languages by index label.
books_df = books_df.drop(index=minor_langs)

In [31]:
# Rows per language code after dropping "ru" and "hi".
books_df["language"].value_counts()

language
en    154864
de      1223
es       361
fr       262
it       195
Name: count, dtype: int64

In [32]:
# Ratings per user, then the users with strictly more than 5 of them.
usr_freq = books_df["user_id"].value_counts()
freq_raters = usr_freq.loc[usr_freq.gt(5)].index

In [33]:
# Keep only the ratings made by the frequent raters.
is_frequent_rater = books_df["user_id"].isin(freq_raters)
books_df = books_df.loc[is_frequent_rater]

#### Average rating of remaining books


In [34]:
# Mean rating per title on the filtered data, highest first.
avg_remain = (
    books_df.groupby("title")[["book_rating"]]
    .mean()
    .sort_values("book_rating", ascending=False)
)
avg_remain.reset_index()

Unnamed: 0,title,book_rating
0,Postmarked Yesteryear: 30 Rare Holiday Postcards,10.000000
1,Liebesleben (German Edition),10.000000
2,"The Secret Daughter: Raising Cane, Book 2 (Har...",10.000000
3,Purple Cow: Transform Your Business by Being R...,10.000000
4,Dilbert: A Book of Postcards,9.923077
...,...,...
5111,The Coldest Winter Ever,4.666667
5112,Wild Animus: A Novel,4.036364
5113,Confessions of a Sociopathic Social Climber: T...,3.923077
5114,Le Crime de L'Orient-Express (Le Livre de Poch...,3.000000


#### Number of ratings per book


In [35]:
# Number of ratings per title, most-rated first. Named aggregation yields the
# flat ``num_ratings`` column directly, so no column level has to be dropped.
grouped = (
    books_df.groupby("title")
    .agg(num_ratings=("book_rating", "count"))
    .reset_index()
    .sort_values(by="num_ratings", ascending=False)
)
grouped

Unnamed: 0,title,num_ratings
1649,Harry Potter and the Sorcerer's Stone,390
4122,The Lovely Bones,372
1647,Harry Potter and the Order of the Phoenix (Boo...,321
134,A Painted House,314
3843,The Firm,299
...,...,...
2556,Novecento: Un Monologo (Universale Economica F...,1
1464,Free,1
1473,Friedhof der Kuscheltiere. Roman.,1
4695,Theos Reise. Roman über die Religionen der Welt.,1


### Non-Personalized Recommendations


In [36]:
from itertools import permutations
import pandas as pd

In [37]:
def create_pairs(col) -> pd.DataFrame:
    """Build every ordered pair of the values in ``col``.

    Args:
        col: Iterable of values, e.g. a pandas Series holding the book titles
            of one user.

    Returns:
        pd.DataFrame: One row per ordered pair, with columns ``book_a`` and
        ``book_b``. Both ``(x, y)`` and ``(y, x)`` are present. Pairs are
        formed by position, so a value that occurs twice in ``col`` also
        produces rows where both members are equal. Fewer than two values
        give an empty frame. Counting the pairs is left to the caller.
    """
    ordered_pairs = list(permutations(col, 2))
    return pd.DataFrame(ordered_pairs, columns=["book_a", "book_b"])

In [38]:
# Build the ordered title pairs separately for each user.
titles_by_user = books_df.groupby("user_id")["title"]
book_pairs = titles_by_user.apply(create_pairs)

In [39]:
# Discard the index produced by the per-user apply and renumber the rows.
book_pairs = book_pairs.reset_index(drop=True)
book_pairs.head()

Unnamed: 0,book_a,book_b
0,The Alibi,The Beach House
1,The Alibi,A Kiss Remembered
2,The Alibi,The Short Forever (A Stone Barrington Novel)
3,The Alibi,Dead Aim
4,The Alibi,Angels & Demons


In [40]:
# Number of rows for each ordered (book_a, book_b) combination.
pair_counts = book_pairs.groupby(["book_a", "book_b"]).size()

In [41]:
# Turn the counts into a flat frame with a ``size`` column, largest first.
pair_counts_df = pair_counts.reset_index(name="size").sort_values(
    by="size", ascending=False
)

In [42]:
# Discard the pairs in which a title is paired with itself.
true_pairs = pair_counts_df.query("book_a != book_b")

In [43]:
# Preview the most frequent pairs.
true_pairs.head()

Unnamed: 0,book_a,book_b,size
2025844,Harry Potter and the Order of the Phoenix (Boo...,Harry Potter And The Goblet Of Fire,175
2010197,Harry Potter And The Goblet Of Fire,Harry Potter and the Order of the Phoenix (Boo...,175
2025848,Harry Potter and the Order of the Phoenix (Boo...,Harry Potter The Illustrated 4 Books Collectio...,158
2019096,Harry Potter The Illustrated 4 Books Collectio...,Harry Potter and the Order of the Phoenix (Boo...,158
2031192,Harry Potter and the Sorcerer's Stone,Harry Potter and the Order of the Phoenix (Boo...,146


In [44]:
# Ten titles most often paired with "Harry Potter And The Goblet Of Fire".
is_goblet_of_fire = true_pairs["book_a"].eq("Harry Potter And The Goblet Of Fire")
harry_potter = true_pairs.loc[is_goblet_of_fire].nlargest(n=10, columns="size")
harry_potter

Unnamed: 0,book_a,book_b,size
2010197,Harry Potter And The Goblet Of Fire,Harry Potter and the Order of the Phoenix (Boo...,175
2010195,Harry Potter And The Goblet Of Fire,Harry Potter The Illustrated 4 Books Collectio...,129
2010199,Harry Potter And The Goblet Of Fire,Harry Potter and the Sorcerer's Stone,113
2010192,Harry Potter And The Goblet Of Fire,Harry Potter And The Prisoner Of Azkaban,83
2010200,Harry Potter And The Goblet Of Fire,Harry Potter and the Sorcerer's Stone (1),61
2010196,Harry Potter And The Goblet Of Fire,Harry Potter and the Chamber of Secrets,51
2011769,Harry Potter And The Goblet Of Fire,The Fellowship of the Ring (The Lord of the Ri...,33
2011779,Harry Potter And The Goblet Of Fire,The Firm,24
2011650,Harry Potter And The Goblet Of Fire,The Client,21
2012624,Harry Potter And The Goblet Of Fire,Where the Heart Is,21


### Encode ID columns as integer codes


In [45]:
# Encode each distinct user as an integer code in ``userId``. ``books_df`` is
# a filtered slice, so the columns are added with ``assign`` (which returns a
# new frame) rather than attribute/item assignment, which would raise
# SettingWithCopyWarning. ``assign`` evaluates its arguments in order, so the
# second callable already sees the categorical ``user_id``.
books_df = books_df.assign(
    user_id=lambda frame: pd.Categorical(frame["user_id"]),
    userId=lambda frame: frame["user_id"].cat.codes,
)

In [46]:
# Encode each distinct ISBN-13 as an integer code in ``bookId``. ``assign``
# returns a new frame, so nothing is written into a filtered slice (avoids
# SettingWithCopyWarning); its arguments are evaluated in order, so the second
# callable already sees the categorical ``isbn13``.
books_df = books_df.assign(
    isbn13=lambda frame: pd.Categorical(frame["isbn13"]),
    bookId=lambda frame: frame["isbn13"].cat.codes,
)

In [47]:
# Shorten the rating column name.
books_df = books_df.rename({"book_rating": "rating"}, axis="columns")

In [48]:
# Preview the frame with the new integer id columns.
books_df.head()

Unnamed: 0,user_id,isbn13,title,authors,publisher,language,image,rating,userId,bookId
217,276964,9780345354624,The Terminal Man,"['Crichton, Michael']",Ballantine Books,en,https://images.isbndb.com/covers/46/24/9780345...,10,5299,1740
222,276964,9780440220602,The Chamber,"['Grisham, John']",Dell,en,https://images.isbndb.com/covers/06/02/9780440...,9,5299,3798
224,276964,9780684867625,The Girl Who Loved Tom Gordon : A Novel,"['King, Stephen']",Scribner,en,https://images.isbndb.com/covers/76/25/9780684...,8,5299,7311
226,276964,9780812550306,"The Fires of Heaven (The Wheel of Time, Book 5)","['Jordan, Robert']",Tor Fantasy,en,https://images.isbndb.com/covers/03/06/9780812...,10,5299,8246
228,276964,9780886773748,Tailchaser's Song,"['Williams, Tad']",DAW,en,https://images.isbndb.com/covers/37/48/9780886...,7,5299,8509


In [49]:
# Columns that make up the exported dataset.
export_columns = ["userId", "bookId", "title", "language", "image", "rating"]
clean_df = books_df[export_columns]

### Save to Parquet


In [50]:
from pathlib import Path

In [51]:
# <parent of the working directory>/data/processed/clean_df
filepath = Path.cwd().parent / "data" / "processed" / "clean_df"

In [52]:
# Make sure the target directory exists so the write does not fail on a
# fresh checkout, then store the frame without its index.
filepath.parent.mkdir(parents=True, exist_ok=True)
clean_df.to_parquet(filepath, index=False)

In [53]:
# The cluster was created separately from the client, so closing the client
# alone leaves its scheduler and workers running; shut both down.
client.close()
cluster.close()