### The purpose of this notebook is to narrow down the training data

In [1]:
import pandas as pd

In [2]:
# Load the ratings table (columns: user_id, book_id, rating).
# The default C parser is used rather than engine="python": pandas documents
# the C engine as the faster one, which matters for a multi-million-row file.
csv_path = "goodreads/ratings.csv"
df = pd.read_csv(csv_path)
print(df.head())
print(df.shape)

   user_id  book_id  rating
0        1      258       5
1        2     4081       4
2        2      260       5
3        2     9296       5
4        2     2318       3
(5976479, 3)


In [3]:
# Shorten the id column names; "rating" already carries its final name.
df = df.rename(columns={"user_id": "user", "book_id": "item"})

In [4]:
# Count of non-null ratings for each user.
df["rating"].groupby(df["user"]).count()

user
1        117
2         65
3         91
4        134
5        100
        ... 
53420    110
53421    110
53422    130
53423     77
53424    133
Name: rating, Length: 53424, dtype: int64

In [5]:
# Rows per user, kept both as a Series and as a one-column frame ("item_cnt").
nums_rating_per_user = df.groupby("user").size()
user_frame = nums_rating_per_user.to_frame(name="item_cnt")

In [6]:
# Preview the first five users and their rating counts.
user_frame.head(n=5)

Unnamed: 0_level_0,item_cnt
user,Unnamed: 1_level_1
1,117
2,65
3,91
4,134
5,100


In [7]:
# Users ordered from most to fewest ratings (display only; nothing is assigned).
user_frame.sort_values("item_cnt", ascending=False)

Unnamed: 0_level_0,item_cnt
user,Unnamed: 1_level_1
30944,200
12874,200
52036,199
12381,199
28158,199
...,...
32128,21
40753,21
51725,21
43675,20


In [8]:
# Load the books table.
# NOTE(review): book_df is not referenced by any other cell visible here — confirm it is still needed.
book_df = pd.read_csv(filepath_or_buffer="goodread_books/books.csv")

In [9]:
# Rows per item, kept both as a Series and as a one-column frame ("user_cnt").
nums_rating_per_item = df.groupby("item").size()
item_frame = nums_rating_per_item.to_frame(name="user_cnt")

In [10]:
# Preview the first five items and their rating counts.
item_frame.head(n=5)

Unnamed: 0_level_0,user_cnt
item,Unnamed: 1_level_1
1,22806
2,21850
3,16931
4,19088
5,16604


In [11]:
# Items ordered from most-rated to least-rated.
sorted_item_frame = item_frame.sort_values("user_cnt", ascending=False)

In [12]:
# How many of the most-rated items to keep. Named so the cutoff is visible
# and tunable instead of being a bare literal inside the slice.
TOP_N_ITEMS = 2000

# sorted_item_frame is ordered by descending user_cnt, so the first
# TOP_N_ITEMS index labels are the ids of the most-rated items.
item_ids = sorted_item_frame.index[:TOP_N_ITEMS]

In [13]:
# Show the item_ids index (a bare last expression is rendered as the cell output).
item_ids

Index([   1,    2,    4,    3,    5,   17,   20,   18,   23,    7,
       ...
       2674, 3359, 2210, 2382, 1047, 2123, 1884, 1006, 2632, 2366],
      dtype='int64', name='item', length=2000)

In [14]:
# Load the book-metadata subset, then preview its first five rows.
sub_df = pd.read_csv(
    filepath_or_buffer="goodread_books/subset_to_db.csv",
    engine="python",
)
sub_df.head(n=5)

Unnamed: 0,book_id,isbn,authors,original_publication_year,original_title,title,average_rating,ratings_count,image_url
0,1,439023483,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",4.34,4780653,https://images.gr-assets.com/books/1447303603m...
1,2,439554934,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,4.44,4602479,https://images.gr-assets.com/books/1474154022m...
2,3,316015849,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",3.57,3866839,https://images.gr-assets.com/books/1361039443m...
3,4,61120081,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,4.25,3198671,https://images.gr-assets.com/books/1361975680m...
4,5,743273567,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,3.89,2683664,https://images.gr-assets.com/books/1490528560m...


In [15]:
# Keep title and ISBN for the books whose book_id is in item_ids.
# One .loc selection plus an explicit copy replaces the chained [...][...]
# indexing, so this frame is unambiguously independent of sub_df and later
# modifications to it do not trigger SettingWithCopyWarning.
book_to_scrape_titles = sub_df.loc[
    sub_df["book_id"].isin(item_ids), ["original_title", "isbn"]
].copy()

In [16]:
# Show the dtypes both columns would have after casting to str.
# The cast result is not assigned, so book_to_scrape_titles itself is unchanged.
book_to_scrape_titles.astype(
    dict.fromkeys(["original_title", "isbn"], str)
).dtypes

original_title    object
isbn              object
dtype: object

In [17]:
# Spot-check: rows whose original title is exactly this string.
book_to_scrape_titles.loc[
    book_to_scrape_titles["original_title"] == "The Silence of the Lambs"
]

Unnamed: 0,original_title,isbn
208,The Silence of the Lambs,99446782


In [18]:
# Drop rows that have no original title.
# The result is rebound instead of using inplace=True: the frame was sliced
# out of sub_df, and in-place mutation of such a slice is what pandas warns
# about; rebinding also keeps the cell safe to re-run.
book_to_scrape_titles = book_to_scrape_titles.dropna(subset=["original_title"])

In [19]:
# Spot-check the row at position 209 (positional, not by index label).
book_to_scrape_titles.iloc[[209], :]

Unnamed: 0,original_title,isbn
212,Die Verwandlung,553213695


In [20]:
# NOTE: this rebinds `df`, which until now held the ratings table, to the
# title/ISBN frame. Earlier cells that use `df` as ratings (groupby on "user"
# / "item") will not work if re-run after this cell without reloading.
df = book_to_scrape_titles

In [21]:
# Pattern is anchored at both ends, so a title passes only if every character
# is an ASCII letter, digit, whitespace, or one of . , : ; ! ? ' - ( ).
latin_char_regex = r'^[A-Za-z0-9\s\.\,\:\;\!\?\'\-\(\)]+$'

# Missing titles count as non-matching (na=False) and are dropped.
filtered_df = df.loc[
    df["original_title"].str.contains(pat=latin_char_regex, na=False)
]

In [23]:
# Write the filtered titles and ISBNs to a pipe-separated file without the
# index column, then preview them. head() is placed last so the notebook
# renders it; when it preceded to_csv its result was silently discarded.
filtered_df.to_csv("books/books_title_to_scrape_isbn.csv", sep="|", index=False)
filtered_df.head()

In [22]:
# Spot-check: a title containing commas survives the character filter.
filtered_df.loc[
    filtered_df["original_title"] == 'The Lion, the Witch and the Wardrobe'
]

Unnamed: 0,original_title,isbn
36,"The Lion, the Witch and the Wardrobe",60764899


In [24]:
# Print the title at position 34 (positional; its index label is kept in the output).
print(filtered_df["original_title"].iloc[[34]])


36    The Lion, the Witch and the Wardrobe
Name: original_title, dtype: object
