# Downloading and Extracting the Dataset

In [None]:
#to download and extract the Goodreads dataset files
import urllib.request
import zipfile
import os

#the download link is kept in a separate text file; surrounding whitespace
#(typically a trailing newline) is stripped so it does not end up inside the URL
with open('download_link.txt', 'r') as link_file:
    download_link = link_file.read().strip()

url = download_link
file_name = 'goodreads_dataset'
zip_name = file_name+'.zip'

#downloading the zipped dataset into the cwd
urllib.request.urlretrieve(url, zip_name)

#extracting the zipped files into the cwd
with zipfile.ZipFile(zip_name, 'r') as zip_ref:
    zip_ref.extractall()
#removing the zip, only the extracted files are needed
os.remove(zip_name)
#listing files and directories in cwd
os.listdir()

['.config',
 'goodreads_sample_submission.csv',
 'goodreads_test.csv',
 'drive',
 'goodreads_train.csv',
 'sample_data']

# Preliminary simple EDA

In [None]:
import pandas as pd
import numpy as np

In [None]:
df_train = pd.read_csv('goodreads_train.csv')

Looking at the cardinality of the target variable

In [None]:
df_train['rating'].unique()

array([5, 3, 0, 4, 2, 1])

Seeing how many features we have

In [None]:
df_train.columns

Index(['user_id', 'book_id', 'review_id', 'rating', 'review_text',
       'date_added', 'date_updated', 'read_at', 'started_at', 'n_votes',
       'n_comments'],
      dtype='object')

How a review looks

In [None]:
print(df_train['review_text'].loc[0])

This is a special book. It started slow for about the first third, then in the middle third it started to get interesting, then the last third blew my mind. This is what I love about good science fiction - it pushes your thinking about where things can go. 
 It is a 2015 Hugo winner, and translated from its original Chinese, which made it interesting in just a different way from most things I've read. For instance the intermixing of Chinese revolutionary history - how they kept accusing people of being "reactionaries", etc. 
 It is a book about science, and aliens. The science described in the book is impressive - its a book grounded in physics and pretty accurate as far as I could tell. (view spoiler)[Though when it got to folding protons into 8 dimensions I think he was just making stuff up - interesting to think about though. 
 But what would happen if our SETI stations received a message - if we found someone was out there - and the person monitoring and answering the signal on our

How the dataset looks

In [None]:
df_train.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


How many books we have:

In [None]:
#number of distinct book ids in the training set (a missing id would count as one value)
df_train['book_id'].nunique(dropna=False)

25474

In [None]:
#df_train[['user_id', 'book_id', 'review_id', 'rating', 'review_text']].to_csv('goodreads_reduced.csv')

In [None]:
df_test = pd.read_csv('goodreads_test.csv')

In [None]:
#stack the book ids of both splits into one Series, then keep each id once
all_book_ids = pd.concat([df_train['book_id'], df_test['book_id']], ignore_index=True)
entire_bookid_list = all_book_ids.unique()

In [None]:
len(entire_bookid_list)

25475

# Loading previously saved data
I couldn't scrape everything in one go, so I load the data scraped in previous sessions

In [None]:
import pickle

#NOTE: pickle.load can execute arbitrary code, so only load a file written by this notebook
try:
    #the with-block closes the file even if unpickling fails
    with open("drive/MyDrive/data.pkl", "rb") as loading_scrapped:
        scrapped_info = pickle.load(loading_scrapped)
except FileNotFoundError:
    #first session: nothing has been scraped yet, start from an empty mapping
    scrapped_info = {}
len(scrapped_info)

25474

# Scraping the first genre of each book
If a book has no genre (or its page cannot be read), the genre is recorded as 'Not Found'

In [None]:
from lxml import etree

#only the books that are not already in the dictionary loaded from previous sessions
books_to_scrap = entire_bookid_list
not_scrapped_yet = [x for x in books_to_scrap if x not in scrapped_info]

for book_code in not_scrapped_yet:
    url =  f"https://www.goodreads.com/book/show/{book_code}"

    #reset for every book, so a failed download can never reuse the response of the previous book
    response = None
    #number of attempts to connect to the url = 20
    for attempt in range(20):
        try:
            response = urllib.request.urlopen(url)
            break
        except Exception:
            #Exception instead of a bare except: interrupting the kernel must still stop the loop
            continue

    #setting the parser
    htmlparser = etree.HTMLParser()
    #default used when the page could not be downloaded or has no first genre
    genre = 'Not Found'
    if response is not None:
        try:
            #if the response is not the one expected the link doesn't work
            tree = etree.parse(response, htmlparser)
            #if nothing is found at this position in the html tree the book has no first genre voted by the users
            genre = tree.xpath('/html/body/div[2]/div[3]/div[1]/div[2]/div[5]/div[6]/div/div[2]/div/div[1]/div[1]/a')[0].text
        except Exception:
            genre = 'Not Found'
        finally:
            #release the connection, one is opened for every book
            response.close()

    #adding the data to the dictionary
    scrapped_info[book_code] = genre

    #saving the data as a pickle file after every book, so an interrupted session loses nothing
    with open("drive/MyDrive/data.pkl", "wb") as a_file:
        pickle.dump(scrapped_info, a_file)

In [None]:
len(scrapped_info)

25475

# Counting books by genre

In [None]:
#one row per scraped book with a constant 1, summed per genre to get the number of books;
#the string 'sum' is used because passing the builtin sum to agg is deprecated in recent pandas
genre_n_books = (
    pd.DataFrame([(v, 1) for v in scrapped_info.values()], columns = ['genre', 'book_count'])
    .groupby('genre')
    .agg('sum')
    .sort_values('book_count', ascending = False)
)
genre_n_books.head()

Unnamed: 0_level_0,book_count
genre,Unnamed: 1_level_1
Romance,5817
Fantasy,4888
Not Found,3299
Young Adult,2967
Fiction,1482


# Mapping the genres to the original dataframe

In [None]:
#attach the scraped genre to every review of both splits, looked up by book id
for reviews in (df_train, df_test):
    reviews['genre'] = reviews['book_id'].map(scrapped_info)

# Number of reviews by Genre

In [None]:
def get_reviews_count(df_column):
    """Build a table with the number of reviews of each genre.

    df_column: pandas Series holding one genre label per review.

    Returns a DataFrame indexed by 'genre' with a single 'n_reviews' column.
    The first row is 'Total:', followed by the genres from the most to the
    least reviewed. Counts are strings formatted with commas as thousands
    separators.
    """
    #distinct genres (sorted) together with how many times each one appears
    genres, counts = np.unique(df_column.values, return_counts = True)
    #most reviewed first; the sort is stable, so genres with the same count keep their sorted order
    ranked = sorted(zip(genres, counts), key=lambda pair: pair[1], reverse=True)

    #the grand total goes on top, in the same (label, formatted count) format as the genres
    rows = [('Total:', f'{sum(counts):,}')]
    rows.extend((genre, f'{n_reviews:,}') for genre, n_reviews in ranked)

    #visualizing everything as a dataframe
    return pd.DataFrame(rows, columns=['genre', 'n_reviews']).set_index('genre')

## Training set

In [None]:
get_reviews_count(df_train['genre'])

Unnamed: 0_level_0,n_reviews
genre,Unnamed: 1_level_1
Total:,900000
Fantasy,195266
Not Found,181413
Romance,161726
Young Adult,127018
...,...
Social Movements,8
Animals,7
Novels,7
Retellings,7


## Test set

In [None]:
get_reviews_count(df_test['genre'])

Unnamed: 0_level_0,n_reviews
genre,Unnamed: 1_level_1
Total:,478033
Fantasy,109687
Not Found,98479
Romance,76189
Young Adult,73043
...,...
Social Movements,3
Superheroes,3
Parenting,2
Politics,2


# Replacing 'Not Found' with NaN
So that the reviews without a genre can be dropped easily


In [None]:
#turn the 'Not Found' placeholder into a real missing value in both splits
for reviews in (df_train, df_test):
    reviews['genre'] = reviews['genre'].replace('Not Found', np.nan)

# Checking NaN values

In [None]:
df_train.isna().sum()

user_id              0
book_id              0
review_id            0
rating               0
review_text          0
date_added           0
date_updated         0
read_at          91766
started_at      274297
n_votes              0
n_comments           0
genre           181413
dtype: int64

In [None]:
df_test.isna().sum()

user_id              0
book_id              0
review_id            0
review_text          0
date_added           0
date_updated         0
read_at          42478
started_at      143044
n_votes              0
n_comments           0
genre            98479
dtype: int64

# Saving the Dataframes as csv

In [None]:
#the file name is defined once and reused, the zip step refers to the same variable
training_fname = 'gr_training_set.csv'
#index = False keeps the row index out of the csv
df_train.to_csv(training_fname, index = False)

In [None]:
#the file name is defined once and reused, the zip step refers to the same variable
test_fname = 'gr_test_set.csv'
#index = False keeps the row index out of the csv
df_test.to_csv(test_fname, index = False)

# Compressing everything into a zip

In [None]:
#ZipFile only stores the files uncompressed by default (ZIP_STORED),
#ZIP_DEFLATED is needed to actually compress them
with zipfile.ZipFile('Dataset_TXA.zip', 'w', compression=zipfile.ZIP_DEFLATED) as zipObj:
    #the two csv files saved in the previous cells plus the original sample submission
    zipObj.write(training_fname)
    zipObj.write(test_fname)
    zipObj.write('goodreads_sample_submission.csv')