## Goodreads users

In [None]:
import requests
import re
import os
from os.path import join
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
import random
import time
import credentials
key = credentials.key

This notebook contains code to identify Goodreads users from two specific countries: Greece and The Netherlands. As a first step, we selected the ISBNs of books written by Dutch and Greek authors. This decision is based on the assumption that there is a degree of overlap between the nationality of the author and the nationalities of the readers.    

In [18]:
dutch_isbns = [
    '9780156004022',
    '9789023462439',
    '9780394542454',
    '9780670856688',
    '9789025445010',
    '9788493544843',
    '9780807613290',
    '9789020425048',
    '9783492232739',
    '9780868740348',
    '9783746619408',
    '9780374114855',
    '9789023422082',
    '9780679419730',
    '9783843076685'
         ]

for isbn in dutch_isbns:
    url = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'+isbn
    response = requests.get(url)
    json_data = response.json()
    title = json_data['items'][0]['volumeInfo']['title']
    authors = json_data['items'][0]['volumeInfo']['authors']
    print(f"{title} by {', '.join(authors)} (isbn: {isbn})")


In the Dutch Mountains by Cees Nooteboom (isbn: 9780156004022)
De avonden / druk 48 by Gerard Kornelis Reve (isbn: 9789023462439)
The Assault by Harry Mulisch (isbn: 9780394542454)
The Discovery of Heaven by Harry Mulisch (isbn: 9780670856688)
Bezonken rood / druk 40 by Jeroen Godfried Marie Brouwers (isbn: 9789025445010)
La Hierba Amarga by Marga Minco (isbn: 9788493544843)
The Laws by Connie Palmen (isbn: 9780807613290)
Nader tot u by Gerard Reve (isbn: 9789020425048)
Ein Schwarm Regenbrachvögel by Maarten 't Hart (isbn: 9783492232739)
Nooit meer slapen by Willem Frederik Hermans (isbn: 9780868740348)
Die Dunkelkammer des Damokles by Willem Frederik Hermans (isbn: 9783746619408)
Blue Mondays by Arnon Grunberg, Arnold Pomerans, Erica Pomerans (isbn: 9780374114855)
Het leven is vurrukkulluk / druk 28 by Remco Wouter Campert (isbn: 9789023422082)
The Vanishing by Tim Krabbé (isbn: 9780679419730)
Max Havelaar by Multatuli (isbn: 9783843076685)


The cell below contains ISBNs of books written by Greek authors. The selection is partly based on the following goodReads list: 

* [Authors from Greece and Cyprus](https://www.goodreads.com/list/show/18791.Books_by_Goodreads_Authors_from_Greece_and_Cyprus)

In [19]:
greek_isbns = ['9781590173503', '9789600400076' ,
               '9781843869931', '9781479314935', 
               '9781620152157', '9781681373300', 
               '9789600500578', '9780671492601'
               '9602936797', '9789600501490', 
               '9600510121', '9781582341286', 
               '9780552772990', '9780684852560', 
               '9781399976930', '9789600310511',
               '9789600304824', '9780571203130', 
               '9780873760485', '9781071526521', 
               '9789607360496', '9781620155011', 
               '9789600410389', '9600500703', 
               '9780671220273']


for isbn in greek_isbns:

    url = 'https://www.googleapis.com/books/v1/volumes?q=isbn:'+isbn
    response = requests.get(url)
    json_data = response.json()
    if 'items' in json_data:
        title = json_data['items'][0]['volumeInfo']['title']
        authors = json_data['items'][0]['volumeInfo']['authors']
        print(f"{title} by {', '.join(authors)} (isbn: {isbn})")



The Murderess by Alexandros Papadiamantis (isbn: 9781590173503)
Το Καπλάνι τής βιτρίνας by Alki Zei, Alkē Zeē (isbn: 9789600400076)
Marginus Morius by Stelios Chalkitis (isbn: 9781843869931)
A Life in a Moment by Stefanos Livos (isbn: 9781479314935)
Spellbound in His Arms by Angel Sefer (isbn: 9781620152157)
Three Summers by Margarita Liberaki (isbn: 9781681373300)
M. Karagatsē Ho kitrinos phakelos by M. Karagatsēs (isbn: 9789600500578)
Живот в гроба : Книга за войната by Stratēs Myribēlēs (isbn: 9789600501490)
I megali chimaira by M. Karagatsis (isbn: 9600510121)
Uncle Petros and Goldbach's Conjecture by Apostolos Doxiadis (isbn: 9781582341286)
Eat, Drink and be Married by Eve Makis (isbn: 9780552772990)
The Last Temptation of Christ by Nikos Kazantzakis (isbn: 9780684852560)
Thirty-Eight Days of Rain by Eva Asprakis (isbn: 9781399976930)
Ζητειται ελπις by Αντώνης Σαμαράκης (isbn: 9789600310511)
Η μητερα του σκυλου by Παυλος Ματεσις (isbn: 9789600304824)
Zorba the Greek by Nikos Kazan

As a second step, the reviews of all the selected books are downloaded. Note that the Goodreads API demands a developer key, which is not included in this public notebook. 

In [None]:

## Functions to facilitate webscraping
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    else:
        return True

user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15'
]

def select_agent():
    n=random.randint(0,len(user_agents)-1)
    return user_agents[n]

In [None]:
dir = 'Reviews'


baseUrl = 'https://www.goodreads.com/book/isbn/'


def get_reviews(isbn):
    
    time.sleep(3)

    apiCall = '{}{}?key={}'.format(baseUrl,isbn,key )

    print(apiCall)

    headers = {'User-Agent': select_agent() }
    response = requests.get( apiCall, headers=headers)
    print(response.status_code)
    
    ## Save data about ISBN
    with open(os.path.join('Data',f'{isbn}.xml'),'w',encoding='utf-8') as outhtml:
        outhtml.write(response.text)
        
    ## Find reviews
    root = ET.fromstring(response.text)
    title = root.find( 'book/title' ).text
    reviewsWidget = root.find( 'book/reviews_widget' ).text

    soup = BeautifulSoup( reviewsWidget ,"lxml")
    links = soup.find_all("iframe")
    out = open( os.path.join( 'Reviews', f'reviews_{ isbn }.txt') , 'w' )
    pages_left = True

    for l in links:
        url = l.get("src")
        
        i = 1
        while pages_left == True:
            if re.search( r'[&]page' , url,re.IGNORECASE):
                url = url[:url.index('&page')]
            url += '&page=' + str(i)
            print(f" {url}")
            
            headers = {'User-Agent': select_agent() }
            
            response = requests.get( url , headers=headers)
            print(response.status_code)
            if response:
                response.encoding = 'utf-8'
                soup = BeautifulSoup( response.text ,"lxml")
                #print(soup.prettify)
   
                texts = soup.findAll(text=True)
                visible_texts = filter(tag_visible, texts)  
                full_text = u" ".join(t.strip() for t in visible_texts)
                if re.search(r'No\s+reviews\sfound',full_text,re.IGNORECASE):
                    pages_left = False
                else:
                    i+=1
                    with open(os.path.join( dir, f'reviews_{i}_html_{ isbn }.html'),'w',encoding='utf-8') as out:
                        out.write(str(soup.prettify()))
                        
    out.close()


In [None]:
for isbn in dutch_isbns:
    try:
        get_reviews(isbn)
    except:
        print(f'Problem with ISBN {isbn}')
        
for isbn in dutch_isbns:
    try:
        get_reviews(isbn)
    except:
        print(f'Problem with ISBN {isbn}')

The cell below identifies the IDs of all the users who have written reviews of the books by the Greek and Dutch authors. 

In [None]:
dir = 'Reviews'
users = []

reviews = os.listdir(dir)
reviews = [review for review in reviews if re.search('html$', review)]

for review in reviews:
    time.sleep(3)
    out = open(review,'w',encoding='utf-8')
    print(review)

    path = os.path.join(dir,review)

    file = open(path,encoding='utf-8')
    html_page = file.read()

    soup = BeautifulSoup(html_page,"lxml")
    reviewLinks = soup.find_all("link")
    
    for r in reviewLinks:
        #print(r)
        reviewUrl = r.get("href")

        if re.search( r'goodreads.*[/]review[/]show' , reviewUrl ):
            print(reviewUrl)

            headers = {'User-Agent': select_agent() }
            response = requests.get(reviewUrl,headers=headers)
            if response:
                response.encoding = 'utf-8'
                soup = BeautifulSoup( response.text ,"lxml")
                review = soup.find( 'div' , itemprop='reviewBody' )
                full_text = re.sub( '\.' , '. ' , str(review.text) )
                full_text = re.sub( '!' , '! ' , str(full_text) )
                full_text = re.sub( '\s+' , ' ' , str(full_text) )
                out.write(full_text)
                
                user = soup.find( 'a' , {'class':'userReview'} )
                user_link = user.get('href')
                users.append(user_link)
                
    out.close()

As next step, we request all the information about these users user the API available via [https://www.goodreads.com/user/show/](https://www.goodreads.com/user/show/).

In [None]:
for u in users:
    u = os.path.basename(u)
    if re.search(r'[-]',u):
        u = u[:u.index('-')]
    print(u)
    url = f'https://www.goodreads.com/user/show/{u}.xml?key={key}'
    headers = {'User-Agent': select_agent() }
    response = requests.get(url,headers=headers)
    print(url)
    print(response.status_code)
    if response:
        response.encoding = 'utf-8'
        with open(f'Users/{u}.xml','w',encoding='utf-8') as out:
            out.write(response.text)
            

As a final step, we select the users who have specified in the &lt;location&gt; field that they reside either in Greece or in the Netherlands.

In [None]:
greek_users = []
dutch_users = []

dir = 'Users'

users = os.listdir(dir)
users = [user for user in users if re.search('xml$', user)]

for user_xml in users:
    xml = open(join(dir,user_xml),encoding='utf-8').read()
    soup = BeautifulSoup( xml ,"lxml")
    user_id = soup.find("id").text.strip()
    name = soup.find("name").text.strip()
    print(user_id)
    loc = soup.find("location").text.strip()
    if re.search(r'\w+' ,loc):
        print(loc)
        if re.search(r'gree[ck]',loc,re.IGNORECASE):
            greek_users.append( [user_id,name,loc] )

        elif re.search(r'nether',loc,re.IGNORECASE):
            dutch_users.append( [user_id,name,loc] )
    

Using the method implemented in this notebook, we were able top identify 377 Greek goodReads users and 345 Dutch goodReads users. The IDS and the names of these users are saved in files named "dutch_users.txt" and "greek_users.txt"

In [None]:
print(len(greek_users))
print(len(dutch_users))

In [None]:
with open('dutch_users.txt','w',encoding='utf=8') as out:
    out.write('id\nname\tlocation\n')
    for u in dutch_users:
        out.write(f'{u[0]}\t{u[1]}\t{u[2]}\n')
        
with open('greek_users.txt','w',encoding='utf=8') as out:
    out.write('id\nname\tlocation\n')
    for u in greek_users:
        out.write(f'{u[0]}\t{u[1]}\t{u[2]}\n')