In [1]:
import os
from os.path import join
import pandas as pd
import re
from tqdm import tqdm
from datetime import datetime
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
# Rebinds `stopwords` from the NLTK corpus reader to the list of English stop words.
stopwords = stopwords.words('english')
# One single-character string per entry of string.punctuation.
punctuation_marks = [char for char in string.punctuation]
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
# Single VADER analyzer instance, reused by the sentiment loop in a later cell.
ana = SentimentIntensityAnalyzer()


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    Each shortest run from a ``<`` to the next ``>`` on the same line is
    replaced by the empty string; all other characters are kept as they are.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

In [2]:
# Build a lookup from Amazon ASIN to ISBN-13 out of the tab-separated list
# files found in `directory`.
directory = 'Lists'
books = dict()

# Skip entries whose name starts with a dot (hidden files).
files = [file for file in os.listdir(directory) if not file.startswith('.')]

# The loop variable is `list_file` rather than `list`, so the built-in `list`
# is not shadowed for the rest of the kernel session.
for list_file in files:
    path = os.path.join(directory, list_file)
    df = pd.read_csv(path, sep='\t')
    urls = df['amazon_product_url'].tolist()
    isbns = df['primary_isbn13'].tolist()
    for url, isbn13 in zip(urls, isbns):
        # The ASIN is the last path segment of the product URL, without the
        # query string. split() also handles URLs that contain no '?', where
        # str.index() would raise ValueError.
        asin = os.path.basename(url).split('?', 1)[0]
        books[asin] = isbn13

print(len(books))

182


In [3]:
# Load the "raw_review_Books" configuration of the
# "McAuley-Lab/Amazon-Reviews-2023" dataset through the Hugging Face
# `datasets` library (the logged output shows it being read from the local cache).
from datasets import load_dataset
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Books")

Found cached dataset amazon-reviews-2023 (/Users/verhaarpaf/.cache/huggingface/datasets/McAuley-Lab___amazon-reviews-2023/raw_review_Books/0.0.0/16b76e0823d73bb8cff1e9c5e3e37dbc46ae3daee380417ae141f5e67d3ea8e8)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# Collect, per ISBN, the title and body of every review whose ASIN is a key of
# `books`. Each review contributes one "<title> <text>\n" chunk.
review_chunks = dict()

for record in tqdm(dataset["full"]):
    asin = record['asin']
    if asin in books:
        # Appending to a list and joining once afterwards avoids re-copying the
        # ever-growing string for every matching review.
        chunk = record['title'] + ' ' + record['text'] + '\n'
        review_chunks.setdefault(books[asin], []).append(chunk)

# `full_text` maps each ISBN to the concatenation of its chunks, in the order
# the reviews were encountered.
full_text = {key: ''.join(chunks) for key, chunks in review_chunks.items()}

100%|██████████████████████████████| 2863589/2863589 [03:10<00:00, 15011.69it/s]


In [5]:
# Write the concatenated, tag-stripped reviews of each book to its own
# UTF-8 text file named reviews_<isbn>.txt.
out_dir = 'Amazon_reviews'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for isbn in full_text:
    path = join(out_dir, f'reviews_{isbn}.txt')
    plain_text = remove_html_tags(full_text[isbn].strip())
    # The context manager closes the file even if write() raises.
    with open(path, 'w', encoding='utf-8') as out:
        out.write(plain_text)

In [6]:
# Build one record per review of a listed book: identifiers, rating, date,
# token and sentence counts, and word-level VADER sentiment totals.
review_id = 0
reviews = []

for record in tqdm(dataset["full"]):
    if record['asin'] not in books:
        continue

    data = dict()
    review_id += 1
    # Zero-padded running number, unique within this run.
    data['review_id'] = "{:07d}".format(review_id)
    data['user_id'] = record['user_id']
    data['rating'] = record['rating']
    data['asin'] = record['asin']
    data['isbn'] = books[record['asin']]

    # The division by 1000 treats the stored timestamp as milliseconds.
    # NOTE: datetime.utcfromtimestamp() is deprecated since Python 3.12;
    # datetime.fromtimestamp(ts, tz=timezone.utc) is the replacement.
    ts = int(record['timestamp']) / 1000
    data['date'] = datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')

    ## textual analysis
    text = record['text'].lower()
    # Tokens that consist of a single punctuation mark are not counted as words.
    words = [word for word in word_tokenize(text) if word not in punctuation_marks]
    data['nr_tokens'] = len(words)
    data['nr_sentences'] = len(sent_tokenize(text))

    sum_positive = 0
    sum_negative = 0
    sum_compound = 0

    positive_words = 0
    negative_words = 0

    for word in words:
        # Score each word once and reuse the result for every statistic.
        scores = ana.polarity_scores(word)
        sum_positive += scores['pos']
        sum_negative += scores['neg']
        sum_compound += scores['compound']

        if scores['pos'] > 0.7:
            positive_words += 1
        # 'pos' and 'neg' are proportions and never negative, so a negative
        # word has to be detected through a high 'neg' score; comparing
        # 'pos' against -0.7 could never be true.
        if scores['neg'] > 0.7:
            negative_words += 1

    data['positive'] = sum_positive
    data['negative'] = sum_negative
    data['compound'] = sum_compound
    # Counts of strongly positive / strongly negative words. They are stored
    # after the existing keys so the order of the earlier columns is unchanged.
    data['positive_words'] = positive_words
    data['negative_words'] = negative_words
    reviews.append(data)


100%|██████████████████████████████| 2863589/2863589 [03:27<00:00, 13822.38it/s]


In [7]:
# One row per entry of `reviews`; rebinds the name `df` used earlier for the list files.
df = pd.DataFrame(reviews)

In [16]:
# Aggregate the per-review table to one row per ISBN in a single groupby pass,
# then expose each statistic as a plain list ordered by the group keys.
per_isbn = df.groupby('isbn').agg(
    nr_reviews=('review_id', 'count'),
    average_rating=('rating', 'mean'),
    nr_tokens=('nr_tokens', 'sum'),
    nr_sentences=('nr_sentences', 'sum'),
    positive=('positive', 'sum'),
    negative=('negative', 'sum'),
    compound=('compound', 'sum'),
)

isbn = per_isbn.index.values.tolist()
nr_reviews = per_isbn['nr_reviews'].tolist()
average_rating = per_isbn['average_rating'].tolist()
nr_tokens = per_isbn['nr_tokens'].tolist()
nr_sentences = per_isbn['nr_sentences'].tolist()
positive = per_isbn['positive'].tolist()
negative = per_isbn['negative'].tolist()
compound = per_isbn['compound'].tolist()



In [21]:
# Assemble the per-ISBN table. The summed sentiment scores are divided by the
# token count of the book's reviews, and the mean number of tokens per review
# is appended as `average_tokens`.
df_isbn = pd.DataFrame(
    {
        'isbn': isbn,
        'nr_reviews': nr_reviews,
        'average_rating': average_rating,
        'nr_tokens': nr_tokens,
        'nr_sentences': nr_sentences,
        'positive': positive,
        'negative': negative,
        'compound': compound,
    }
).assign(
    average_tokens=lambda table: table['nr_tokens'] / table['nr_reviews'],
    positive=lambda table: table['positive'] / table['nr_tokens'],
    negative=lambda table: table['negative'] / table['nr_tokens'],
    compound=lambda table: table['compound'] / table['nr_tokens'],
)
df_isbn.head()

Unnamed: 0,isbn,nr_reviews,average_rating,nr_tokens,nr_sentences,positive,negative,compound,average_tokens
0,9780062356345,12,4.5,977,67,0.047083,0.032753,0.005343,81.416667
1,9780062434029,8,4.625,330,18,0.081818,0.018182,0.027911,41.25
2,9780062667632,36,2.416667,3800,221,0.04,0.033684,0.00373,105.555556
3,9780062671189,8,4.375,1052,54,0.052281,0.019962,0.015109,131.5
4,9780062834843,45,3.777778,3143,187,0.059179,0.031499,0.010964,69.844444


In [23]:
# Save the per-ISBN statistics as CSV; with the default arguments the
# DataFrame index is written as well, as an unnamed first column.
df_isbn.to_csv('amazon_statistics.csv')