# **DATA PREPARATION**

Importing useful libraries

In [None]:
import gzip
import json
import re
import os
import sys
import numpy as np
import pandas as pd

In [None]:
# Mount Google Drive at /content/gdrive so that files stored there can be read and written
# (the google.colab module is only available inside Colab).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Full path of the gzipped reviews file on the mounted Drive.
# Note: despite its name, `directory` points to a file, not to a folder.
directory = (
    '/content/gdrive/MyDrive/Text Mining/'
    'Progetto_Brambatti_Fracchia_Privitera/Data/'
    'goodreads_reviews_mystery_thriller_crime.json.gz'
)

In [None]:
# Read the gzipped file as text and parse it one line at a time: every line holds one
# JSON object. Iterating over the file handle avoids keeping all the raw lines in memory
# (as f.readlines() did) in addition to the parsed objects.
# The encoding is stated explicitly so that the result does not depend on the locale.
with gzip.open(directory, 'rt', encoding='utf-8') as f:  # 'rt' --> read text
    data_list = [json.loads(line) for line in f]

# Now 'data_list' contains a list of JSON objects (one dict per line of the file)
print(data_list[:10])

[{'user_id': '8842281e1d1347389f2ab93d60773d4d', 'book_id': '6392944', 'review_id': '5e212a62bced17b4dbe41150e5bb9037', 'rating': 3, 'review_text': "I haven't read a fun mystery book in a while and not sure I've ever read Poirot. Was looking for a fun read set in France while I was on holiday there and this didn't disappoint! Fast paced and good mystery. \n One that struck me was how similar Poirot is to Sherlock. They are both detectives, have a ex-military sidekick who is telling the story, and solve mysteries using their superior wit. Poirot seems like a French Sherlock. I'm curious if he was inspired by Sherlock.", 'date_added': 'Mon Jul 24 02:48:17 -0700 2017', 'date_updated': 'Sun Jul 30 09:28:03 -0700 2017', 'read_at': 'Tue Jul 25 00:00:00 -0700 2017', 'started_at': 'Mon Jul 24 00:00:00 -0700 2017', 'n_votes': 6, 'n_comments': 0}, {'user_id': '8842281e1d1347389f2ab93d60773d4d', 'book_id': '28684704', 'review_id': '2ede853b14dc4583f96cf5d120af636f', 'rating': 3, 'review_text': 'A

We have loaded our dataset; now the goal is to convert it into a DataFrame, so that each field of a document becomes a column.

In [None]:
# Build the DataFrame from the list of parsed JSON objects: one row per object,
# one column per key.
df_reviews = pd.DataFrame(data_list)

In [None]:
# Overview of the columns, their dtypes and the memory usage
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1849236 entries, 0 to 1849235
Data columns (total 11 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   user_id       object
 1   book_id       object
 2   review_id     object
 3   rating        int64 
 4   review_text   object
 5   date_added    object
 6   date_updated  object
 7   read_at       object
 8   started_at    object
 9   n_votes       int64 
 10  n_comments    int64 
dtypes: int64(3), object(8)
memory usage: 155.2+ MB


From df.info() we notice that our dataset has 1849236 documents. Now in order to better understand the structure we will perform some data exploration.

In [None]:
# Peek at the first rows of the dataset
df_reviews.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,6392944,5e212a62bced17b4dbe41150e5bb9037,3,I haven't read a fun mystery book in a while a...,Mon Jul 24 02:48:17 -0700 2017,Sun Jul 30 09:28:03 -0700 2017,Tue Jul 25 00:00:00 -0700 2017,Mon Jul 24 00:00:00 -0700 2017,6,0
1,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
2,8842281e1d1347389f2ab93d60773d4d,32283133,8e4d61801907e591018bdc3442a9cf2b,0,http://www.telegraph.co.uk/culture/10...,Tue Nov 01 11:09:18 -0700 2016,Tue Nov 01 11:09:44 -0700 2016,,,9,0
3,8842281e1d1347389f2ab93d60773d4d,17860739,022bb6daffa49adc27f6b20b6ebeb37d,4,An amazing and unique creation: JJ Abrams and ...,Wed Mar 26 13:51:30 -0700 2014,Tue Sep 23 01:44:36 -0700 2014,Sun Sep 21 00:00:00 -0700 2014,Sat Jul 26 00:00:00 -0700 2014,7,0
4,8842281e1d1347389f2ab93d60773d4d,8694005,0e317947e1fd341f573192111bb2921d,3,The Name of the Rose is a thrilling Dan Brown-...,Wed Sep 08 01:22:27 -0700 2010,Wed Dec 14 12:30:43 -0800 2016,Mon Aug 10 00:00:00 -0700 2015,Mon Jul 20 00:00:00 -0700 2015,17,6


In [None]:
# Descriptive statistics of the numeric columns (rating, n_votes, n_comments)
df_reviews.describe()

Unnamed: 0,rating,n_votes,n_comments
count,1849236.0,1849236.0,1849236.0
mean,3.683981,1.094315,0.2695005
std,1.208838,6.319025,2.042459
min,0.0,-3.0,-1.0
25%,3.0,0.0,0.0
50%,4.0,0.0,0.0
75%,5.0,1.0,0.0
max,5.0,904.0,348.0


In [None]:
# Are there missing (NaN) values? Count them for every column
df_reviews.isna().sum()

user_id         0
book_id         0
review_id       0
rating          0
review_text     0
date_added      0
date_updated    0
read_at         0
started_at      0
n_votes         0
n_comments      0
dtype: int64

In [None]:
# Are there reviews whose text is the empty string? (these are not counted by isna())
df_reviews[df_reviews['review_text']=='']

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
18459,fea9b0a54f57f9be780c4e8404b388fb,470693,3bf140bf5aa19174db227f970e9f3fdf,2,,Sun Mar 10 16:49:45 -0700 2013,Thu Mar 14 16:24:11 -0700 2013,Thu Mar 14 16:24:11 -0700 2013,Sun Mar 10 00:00:00 -0800 2013,2,0
23336,0223a9592bfaf2edce5a348a293c254b,25010773,29f1ba2e61515f9cece61b007b42f272,0,,Wed Jul 01 18:17:08 -0700 2015,Wed Jul 01 19:03:32 -0700 2015,,,7,0
26937,acea1d6a9e2df9c268fc65d6816909df,13573236,4bf0d1942bcf8e21edae732fef4608f2,3,,Sun Nov 18 14:16:11 -0800 2012,Sun Dec 16 02:27:29 -0800 2012,Thu Nov 29 00:00:00 -0800 2012,,0,0
29742,82806811a06d3f90defce2254845533d,22557272,2ee1326a399e7bc363804a5a97043898,4,,Wed Sep 30 10:16:13 -0700 2015,Sat Oct 03 12:42:15 -0700 2015,Fri Oct 02 00:00:00 -0700 2015,Fri Oct 02 00:00:00 -0700 2015,0,0
37574,e6d58522010659d7b1dba59eda9c9be6,18246727,ffe45791cb44946bb4e190fa9c2b8eb7,4,,Sun Aug 18 18:22:29 -0700 2013,Thu Sep 18 12:29:14 -0700 2014,Thu Sep 18 12:29:14 -0700 2014,Sat Jan 04 05:37:16 -0800 2014,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1799349,841a456926aa59d3f07ad8faa582837a,10770608,db403d78fda92e06089993cba54b3dcf,5,,Tue Jun 07 14:06:02 -0700 2016,Tue Jun 07 14:06:03 -0700 2016,Tue Jun 07 14:06:03 -0700 2016,,0,0
1811646,8e5fb9c2586f71e6021285b99921c43b,13359067,72b32ddd742fe5d637f91e73b303482e,0,,Sat Jan 24 09:45:32 -0800 2015,Wed Mar 04 16:59:41 -0800 2015,Sun Feb 15 00:00:00 -0800 2015,Sat Jan 24 00:00:00 -0800 2015,0,0
1823892,15373d9434d75dc2c4983f1e1a05259d,35184151,cec2c56779939df34d60b16919c44c87,0,,Thu Jul 06 16:24:18 -0700 2017,Thu Jul 06 16:25:57 -0700 2017,,,2,0
1826336,684c46ddacdac578bfadd4f3ca5872ad,830502,070e5f10eb75152f15898262486c30f1,5,,Thu Aug 10 03:24:50 -0700 2017,Fri Aug 25 04:19:17 -0700 2017,Fri Aug 25 04:19:17 -0700 2017,Thu Aug 10 00:00:00 -0700 2017,0,0


Yes, there are 384 empty texts. Since our goal is to classify reviews and find topics in them, we have to drop these rows.

Let's drop them.

In [None]:
# Number of rows and columns at this point
df_reviews.shape

(1849236, 11)

In [None]:
# Keep only the reviews whose text is not the empty string, then show the new shape
has_text = df_reviews['review_text'].ne('')
df_reviews = df_reviews[has_text]
df_reviews.shape

(1848852, 11)

In [None]:
# Sanity check: the selection of empty review texts should now return no rows
df_reviews[df_reviews['review_text']=='']


Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments


In [None]:
# These are some reviews that end up as null values after the pre-processing steps in code 02.
# To understand what happens we inspect their raw text: they are numbers and links,
# which the pre-processing steps remove. We will handle them later.
inspected_ids = [
    'e28f880531a4f0adaaf954ac9b9f9d9f',
    'f727301de2280e84d2af3645bc5adb3e',
    '5e019779095c46a52971b2777d9313c4',
]
for position, rid in enumerate(inspected_ids):
    matched_text = df_reviews.loc[df_reviews['review_id'] == rid, 'review_text']
    # a blank line follows every printed Series except the last one
    separator = "\n" if position < len(inspected_ids) - 1 else ""
    print(f"{matched_text}{separator}")


138300    8.5
Name: review_text, dtype: object

1218217    ----1.5----
Name: review_text, dtype: object

1126528    http://www.girllostinabook.com/2013/1...
Name: review_text, dtype: object


Now we don't have empty strings.
Let's go on with data exploration.


How many times has a given book been read?
We count the reviews associated with each book_id, since every review means that the user has read the book.

In [None]:
# Number of reviews (non-null user_id values) per book; the ten most reviewed books first
reviews_per_book = df_reviews.groupby('book_id')['user_id'].count()
reviews_per_book.sort_values(ascending=False).head(10)

book_id
22557272    13401
8442457      9310
2429135      6701
17212231     4875
16160797     4539
19486412     4397
968          4333
5886881      4193
19288043     3636
5060378      3544
Name: user_id, dtype: int64

In [None]:
# Number of non-null entries of every column for each (book_id, rating) pair
df_reviews.groupby(['book_id','rating']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,review_id,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
book_id,rating,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10000063,0,3,3,3,3,3,3,3,3,3
10000063,1,2,2,2,2,2,2,2,2,2
10000063,2,4,4,4,4,4,4,4,4,4
10000063,3,23,23,23,23,23,23,23,23,23
10000063,4,24,24,24,24,24,24,24,24,24
...,...,...,...,...,...,...,...,...,...,...
9999690,4,2,2,2,2,2,2,2,2,2
9999742,2,1,1,1,1,1,1,1,1,1
9999742,3,2,2,2,2,2,2,2,2,2
9999742,5,1,1,1,1,1,1,1,1,1


In [None]:
# Print the text of one specific review, selected by its review_id
print(df_reviews[df_reviews['review_id'] == '9387cd3a5bd8dc142ae019e4bc4229a8']['review_text'])

# Preview the reviews with rating 0. A notebook only renders the LAST expression of a
# cell: placed before the print (as it originally was) this result was silently discarded.
df_reviews[df_reviews['rating'] == 0].head()

278    I don't think I've read this one! I might have...
Name: review_text, dtype: object


In [None]:
# Number of reviews for every rating value (index: rating, values: review count)
rating_count = df_reviews.groupby('rating')['review_id'].count()
rating_count

rating
0     57624
1     51691
2    142573
3    420477
4    669322
5    507165
Name: review_id, dtype: int64

Since we have a huge dataset, we compute a sample size that still gives a significant sample.
To do so, we take 10% of each rating class.

In [None]:
# Fraction of every rating class that goes into the sample (10%)
SAMPLE_FRACTION = 0.1

# Despite its name, `percentage` holds the NUMBER of rows to sample for each rating,
# in the same order as the index of `rating_count`.
percentage = []
for i, count in rating_count.items():  # .items() yields (rating, count) pairs
    percentage.append(round(count * SAMPLE_FRACTION))
    print(f'The percentage of rating {i} is: {percentage[-1]}')  # percentage[-1] is the value just appended

The percentage of rating 0 is: 5762
The percentage of rating 1 is: 5169
The percentage of rating 2 is: 14257
The percentage of rating 3 is: 42048
The percentage of rating 4 is: 66932
The percentage of rating 5 is: 50716


Using the sample sizes computed in the cell above, we now sample each stratum of the rating column.

In [None]:
# Order the reviews by ascending rating
df_reviews = df_reviews.sort_values('rating')

In [None]:
# Renumber the rows from 0; the previous index is kept as a new 'index' column
df_reviews = df_reviews.reset_index()
df_reviews.head()

Unnamed: 0,index,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,1411257,0a7f6d4147838f13b94d895dd7115fb0,33380132,6651103ac3a8460292183b0991f23a72,0,I've missed reading about Madison and Fab! The...,Mon Jan 30 06:35:47 -0800 2017,Wed Feb 22 06:42:16 -0800 2017,Wed Feb 22 06:42:16 -0800 2017,Wed Feb 01 00:00:00 -0800 2017,2,0
1,163364,5b58dd727b90f611a739f6cbc4340b1f,18281481,2cecf67ddb2ef4c512fd6e7e83a2c631,0,"A Tap at the Window, by Linwood Barclay, a-min...",Mon Aug 19 03:16:15 -0700 2013,Mon Aug 19 03:19:20 -0700 2013,Thu Aug 01 00:00:00 -0700 2013,,1,0
2,163363,5b58dd727b90f611a739f6cbc4340b1f,16191493,92385a244e66c8b4a38c475410cb4221,0,"Paying the Piper, by Simon Wood, a-minus, Narr...",Mon Aug 19 03:22:23 -0700 2013,Mon Aug 19 03:25:27 -0700 2013,Thu Aug 01 00:00:00 -0700 2013,,2,0
3,163362,5b58dd727b90f611a739f6cbc4340b1f,16089120,b472d0a9bd3f538f0fa0b6ebbba6ffc8,0,"Justice for Sara, by Erica Spindler, b-plus, N...",Mon Aug 19 03:28:09 -0700 2013,Mon Aug 19 03:31:06 -0700 2013,Thu Aug 01 00:00:00 -0700 2013,,0,0
4,163361,5b58dd727b90f611a739f6cbc4340b1f,16236388,b4dd958fc086c23e6fc4e4d884a4e434,0,"Terminated, by Simon Wood, b-plus, Narrated by...",Mon Aug 19 03:34:28 -0700 2013,Mon Aug 19 03:37:02 -0700 2013,Thu Aug 01 00:00:00 -0700 2013,,0,0


We notice that there are some positive reviews (great fun, OMG great) with rating 0 that in an ordered scale means the worst judgment. So we decided to better inspect this class.

In [None]:
# Inspect ten reviews with rating 0, selected by position (32180 to 32189) within that class
df_reviews[df_reviews["rating"] == 0].iloc[32180:32190]

Unnamed: 0,index,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
32180,363624,8d6c673c2d40b4c04ef6b2c53f06198e,2198274,5eae3fb4a52a2ab6d45bf51f3eabccbe,0,Never really could get in the book. Just not v...,Mon Nov 24 13:25:45 -0800 2008,Tue Nov 25 13:23:11 -0800 2008,Fri Aug 01 00:00:00 -0700 2008,,0,0
32181,972001,6b0619330c490a5c841067b02c63f0c7,32143,6d3e6bbc86191824177425c86987333f,0,Loved it! Different from the first but still a...,Wed Mar 01 04:26:21 -0800 2017,Mon Mar 27 13:14:29 -0700 2017,Thu Mar 16 00:00:00 -0700 2017,Wed Mar 01 00:00:00 -0800 2017,0,0
32182,971997,ecfbc8ad98239d5cfb575575e207d672,259538,01338cada554e3d8968d7244455899e4,0,"Spelbinding, great typical Robin Cook",Wed May 19 01:16:39 -0700 2010,Thu May 20 03:50:28 -0700 2010,,Wed May 19 00:00:00 -0700 2010,0,0
32183,363642,3c0d948fa60ab08d56eb2e835e945211,985883,fd9d32d6600c37f5b0fd15565ac8884c,0,"Hsnan..mzlt f~ bdyth,wlkn~ kl`d@ l stTy` lSmt ...",Sun Oct 24 06:33:36 -0700 2010,Tue Nov 02 06:14:31 -0700 2010,Tue Nov 02 00:00:00 -0700 2010,Sun Oct 24 00:00:00 -0700 2010,0,0
32184,971961,4474cd0f462e49bdac2d31365539d11d,30530244,0eb004c4af8ac822748d679792c8dfe7,0,This was a really interesting little glimpse i...,Thu Sep 08 11:02:11 -0700 2016,Thu Oct 06 05:47:41 -0700 2016,Thu Oct 06 07:49:40 -0700 2016,Mon Oct 03 00:00:00 -0700 2016,2,0
32185,338801,64d8babd0e660770b9e6082ff2077874,380975,fc909bbe5b8228849cc4faef8a062b36,0,When I was in second grade I was in my Nancy D...,Sat Apr 06 09:46:52 -0700 2013,Thu Apr 11 08:57:39 -0700 2013,,Sat Apr 06 00:00:00 -0700 2013,0,0
32186,971927,3474b21aecce1d516f6ce04ee66eda41,6854,d0bfc3c5945ef3cf7f226650773de66a,0,Just amazing.,Tue Dec 02 15:51:25 -0800 2014,Tue Dec 02 15:51:25 -0800 2014,,,0,0
32187,971926,3474b21aecce1d516f6ce04ee66eda41,6422,b24483b6e48d9fc357709ae3bd84ef92,0,I just want to be stephanie plum.,Tue Dec 02 15:52:51 -0800 2014,Tue Dec 02 15:52:51 -0800 2014,,,0,0
32188,971918,3474b21aecce1d516f6ce04ee66eda41,21480930,746a27b9a6512ad424b010a529e40d5f,0,DNFed at 40%. I don't know why I just couldn't...,Sun Aug 23 00:40:08 -0700 2015,Tue Feb 23 02:12:11 -0800 2016,Tue Feb 23 02:12:11 -0800 2016,Fri Feb 19 00:00:00 -0800 2016,1,0
32189,338705,ce71763dac4e11c086c5019b841809e6,31455693,fd45172da8069f4026d18dd2ba052a74,0,i know I say this after every book in a series...,Tue Aug 16 12:00:28 -0700 2016,Thu Aug 18 11:03:05 -0700 2016,Wed Aug 17 00:00:00 -0700 2016,Tue Aug 16 00:00:00 -0700 2016,0,0


In [None]:
# Select the reviews with rating 0 once, instead of re-filtering the whole DataFrame
# for every single print.
zero_rated_reviews = df_reviews[df_reviews["rating"] == 0]

# Print the complete text of a few of them (positions are relative to the rating-0 subset),
# each followed by a blank line.
for position in (32181, 32184, 32186, 32187):
    print(f'{zero_rated_reviews.iloc[position]["review_text"]}\n')

Loved it! Different from the first but still a good read!

This was a really interesting little glimpse into futuristic cooking and the moral dilemmas that could arise if kitchen robots were effectively given the power to make health decisions for us ... maybe even to point of euthanasia. 
 However, while I found the trial and the technology descriptions fascinating, the juror protagonist Julio left A LOT to be desired. He was simaltaneously quite one dimensional and also a misogynist. I kept wishing we could have the perspective of literally ANY of the other jurors but him.

Just amazing.

I just want to be stephanie plum.



From the cell above we can understand that 0 is probably the default rating, assigned automatically to a review when the user doesn't express a rating. So we decided to drop this class: we cannot assign another label to these reviews, since the rating is a subjective judgment of the user.

In [None]:
# Discard every review whose rating is 0, then show the new shape
is_rated = df_reviews["rating"] != 0
df_reviews = df_reviews[is_rated]
df_reviews.shape

(1791228, 12)

We remove the percentage relative to the 0 class

In [None]:
# Remove the sample size of the dropped rating-0 class, i.e. the first entry of the list.
# The guard makes the cell safe to re-run: a bare pop(0) executed a second time would
# also throw away the entry of rating 1. `rating_count` still lists every rating
# (0 included), so the lengths match only while the rating-0 entry is still present.
if len(percentage) == len(rating_count):
    percentage.pop(0)
percentage

[5169, 14257, 42048, 66932, 50716]

In [None]:
# Show the per-rating sample sizes currently stored in `percentage`
percentage

[5169, 14257, 42048, 66932, 50716]

The dataset is now ordered by rating class. The goal is to sample each stratum according to the sample sizes above.

In [None]:
# Stratified sampling: draw a fixed number of reviews from every rating class.
# `percentage` is the list with the number of rows to extract for each rating value,
# expected in ascending rating order. pandas and numpy are imported in the first cell.

# Setting seed
seed_value = 42
np.random.seed(seed_value)

# Sort the rating values explicitly so that they line up with `percentage`
# whatever the current row order of df_reviews is (unique() follows row order).
ratings = sorted(df_reviews['rating'].unique())

# zip() silently stops at the shorter input: fail loudly instead of sampling
# only part of the classes (e.g. when `percentage` lost or kept an extra entry).
if len(ratings) != len(percentage):
    raise ValueError(
        f"{len(ratings)} rating classes found but {len(percentage)} sample sizes given"
    )

subset_list = []

# Loop to extract rows for each rating
for rating, num_rows in zip(ratings, percentage):
    # Extract the specified number of rows for the current rating;
    # random_state makes the draw reproducible
    df_reviews_sample_by_rating = df_reviews[df_reviews['rating'] == rating].sample(n=num_rows, random_state=seed_value)

    # Add the subset to the list
    subset_list.append(df_reviews_sample_by_rating)

    print(f"Subset for rating {rating}:")
    print(df_reviews_sample_by_rating.shape)

# Concatenating all subsets; ignore_index renumbers the rows from 0
df_reviews_sample = pd.concat(subset_list, ignore_index=True)

# Display the shape of the final sampled DataFrame
print(df_reviews_sample.shape)

Subset for rating 1:
(5169, 12)
Subset for rating 2:
(14257, 12)
Subset for rating 3:
(42048, 12)
Subset for rating 4:
(66932, 12)
Subset for rating 5:
(50716, 12)
(179122, 12)


In [None]:
# First rows of the sampled DataFrame
df_reviews_sample.head()

Unnamed: 0,index,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,1367449,c8d45b7d0f96c3ff949bb45100abcba2,18045891,6b3ea87e4a1c4ad1fdb0e31451f3e1bc,1,Incredibly disturbing and creepy. Ick. Not my ...,Sun Jan 17 19:48:26 -0800 2016,Tue Jan 26 21:09:06 -0800 2016,Tue Jan 26 00:00:00 -0800 2016,Sun Jan 17 00:00:00 -0800 2016,0,0
1,404951,58d35dd921ef25e4016df56ee28c4334,298638,5aab81e469115a790fd9a24afdfedde4,1,Mostly boring,Wed Jan 28 20:08:37 -0800 2015,Wed Jan 28 20:09:09 -0800 2015,Tue Jun 01 00:00:00 -0700 1999,,0,0
2,1595389,c2a085c9f5d627f105dae8e0d1692ced,33151805,8e62ae0815512dfb45bd33d8dd904750,1,Zero. \n That's the best word I can use to des...,Tue Feb 07 11:40:20 -0800 2017,Fri May 26 04:51:33 -0700 2017,Fri May 26 04:51:33 -0700 2017,Fri May 19 00:00:00 -0700 2017,1,0
3,277184,4d5935f7e37c3bcd4d3d8586f30af15d,16316,68c4b9d83013d3dd8020e79430cc9ce1,1,This was the worst book in the world and it fe...,Sun May 28 00:15:49 -0700 2017,Mon May 29 09:25:39 -0700 2017,Mon May 29 09:25:39 -0700 2017,Sun May 28 00:15:50 -0700 2017,1,1
4,1549257,47e371d09b073e720ab6b856a4eeabfd,18492486,1ec72e33fd6ccf60dac787efed6bc658,1,I really wanted to like this book. I love dine...,Sat Jun 18 12:48:08 -0700 2016,Sat Jun 18 12:52:26 -0700 2016,Thu Jun 16 00:00:00 -0700 2016,,0,0


Let's check for non-English reviews and drop them from our sample.
This is done so that the algorithms work on a single language.


Then we will drop the column not useful for our purpose.

In [None]:
!pip install langid
import pandas as pd
import langid

# Downloading language model for langid
#langid.download()


# Adding language colum
df_reviews_sample['language'] = df_reviews_sample['review_text'].apply(lambda x: langid.classify(x)[0])

# Filtering out non english reviews
non_english_reviews = df_reviews_sample[df_reviews_sample['language'] != 'en']

# Printing non english reviews
print(non_english_reviews.shape)


Collecting langid
  Downloading langid-1.1.6.tar.gz (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langid
  Building wheel for langid (setup.py) ... [?25l[?25hdone
  Created wheel for langid: filename=langid-1.1.6-py3-none-any.whl size=1941172 sha256=5b9b4513f382c2c876da112022bb2e798f4e0bb3757280252a228fe7f3c59fd6
  Stored in directory: /root/.cache/pip/wheels/23/c8/c6/eed80894918490a175677414d40bd7c851413bbe03d4856c3c
Successfully built langid
Installing collected packages: langid
Successfully installed langid-1.1.6
(12223, 13)


In [None]:
# Number of non-English reviews for every (detected language, rating) pair
non_english_reviews.groupby(['language', 'rating']).size().reset_index(name='count')


Unnamed: 0,language,rating,count
0,af,2,4
1,af,3,21
2,af,4,47
3,af,5,36
4,an,1,3
...,...,...,...
240,zu,1,2
241,zu,2,2
242,zu,3,19
243,zu,4,28


In [None]:
# Keep only the reviews detected as English. The explicit .copy() makes the result an
# independent DataFrame, so that modifying it afterwards (dropping or adding columns)
# does not raise pandas' SettingWithCopyWarning.
df_reviews_sample = df_reviews_sample[df_reviews_sample['language'] == 'en'].copy()
print(df_reviews_sample.shape)

(166899, 13)


In [None]:
# Remove the columns that are not needed in the next steps
unused_columns = [
    'index',
    'date_added',
    'date_updated',
    'read_at',
    'started_at',
    'n_votes',
    'n_comments',
    'language',
]
df_reviews_sample = df_reviews_sample.drop(columns=unused_columns)
df_reviews_sample.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text
0,c8d45b7d0f96c3ff949bb45100abcba2,18045891,6b3ea87e4a1c4ad1fdb0e31451f3e1bc,1,Incredibly disturbing and creepy. Ick. Not my ...
1,58d35dd921ef25e4016df56ee28c4334,298638,5aab81e469115a790fd9a24afdfedde4,1,Mostly boring
2,c2a085c9f5d627f105dae8e0d1692ced,33151805,8e62ae0815512dfb45bd33d8dd904750,1,Zero. \n That's the best word I can use to des...
3,4d5935f7e37c3bcd4d3d8586f30af15d,16316,68c4b9d83013d3dd8020e79430cc9ce1,1,This was the worst book in the world and it fe...
4,47e371d09b073e720ab6b856a4eeabfd,18492486,1ec72e33fd6ccf60dac787efed6bc658,1,I really wanted to like this book. I love dine...


Now, in order to perform text classification, we have to create a label column that summarizes the rating into 2 classes.

- 1-3: negative
- 4-5: positive

*3 is negative in order to avoid class imbalance problem, and because we think that positive classes are only the ones with highest score, so everything that isn't in this class is negative.*


In [None]:
# Rating -> sentiment class: ratings 1 to 3 are 'negative', ratings 4 and 5 are 'positive'
my_dict = {
    **dict.fromkeys((1, 2, 3), 'negative'),
    **dict.fromkeys((4, 5), 'positive'),
}

In [None]:
# Derive the binary sentiment label from the rating through `my_dict`.
# .assign() returns a new DataFrame instead of writing a column into a frame that was
# obtained by filtering another one, which is what triggers SettingWithCopyWarning.
# The new 'label' column is appended after the existing columns.
df_reviews_sample = df_reviews_sample.assign(label=df_reviews_sample['rating'].map(my_dict))
df_reviews_sample.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,label
0,c8d45b7d0f96c3ff949bb45100abcba2,18045891,6b3ea87e4a1c4ad1fdb0e31451f3e1bc,1,Incredibly disturbing and creepy. Ick. Not my ...,negative
1,58d35dd921ef25e4016df56ee28c4334,298638,5aab81e469115a790fd9a24afdfedde4,1,Mostly boring,negative
2,c2a085c9f5d627f105dae8e0d1692ced,33151805,8e62ae0815512dfb45bd33d8dd904750,1,Zero. \n That's the best word I can use to des...,negative
3,4d5935f7e37c3bcd4d3d8586f30af15d,16316,68c4b9d83013d3dd8020e79430cc9ce1,1,This was the worst book in the world and it fe...,negative
4,47e371d09b073e720ab6b856a4eeabfd,18492486,1ec72e33fd6ccf60dac787efed6bc658,1,I really wanted to like this book. I love dine...,negative


In [None]:
# Renumber the rows from 0; the previous index is kept as a new 'index' column
df_reviews_sample = df_reviews_sample.reset_index()
df_reviews_sample.head()

Unnamed: 0,index,user_id,book_id,review_id,rating,review_text,label
0,0,c8d45b7d0f96c3ff949bb45100abcba2,18045891,6b3ea87e4a1c4ad1fdb0e31451f3e1bc,1,Incredibly disturbing and creepy. Ick. Not my ...,negative
1,1,58d35dd921ef25e4016df56ee28c4334,298638,5aab81e469115a790fd9a24afdfedde4,1,Mostly boring,negative
2,2,c2a085c9f5d627f105dae8e0d1692ced,33151805,8e62ae0815512dfb45bd33d8dd904750,1,Zero. \n That's the best word I can use to des...,negative
3,3,4d5935f7e37c3bcd4d3d8586f30af15d,16316,68c4b9d83013d3dd8020e79430cc9ce1,1,This was the worst book in the world and it fe...,negative
4,4,47e371d09b073e720ab6b856a4eeabfd,18492486,1ec72e33fd6ccf60dac787efed6bc658,1,I really wanted to like this book. I love dine...,negative


In [None]:
# Remove the helper 'index' column left behind by reset_index().
# errors='ignore' keeps the cell re-runnable: without it a second execution fails with
# a KeyError because the column is already gone.
df_reviews_sample = df_reviews_sample.drop(columns=['index'], errors='ignore')
df_reviews_sample.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,label
0,c8d45b7d0f96c3ff949bb45100abcba2,18045891,6b3ea87e4a1c4ad1fdb0e31451f3e1bc,1,Incredibly disturbing and creepy. Ick. Not my ...,negative
1,58d35dd921ef25e4016df56ee28c4334,298638,5aab81e469115a790fd9a24afdfedde4,1,Mostly boring,negative
2,c2a085c9f5d627f105dae8e0d1692ced,33151805,8e62ae0815512dfb45bd33d8dd904750,1,Zero. \n That's the best word I can use to des...,negative
3,4d5935f7e37c3bcd4d3d8586f30af15d,16316,68c4b9d83013d3dd8020e79430cc9ce1,1,This was the worst book in the world and it fe...,negative
4,47e371d09b073e720ab6b856a4eeabfd,18492486,1ec72e33fd6ccf60dac787efed6bc658,1,I really wanted to like this book. I love dine...,negative


Looking at the third row we have the confirmation of our intuition about class 0: the reviewer describes the book as "Zero", yet the stored rating is 1.

In [None]:
# Save the prepared sample as a CSV file on the mounted Google Drive.
# NOTE(review): index=False is not passed, so the DataFrame index is written as an extra
# unnamed first column — confirm whether the notebooks that read this file rely on it.
df_reviews_sample.to_csv('/content/gdrive/MyDrive/Text Mining/Progetto_Brambatti_Fracchia_Privitera/Data/df_reviews_sample.csv')