In [1]:
import json
import os
import sys

In [6]:
import pandas as pd
import numpy as np

In [2]:
sys.path.append("..")


In [23]:
from sklearn.model_selection import StratifiedKFold

## Amazon music dataset

In [33]:
dataset = []
with open("../data/amazon-digital-music-raw.json", 'r') as file:
    for line in file.readlines():
        dataset.append(json.loads(line))

In [34]:
dataset[2]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '02 11, 2014',
 'reviewerID': 'A2VAMODP8M77NG',
 'asin': '3426958910',
 'style': {'Format:': ' Audio CD'},
 'reviewerName': 'JTGabq',
 'reviewText': 'It was great to hear the old stuff again and I like the new stuff too. I recommend it to any Slayer fan.',
 'summary': 'SLAYER!!!!!!!!!!!!!!!!!!!!!',
 'unixReviewTime': 1392076800}

In [35]:
dataset_pd = pd.DataFrame(dataset)

In [36]:
dataset_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169781 entries, 0 to 169780
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         169781 non-null  float64
 1   vote            7611 non-null    object 
 2   verified        169781 non-null  bool   
 3   reviewTime      169781 non-null  object 
 4   reviewerID      169781 non-null  object 
 5   asin            169781 non-null  object 
 6   style           157989 non-null  object 
 7   reviewerName    169776 non-null  object 
 8   reviewText      169623 non-null  object 
 9   summary         169745 non-null  object 
 10  unixReviewTime  169781 non-null  int64  
 11  image           182 non-null     object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 14.4+ MB


In [37]:
dataset_pd.isna().sum()

overall                0
vote              162170
verified               0
reviewTime             0
reviewerID             0
asin                   0
style              11792
reviewerName           5
reviewText           158
summary               36
unixReviewTime         0
image             169599
dtype: int64

In [38]:
dataset_pd = dataset_pd.dropna(subset=["reviewText"])
dataset_pd = dataset_pd.reset_index()

In [39]:
dataset_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169623 entries, 0 to 169622
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           169623 non-null  int64  
 1   overall         169623 non-null  float64
 2   vote            7611 non-null    object 
 3   verified        169623 non-null  bool   
 4   reviewTime      169623 non-null  object 
 5   reviewerID      169623 non-null  object 
 6   asin            169623 non-null  object 
 7   style           157835 non-null  object 
 8   reviewerName    169618 non-null  object 
 9   reviewText      169623 non-null  object 
 10  summary         169611 non-null  object 
 11  unixReviewTime  169623 non-null  int64  
 12  image           181 non-null     object 
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 15.7+ MB


In [40]:
dataset_pd.head()

Unnamed: 0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,4,5.0,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [41]:
all_votes = dataset_pd['overall'].unique()

for v in all_votes:
    print(f"percentage of votes for rating {v} : {round(len(dataset_pd[dataset_pd['overall'] == v]) / len(dataset_pd)* 100, 3) } ")

percentage of votes for rating 5.0 : 79.992 
percentage of votes for rating 4.0 : 13.643 
percentage of votes for rating 2.0 : 1.068 
percentage of votes for rating 3.0 : 4.004 
percentage of votes for rating 1.0 : 1.292 


### Stratifying the samples to get smaller datasets

In [42]:
stratifier = StratifiedKFold(n_splits= 10 )

In [43]:
# Create a new column in the DataFrame to store the fold number for each row
dataset_pd['fold'] = -1

# Perform stratified sampling and assign fold numbers to each row
for fold_number, (train_indices, test_indices) in enumerate(stratifier.split(dataset_pd, dataset_pd["overall"])):
    dataset_pd.loc[test_indices, 'fold'] = fold_number

In [53]:
for fold in dataset_pd['fold'].unique():
    fold_df:pd.DataFrame = dataset_pd[dataset_pd['fold'] == fold]
    
    fold_df.to_json(f"../data/amazon-digital-music-raw-fold-{fold}.json",orient="records", lines = True, )

In [54]:
fold1 = pd.read_json("../data/amazon-digital-music-raw-fold-1.json", lines=True)

In [55]:
fold1

Unnamed: 0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,fold
0,16585,5,,True,"11 24, 2013",AVP1NL6GYMVR,B00123M8ZY,,RDSWY4,The first time I ever heard this song was as a...,Must Have Classic,1385251200,,1
1,16586,5,,True,"08 21, 2013",A3NVWWDF437JBD,B00123M8ZY,,Obsessive Cat Disorder,This is just a gorgeous ballad. Was used in Pl...,Beautiful,1377043200,,1
2,16587,5,,True,"08 12, 2013",A1NADGMAR6GQW1,B00123M8ZY,,Coolbob425,At the age of 66 and widowed I have been going...,CD for Girlfriend,1376265600,,1
3,16588,5,,True,"08 9, 2013",A21X3U0ELWDPPQ,B00123M8ZY,,Don Kennedy,This is a rather old song (1950s) which has be...,First Time,1376006400,,1
4,16589,5,,True,"03 26, 2015",A32VFN0E3238Z0,B00123LUXA,{'Format:': ' MP3 Music'},Jerry Colvard,love oldies,Five Stars,1427328000,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16958,54526,2,,True,"12 24, 2015",A2J1U7GCS0KL7A,B001E9ULJ6,{'Format:': ' MP3 Music'},Cara,Not my favorite.,Two Stars,1450915200,,1
16959,54532,2,,True,"04 5, 2015",A2JMRUN0QULNLI,B001E9ULJ6,{'Format:': ' MP3 Music'},Indy Mog,It's alright,Two Stars,1428192000,,1
16960,54812,2,,True,"11 1, 2013",A3RVBA621G3W6I,B001ESDIQ0,,Joseph McGuinness (Ray Ray&#039;s Records),Tha Carter III Volume 1 Vinyl only has 8 songs...,Tha Carter III Volume 1 Vinyl,1383264000,,1
16961,54839,2,,False,"06 10, 2008",A3PCTD8QM1BIXI,B001ESDIQ0,,Derrick Dunn,"After numerous delays and lots of hype, Lil Wa...",Over Hyped And A Waste Of Time,1213056000,,1


In [56]:
all_votes = fold1['overall'].unique()

for v in all_votes:
    print(f"percentage of votes for rating {v} : {round(len(fold1[fold1['overall'] == v]) / len(fold1)* 100, 3) } ")

percentage of votes for rating 5 : 79.992 
percentage of votes for rating 4 : 13.641 
percentage of votes for rating 3 : 4.003 
percentage of votes for rating 1 : 1.297 
percentage of votes for rating 2 : 1.067 


## News categories dataset