In [1]:
import json
import os
import sys

In [2]:
import pandas as pd
import numpy as np

In [3]:
sys.path.append("..")


In [4]:
from sklearn.model_selection import StratifiedKFold

## Amazon music dataset

In [5]:
dataset = []
with open("../data/amazon-digital-music-raw.json", 'r') as file:
    for line in file.readlines():
        dataset.append(json.loads(line))

In [6]:
dataset[2]

{'overall': 5.0,
 'verified': True,
 'reviewTime': '02 11, 2014',
 'reviewerID': 'A2VAMODP8M77NG',
 'asin': '3426958910',
 'style': {'Format:': ' Audio CD'},
 'reviewerName': 'JTGabq',
 'reviewText': 'It was great to hear the old stuff again and I like the new stuff too. I recommend it to any Slayer fan.',
 'summary': 'SLAYER!!!!!!!!!!!!!!!!!!!!!',
 'unixReviewTime': 1392076800}

In [7]:
dataset_pd = pd.DataFrame(dataset)

In [8]:
dataset_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169781 entries, 0 to 169780
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         169781 non-null  float64
 1   vote            7611 non-null    object 
 2   verified        169781 non-null  bool   
 3   reviewTime      169781 non-null  object 
 4   reviewerID      169781 non-null  object 
 5   asin            169781 non-null  object 
 6   style           157989 non-null  object 
 7   reviewerName    169776 non-null  object 
 8   reviewText      169623 non-null  object 
 9   summary         169745 non-null  object 
 10  unixReviewTime  169781 non-null  int64  
 11  image           182 non-null     object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 14.4+ MB


In [9]:
dataset_pd.isna().sum()

overall                0
vote              162170
verified               0
reviewTime             0
reviewerID             0
asin                   0
style              11792
reviewerName           5
reviewText           158
summary               36
unixReviewTime         0
image             169599
dtype: int64

In [10]:
dataset_pd = dataset_pd.dropna(subset=["reviewText"])
dataset_pd = dataset_pd.reset_index()

In [11]:
dataset_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169623 entries, 0 to 169622
Data columns (total 13 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           169623 non-null  int64  
 1   overall         169623 non-null  float64
 2   vote            7611 non-null    object 
 3   verified        169623 non-null  bool   
 4   reviewTime      169623 non-null  object 
 5   reviewerID      169623 non-null  object 
 6   asin            169623 non-null  object 
 7   style           157835 non-null  object 
 8   reviewerName    169618 non-null  object 
 9   reviewText      169623 non-null  object 
 10  summary         169611 non-null  object 
 11  unixReviewTime  169623 non-null  int64  
 12  image           181 non-null     object 
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 15.7+ MB


In [12]:
dataset_pd.head()

Unnamed: 0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,0,5.0,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,1,5.0,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,2,5.0,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,3,4.0,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,4,5.0,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [13]:
all_votes = dataset_pd['overall'].unique()

for v in all_votes:
    print(f"percentage of votes for rating {v} : {round(len(dataset_pd[dataset_pd['overall'] == v]) / len(dataset_pd)* 100, 3) } ")

percentage of votes for rating 5.0 : 79.992 
percentage of votes for rating 4.0 : 13.643 
percentage of votes for rating 2.0 : 1.068 
percentage of votes for rating 3.0 : 4.004 
percentage of votes for rating 1.0 : 1.292 


### Stratifying the samples to get smaller datasets

In [30]:
# Load in the training data for stratification

dataset_pd = pd.read_json("../data/amazon_training.json")

In [31]:
dataset_pd

Unnamed: 0,text,classification
0,This is not my genre of music. So my review i...,3
1,"Well done Mary I love this song, I have always...",5
2,ok,5
3,Love this song.,5
4,Great music! Loved IT! Thanks A Lot!,5
...,...,...
152656,"Don't get me wrong, I really like Tracy Chapma...",3
152657,I liked this song.,4
152658,People like it in the clubs in the USA,5
152659,I love this song but it would not play in my p...,5


In [32]:
stratifier = StratifiedKFold(n_splits= 10 )

In [33]:
# Create a new column in the DataFrame to store the fold number for each row
dataset_pd['fold'] = -1

# Perform stratified sampling and assign fold numbers to each row
for fold_number, (train_indices, test_indices) in enumerate(stratifier.split(dataset_pd, dataset_pd["classification"])):
    dataset_pd.loc[test_indices, 'fold'] = fold_number

In [34]:
for fold in dataset_pd['fold'].unique():
    fold_df:pd.DataFrame = dataset_pd[dataset_pd['fold'] == fold]
    
    fold_df.to_json(f"../data/amazon-digital-music-raw-fold-{fold}.json",orient="records", lines = True, )

In [54]:
fold1 = pd.read_json("../data/amazon-digital-music-raw-fold-1.json", lines=True)

In [55]:
fold1

Unnamed: 0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image,fold
0,16585,5,,True,"11 24, 2013",AVP1NL6GYMVR,B00123M8ZY,,RDSWY4,The first time I ever heard this song was as a...,Must Have Classic,1385251200,,1
1,16586,5,,True,"08 21, 2013",A3NVWWDF437JBD,B00123M8ZY,,Obsessive Cat Disorder,This is just a gorgeous ballad. Was used in Pl...,Beautiful,1377043200,,1
2,16587,5,,True,"08 12, 2013",A1NADGMAR6GQW1,B00123M8ZY,,Coolbob425,At the age of 66 and widowed I have been going...,CD for Girlfriend,1376265600,,1
3,16588,5,,True,"08 9, 2013",A21X3U0ELWDPPQ,B00123M8ZY,,Don Kennedy,This is a rather old song (1950s) which has be...,First Time,1376006400,,1
4,16589,5,,True,"03 26, 2015",A32VFN0E3238Z0,B00123LUXA,{'Format:': ' MP3 Music'},Jerry Colvard,love oldies,Five Stars,1427328000,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16958,54526,2,,True,"12 24, 2015",A2J1U7GCS0KL7A,B001E9ULJ6,{'Format:': ' MP3 Music'},Cara,Not my favorite.,Two Stars,1450915200,,1
16959,54532,2,,True,"04 5, 2015",A2JMRUN0QULNLI,B001E9ULJ6,{'Format:': ' MP3 Music'},Indy Mog,It's alright,Two Stars,1428192000,,1
16960,54812,2,,True,"11 1, 2013",A3RVBA621G3W6I,B001ESDIQ0,,Joseph McGuinness (Ray Ray&#039;s Records),Tha Carter III Volume 1 Vinyl only has 8 songs...,Tha Carter III Volume 1 Vinyl,1383264000,,1
16961,54839,2,,False,"06 10, 2008",A3PCTD8QM1BIXI,B001ESDIQ0,,Derrick Dunn,"After numerous delays and lots of hype, Lil Wa...",Over Hyped And A Waste Of Time,1213056000,,1


In [56]:
all_votes = fold1['overall'].unique()

for v in all_votes:
    print(f"percentage of votes for rating {v} : {round(len(fold1[fold1['overall'] == v]) / len(fold1)* 100, 3) } ")

percentage of votes for rating 5 : 79.992 
percentage of votes for rating 4 : 13.641 
percentage of votes for rating 3 : 4.003 
percentage of votes for rating 1 : 1.297 
percentage of votes for rating 2 : 1.067 


In [47]:
amazon_test_data = pd.read_json("../data/amazon_test.json")

In [50]:
amazon_test_data.to_json("../data/amazon_test.json",orient = "records", lines=True)

## News categories dataset

In [17]:
news_data = pd.read_json("../data/news-categories-raw.json", lines=True)

In [18]:
news_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   link               209527 non-null  object        
 1   headline           209527 non-null  object        
 2   category           209527 non-null  object        
 3   short_description  209527 non-null  object        
 4   authors            209527 non-null  object        
 5   date               209527 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 9.6+ MB


In [19]:
news_data.isna().sum()

link                 0
headline             0
category             0
short_description    0
authors              0
date                 0
dtype: int64

In [20]:
news_data.value_counts(subset=['category'], normalize=True, sort=True, ascending=False) *100

category      
POLITICS          16.991605
WELLNESS           8.564529
ENTERTAINMENT      8.286283
TRAVEL             4.724928
STYLE & BEAUTY     4.683883
PARENTING          4.195641
HEALTHY LIVING     3.194815
QUEER VOICES       3.029204
FOOD & DRINK       3.025863
BUSINESS           2.859775
COMEDY             2.577233
SPORTS             2.423077
BLACK VOICES       2.187308
HOME & LIVING      2.061787
PARENTS            1.887585
THE WORLDPOST      1.748701
WEDDINGS           1.743451
WOMEN              1.704792
CRIME              1.700020
IMPACT             1.662793
DIVORCE            1.635111
WORLD NEWS         1.574499
MEDIA              1.405070
WEIRD NEWS         1.325366
GREEN              1.251390
WORLDPOST          1.230868
RELIGION           1.229913
STYLE              1.075756
SCIENCE            1.052848
TECH               1.004167
TASTE              1.000348
MONEY              0.838078
ARTS               0.720194
ENVIRONMENT        0.689171
FIFTY              0.668649
GOOD 

In [35]:
dataset_pd = pd.read_json("../data/news_training.json")

In [37]:
stratifier = StratifiedKFold(n_splits= 10)

In [39]:
# Create a new column in the DataFrame to store the fold number for each row
dataset_pd['fold'] = -1

# Perform stratified sampling and assign fold numbers to each row
for fold_number, (train_indices, test_indices) in enumerate(stratifier.split(dataset_pd, dataset_pd["classification"])):
    dataset_pd.loc[test_indices, 'fold'] = fold_number

In [40]:
for fold in dataset_pd['fold'].unique():
    fold_df:pd.DataFrame = dataset_pd[dataset_pd['fold'] == fold]
    
    fold_df.to_json(f"../data/news-categories-raw-fold-{fold}.json",orient="records", lines = True, )

In [42]:
t = pd.read_json("../data/news-categories-raw-fold-3.json", lines = True)

In [45]:
t['classification'].value_counts(normalize=True) * 100

POLITICS          17.006045
WELLNESS           8.569307
ENTERTAINMENT      8.272351
TRAVEL             4.730088
STYLE & BEAUTY     4.698271
PARENTING          4.205112
HEALTHY LIVING     3.186976
FOOD & DRINK       3.043801
QUEER VOICES       3.027893
BUSINESS           2.842295
COMEDY             2.587761
SPORTS             2.428677
BLACK VOICES       2.179446
HOME & LIVING      2.052179
PARENTS            1.887793
THE WORLDPOST      1.749920
WEDDINGS           1.734012
CRIME              1.702195
WOMEN              1.686287
IMPACT             1.659773
DIVORCE            1.633259
WORLD NEWS         1.574928
MEDIA              1.394634
WEIRD NEWS         1.325697
GREEN              1.272669
RELIGION           1.235550
WORLDPOST          1.230247
STYLE              1.081769
SCIENCE            1.049952
TECH               1.002227
TASTE              0.996924
MONEY              0.827235
ARTS               0.710574
ENVIRONMENT        0.684060
GOOD NEWS          0.673454
FIFTY              0

In [46]:
t

Unnamed: 0,text,classification,fold
0,"Hostess Brands Inc., the makers of Twinkies, D...",MONEY,3
1,Just 14 percent of Americans say they are very...,MONEY,3
2,I have concluded that the productive discussio...,COLLEGE,3
3,"CHARLOTTESVILLE, Va. (AP) -- A young woman at ...",COLLEGE,3
4,With an understanding of some common mistakes ...,MONEY,3
...,...,...,...
18853,Do oil companies really care about vulnerable ...,LATINO VOICES,3
18854,Did you hear? That's the sound of the internet...,STYLE,3
18855,Kim Kardashian's second-time-around maternity ...,STYLE,3
18856,"Critics called the video, which toured the hur...",LATINO VOICES,3


In [51]:
news_test_data = pd.read_json("../data/news_test.json")

In [52]:
news_test_data.to_json("../data/news_test.json",orient = "records", lines=True)

## Resampling

In [12]:
from sklearn.utils import resample

In [48]:
amazon = pd.read_json("../data/amazon_training.json")

In [49]:
amazon


Unnamed: 0,text,classification
0,This is not my genre of music. So my review i...,3
1,"Well done Mary I love this song, I have always...",5
2,ok,5
3,Love this song.,5
4,Great music! Loved IT! Thanks A Lot!,5
...,...,...
152656,"Don't get me wrong, I really like Tracy Chapma...",3
152657,I liked this song.,4
152658,People like it in the clubs in the USA,5
152659,I love this song but it would not play in my p...,5


In [50]:
minor = amazon[amazon['classification'].isin([1,2,3])]

In [51]:
down_maj = resample(amazon[amazon['classification'].isin([5])], 
                    replace=True,      # Sample with replacement
                    n_samples=5000,  # Match the majority class size
                    random_state=42)

In [52]:
down_maj2 = resample(amazon[amazon['classification'].isin([4])], 
                    replace=True,      # Sample with replacement
                    n_samples=5000,  # Match the majority class size
                    random_state=42)

In [53]:
down_maj

Unnamed: 0,text,classification
152458,A classic! I don't buy music I don't love!,5
19778,Love this song...takes me back!,5
1047,Great,5
129585,The song says it all.,5
149859,This song brings back memories but it's still ...,5
...,...,...
30165,I can go on and review each song/track... But...,5
16034,Dance!,5
77099,Great! Great! Great! Great! Great! Great! Grea...,5
120801,I like this song,5


In [54]:
df_resamp_amazon = pd.concat([minor, down_maj, down_maj2])

In [55]:
df_resamp_amazon = df_resamp_amazon.sample(frac=1).reset_index()

In [56]:
df_resamp_amazon

Unnamed: 0,index,text,classification
0,111490,Does anyone even know the REAL song??? This is...,2
1,68485,Does it really matter what I think?,3
2,47149,great song,4
3,88067,This is a great song with a great message. Si...,5
4,18180,Maybe the beat is tired to some (it's reworked...,4
...,...,...,...
19759,132725,How can you go wrong with this song? Just make...,4
19760,21392,Great music!!!!!,5
19761,78028,It is a good song,3
19762,115399,its a cool album and James taylor is a Pro...,3


In [57]:
df_resamp_amazon.to_json("../data/amazon_resample_train.json",orient = "records", lines=True)

In [21]:
news = pd.read_json("../data/news_training.json")

In [23]:
news.classification.value_counts()

POLITICS          32076
WELLNESS          16161
ENTERTAINMENT     15602
TRAVEL             8919
STYLE & BEAUTY     8855
PARENTING          7921
HEALTHY LIVING     6014
FOOD & DRINK       5734
QUEER VOICES       5711
BUSINESS           5361
COMEDY             4877
SPORTS             4571
BLACK VOICES       4115
HOME & LIVING      3870
PARENTS            3558
THE WORLDPOST      3307
WEDDINGS           3268
CRIME              3207
WOMEN              3187
IMPACT             3137
DIVORCE            3085
WORLD NEWS         2963
MEDIA              2627
WEIRD NEWS         2497
GREEN              2393
RELIGION           2334
WORLDPOST          2315
STYLE              2045
SCIENCE            1982
TECH               1889
TASTE              1883
MONEY              1561
ARTS               1342
ENVIRONMENT        1297
GOOD NEWS          1273
FIFTY              1253
U.S. NEWS          1231
ARTS & CULTURE     1208
COLLEGE            1037
LATINO VOICES      1023
CULTURE & ARTS      960
EDUCATION       

In [35]:
maj_cols = ["POLITICS","WELLNESS","ENTERTAINMENT","TRAVEL", "STYLE & BEAUTY", "PARENTING"]

In [36]:
maj_df = pd.DataFrame()

In [37]:
for col in maj_cols:
    temp = news[news['classification'] == col]
    temp = temp.sample(n = 7500)
    maj_df = pd.concat([maj_df, temp])

In [38]:
maj_df

Unnamed: 0,text,classification
44457,Through the exposure of Trump and the GOP's am...,POLITICS
164712,These survivors don't want their experiences t...,POLITICS
77828,"""Wanted to avoid using Failing Times New Roman...",POLITICS
168450,"Give us a majority, and we'll show you what th...",POLITICS
19674,Donald Trump is just keeping with a long tradi...,POLITICS
...,...,...
66394,,ENTERTAINMENT
140878,,ENTERTAINMENT
89786,“The Batman” is still going strong three weeks...,ENTERTAINMENT
140598,*Raises eyebrows*,ENTERTAINMENT


In [39]:
min_cols = list(set(news.classification.unique()) - set(maj_cols))

In [40]:
min_cols

['TRAVEL',
 'TASTE',
 'SPORTS',
 'BLACK VOICES',
 'WEDDINGS',
 'IMPACT',
 'WEIRD NEWS',
 'HOME & LIVING',
 'GREEN',
 'GOOD NEWS',
 'PARENTS',
 'STYLE & BEAUTY',
 'FOOD & DRINK',
 'ARTS',
 'WORLDPOST',
 'SCIENCE',
 'BUSINESS',
 'MONEY',
 'U.S. NEWS',
 'MEDIA',
 'LATINO VOICES',
 'TECH',
 'ENVIRONMENT',
 'FIFTY',
 'COMEDY',
 'PARENTING',
 'DIVORCE',
 'QUEER VOICES',
 'HEALTHY LIVING',
 'COLLEGE',
 'RELIGION',
 'EDUCATION',
 'CRIME',
 'WOMEN',
 'STYLE',
 'ARTS & CULTURE',
 'THE WORLDPOST',
 'WORLD NEWS',
 'CULTURE & ARTS']

In [41]:
df_min = news[news['classification'].isin(min_cols)]
df_maj = news[news['classification'].isin(maj_cols)]

In [42]:
news_down = pd.concat([maj_df, df_min]).sample(frac=1).reset_index()

In [43]:
news_down

Unnamed: 0,index,text,classification
0,98556,"The Detroit Free Press reported that Spencer, ...",WELLNESS
1,52116,A State Department foreign service officer sha...,QUEER VOICES
2,41025,Feeling agitated or stressed? Try this countin...,WELLNESS
3,11499,This is the hurdle for parents of kids on the ...,PARENTING
4,36294,Pope Francis addressed a group of immigrants a...,RELIGION
...,...,...,...
147230,150990,"Now at the Gerald Schoenfeld Theater, Broadway...",CULTURE & ARTS
147231,49534,7. You'll get the best seat in the house. Brid...,WEDDINGS
147232,7249,The decision is a move sought by Assad ally Ru...,WORLD NEWS
147233,156463,"World, meet Lorde of the rings.",TASTE


In [44]:
stratifier = StratifiedKFold(n_splits= 10)

In [45]:
# Create a new column in the DataFrame to store the fold number for each row
news_down['fold'] = -1

# Perform stratified sampling and assign fold numbers to each row
for fold_number, (train_indices, test_indices) in enumerate(stratifier.split(news_down, news_down["classification"])):
    news_down.loc[test_indices, 'fold'] = fold_number

In [47]:
for fold in news_down['fold'].unique():
    fold_df:pd.DataFrame = news_down[news_down['fold'] == fold]
    
    fold_df.to_json(f"../data/news-resample-{fold}.json",orient="records", lines = True, )

In [None]:
news_down.to_json("../data/news_resample_train.json",orient = "records", lines=True)

In [33]:
df_maj

Unnamed: 0,text,classification
0,"“The world is watching,” the singer said in a ...",ENTERTAINMENT
2,The president fought for equality for transgen...,POLITICS
3,Want more? Be sure to check out HuffPost Style...,STYLE & BEAUTY
5,"In Boulder, Colorado, and other places where a...",POLITICS
6,The program has cost $624 so far and is being ...,POLITICS
...,...,...
188553,This study is the latest in a growing body of ...,WELLNESS
188554,,ENTERTAINMENT
188563,"""Love Boat"" fans around the world are shedding...",TRAVEL
188567,“They were clearly treated that way ― it’s pre...,POLITICS


In [34]:
df_min

Unnamed: 0,text,classification
1,The conviction of former President Lula da Sil...,WORLD NEWS
4,Dr. Ibrahim (let's just agree to call him Mo) ...,IMPACT
8,"It takes practice, but you can learn to addres...",DIVORCE
9,"People are just mean."" Have you heard that bef...",RELIGION
10,Taking the bus might get a whole lot less terr...,TECH
...,...,...
188568,With the Supreme Court hearing arguments this ...,QUEER VOICES
188570,"Don't make it about Hillary, he says. But also...",MEDIA
188571,"Couch, who killed four people in a 2013 drunk-...",CRIME
188572,,RELIGION
