In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_json('../data/Musical_Instruments_5.json', lines = True)

### Define Target (Positive, Neutral, Negative) based on Overall Rating

- positive for 4 and 5 star overall rating
- neutral for 3 star rating
- negative for 1 and 2 star overall rating

- Target is not distributed uniformly. 
- we will use upsampling method to upsample minority classes 

In [3]:
df['target']=df['overall'].apply(lambda x: 'Positive' if x>=4 else 'Neutral' if x==3 else 'Negative')

In [4]:
df['target'].value_counts()

Positive    9022
Neutral      772
Negative     467
Name: target, dtype: int64

### Train test split

In [5]:
from sklearn.model_selection import train_test_split
train, test=train_test_split(df, test_size=0.3, random_state=42)

### verify train/test sets have all classes present

In [6]:
train.target.value_counts() 

Positive    6311
Neutral      543
Negative     328
Name: target, dtype: int64

In [7]:
test.target.value_counts()

Positive    2711
Neutral      229
Negative     139
Name: target, dtype: int64

## Upsampling in Training Set (leave test set untouched)

In [9]:
pos=train[train['target']=='Positive']
neg=train[train['target']=='Negative']
nue=train[train['target']=='Neutral']

In [14]:
neg_new=neg.sample(n=pos.shape[0],replace=True, random_state=123)
nue_new=nue.sample(n=pos.shape[0], replace=True, random_state=123)

In [15]:
train_balanced=pd.concat([pos,neg_new,nue_new])

In [19]:
train_balanced.target.value_counts()

Positive    6311
Negative    6311
Neutral     6311
Name: target, dtype: int64

In [20]:
train_balanced.to_csv('../data/train_balanced.csv',index=False)

In [22]:
test.to_csv('../data/test.csv', index=False)