In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import collections
import re
from argparse import Namespace

from pathlib import Path

In [2]:
p = Path.home() / 'exercisebook_large_data' / 'Yelp_data' / 'Yelp'

In [3]:
args = Namespace(
    raw_train_dataset_csv=p / 'raw_train.csv',
    raw_test_dataset_csv=p / 'raw_test.csv',
    proportion_subset_of_train=0.1,
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv=p / 'reviews_with_splits_lite.csv',
    seed=1337
)

In [4]:
# 원본 데이터를 읽습니다
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])

In [5]:
by_rating = collections.defaultdict(list)
for _, row in train_reviews.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [6]:
review_subset = []

In [7]:
for _, item_list in sorted(by_rating.items()):
    n_total = len(item_list)
    n_subset = int(args.proportion_subset_of_train * n_total)
    review_subset.extend(item_list[:n_subset])

In [8]:
review_subset = pd.DataFrame(review_subset)
review_subset

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...
...,...,...
55995,2,I am not really an arts and crafts kind of guy...
55996,2,i absolutely love michael's! i used to scrapbo...
55997,2,The fact that I can generally get whatever I w...
55998,2,I've been frequenting Michaels lately because ...


In [9]:
train_reviews['rating'].value_counts(), review_subset['rating'].value_counts()

(1    280000
 2    280000
 Name: rating, dtype: int64,
 1    28000
 2    28000
 Name: rating, dtype: int64)

In [10]:
by_rating = collections.defaultdict(list)
for _, row in review_subset.iterrows():
    by_rating[row.rating].append(row.to_dict())

In [11]:
final_list = []
np.random.seed(args.seed)

In [12]:
for _, item_list in sorted(by_rating.items()):
    np.random.shuffle(item_list)
    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    n_test = n_total - n_train - n_val

    for item in item_list[:n_train]:
        item['split'] = 'train'
    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'
    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    final_list.extend(item_list)

In [13]:
final_reviews = pd.DataFrame(final_list)
final_reviews

Unnamed: 0,rating,review,split
0,1,Terrible place to work for I just heard a stor...,train
1,1,"3 hours, 15 minutes-- total time for an extrem...",train
2,1,My less than stellar review is for service. ...,train
3,1,I'm granting one star because there's no way t...,train
4,1,The food here is mediocre at best. I went afte...,train
...,...,...,...
55995,2,"Great food. Wonderful, friendly service. I ...",test
55996,2,Charlotte should be the new standard for moder...,test
55997,2,Get the encore sandwich!! Make sure to get it ...,test
55998,2,I'm a pretty big ice cream/gelato fan. Pretty ...,test


In [14]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'([.,!?])', r' \1 ', text)
    text = re.sub(r'[^a-zA-Z.,!?]', r' ', text)
    return text

In [15]:
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)
final_reviews

Unnamed: 0,rating,review,split
0,1,terrible place to work for i just heard a stor...,train
1,1,"hours , minutes total time for an extr...",train
2,1,my less than stellar review is for service . ...,train
3,1,i m granting one star because there s no way t...,train
4,1,the food here is mediocre at best . i went af...,train
...,...,...,...
55995,2,"great food . wonderful , friendly service ....",test
55996,2,charlotte should be the new standard for moder...,test
55997,2,get the encore sandwich ! ! make sure to get...,test
55998,2,i m a pretty big ice cream gelato fan . prett...,test


In [16]:
final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)
final_reviews

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extr...",train
2,negative,my less than stellar review is for service . ...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went af...,train
...,...,...,...
55995,positive,"great food . wonderful , friendly service ....",test
55996,positive,charlotte should be the new standard for moder...,test
55997,positive,get the encore sandwich ! ! make sure to get...,test
55998,positive,i m a pretty big ice cream gelato fan . prett...,test


In [17]:
final_reviews.to_csv(args.output_munged_csv, index=False)