<a href="https://colab.research.google.com/github/mori8/NLP-Pytorch-practice/blob/main/ch3_yelp_dataset_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import collections
from argparse import Namespace

import numpy as n  # original alias, kept as-is; no visible cell references `n`
import numpy as np  # the alias the cells below actually use (np.random.seed / np.random.shuffle)
import pandas as pd
import regex as re
import torch
import torch.nn as nn
import torch.nn.functional as F

In [35]:
# Notebook-wide configuration: file locations, sampling ratios, and the RNG seed.
args = Namespace(
    # Input CSVs (raw train / test) and the output CSV written by the last cell.
    raw_train_dataset_csv="/content/drive/MyDrive/NLP-Pytorch/data/yelp/raw_train.csv",
    raw_test_dataset_csv="/content/drive/MyDrive/NLP-Pytorch/data/yelp/raw_test.csv",
    output_munged_csv="/content/drive/MyDrive/NLP-Pytorch/data/yelp/reviews_with_splits_lite.csv",
    # Fraction of each rating class kept when building the subset.
    proportion_subset_of_train=0.1,
    # Fractions of the subset assigned to the train / val / test splits.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Seed handed to np.random.seed before the per-class shuffles.
    seed=1337,
)

In [24]:
# Load the raw training CSV. The file has no header row, so the two columns
# are named explicitly.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    names=['rating', 'review'],
    header=None,
)
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [25]:
# Build a subset that keeps the same fraction of every rating class.
#
# For each rating (visited in ascending order, since groupby sorts its keys by
# default) the first int(proportion_subset_of_train * class_size) rows are kept
# in their original file order. This yields the same rows, in the same order,
# as collecting every row into per-rating lists with iterrows(), but avoids
# materialising a Python dict per row of the full training file.
subset_parts = []
for _, rating_rows in train_reviews.groupby('rating'):
  n_subset = int(args.proportion_subset_of_train * len(rating_rows))
  subset_parts.append(rating_rows.head(n_subset))

# pd.concat raises on an empty list, so fall back to an empty frame with the
# same columns when the input has no rows.
if subset_parts:
  review_subset = pd.concat(subset_parts, ignore_index=True)
else:
  review_subset = train_reviews.head(0)

In [26]:
# Peek at the subset; every rating class contributed the same fraction of its rows.
review_subset.head(5)

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [27]:
# Number of raw training reviews per rating value.
train_reviews['rating'].value_counts()

2    280000
1    280000
Name: rating, dtype: int64

In [28]:
# Distinct rating classes present in the subset.
set(review_subset['rating'])

{1, 2}

### 훈련, 검증, 테스트 세트 만들기

In [29]:
# Create stratified train / val / test splits: each rating class is shuffled
# and partitioned on its own, so every split keeps the class balance.

# The three proportions must cover the whole class; otherwise some rows would
# end up in the output without any split label.
proportion_sum = args.train_proportion + args.val_proportion + args.test_proportion
if abs(proportion_sum - 1.0) > 1e-9:
  raise ValueError(
      f"train/val/test proportions must sum to 1, got {proportion_sum}")

final_list = []
np.random.seed(args.seed)  # seed once so the per-class shuffles are reproducible

# groupby visits the ratings in ascending order and keeps row order inside each
# group, so the shuffles consume the RNG in the same sequence as a sorted
# per-rating dict of row lists would.
for _, rating_rows in review_subset.groupby('rating'):
  records = rating_rows.to_dict('records')
  np.random.shuffle(records)

  n_total = len(records)
  n_train = int(args.train_proportion * n_total)
  n_val = int(args.val_proportion * n_total)
  # int() truncates, so train + val + int(test share) can fall short of n_total.
  # Give the test split whatever remains so that every row receives a label.
  n_test = n_total - n_train - n_val

  split_labels = ['train'] * n_train + ['val'] * n_val + ['test'] * n_test
  for record, split_label in zip(records, split_labels):
    record['split'] = split_label

  final_list.extend(records)

final_reviews = pd.DataFrame(final_list)

In [30]:
# Number of reviews assigned to each split.
final_reviews['split'].value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

### 데이터 정제 작업
텍스트를 소문자로 바꾼 뒤 구두점 기호(`.`, `,`, `!`, `?`) 앞뒤에 공백을 넣고, 알파벳과 이 구두점 기호를 제외한 모든 문자(숫자 포함)는 공백으로 바꾼다.

In [31]:
def preprocess_text(text):
  """Normalize a review string for whitespace tokenization.

  The text is lower-cased, each of the punctuation marks . , ! ? is padded
  with a space on both sides, and every run of characters other than ASCII
  letters and those four marks (digits, apostrophes, whitespace, ...) is
  replaced by a single space.

  Args:
    text: the raw review string.

  Returns:
    The normalized string. It is not stripped, so it may start or end with
    a space.
  """
  lowered = text.lower()
  padded = re.sub(r"([.,!?])", r" \1 ", lowered)
  return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)

# Clean every review in place with preprocess_text.
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)

In [32]:
# Replace the numeric rating with a text label. Values missing from the
# mapping become None, so running this cell a second time would blank the column.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = final_reviews['rating'].apply(rating_labels.get)

In [33]:
# Inspect the cleaned, labelled, split-annotated reviews.
final_reviews.head(5)

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [36]:
# Write the processed reviews to the configured output CSV, without the index column.
final_reviews.to_csv(path_or_buf=args.output_munged_csv, index=False)