In [1]:
import sys
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [2]:
# Notebook configuration: input/output paths, sampling fractions and RNG seed.
args = Namespace(**{
    # Raw input files.
    "raw_train_dataset_csv": "data/yelp/raw_train.csv",
    "raw_test_dataset_csv": "data/yelp/raw_test.csv",
    # Fraction of each rating class kept from the raw training file.
    "proportion_subset_of_train": 0.1,
    # Fractions of the kept subset assigned to each split.
    "train_proportion": 0.7,
    "val_proportion": 0.15,
    "test_proportion": 0.15,
    # Where the processed reviews (with their split label) are written.
    "output_munged_csv": "data/yelp/reviews_with_splits_lite.csv",
    # Seed passed to np.random.seed before shuffling.
    "seed": 1337,
})

In [3]:
# Load the raw training reviews. The file is read without a header row
# (header=None), so the two column names are supplied here.
train_reviews = pd.read_csv(
    args.raw_train_dataset_csv,
    names=['rating', 'review'],
    header=None,
)

In [4]:
# Preview the first rows of the raw training reviews.
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [21]:
# Down-sample the training reviews: keep the same fraction
# (args.proportion_subset_of_train) of every rating class. Slicing per class,
# rather than over the whole frame, keeps the class proportions of the raw
# data in the smaller subset regardless of how the rows are ordered.
#
# groupby yields the groups in sorted rating order and preserves the original
# row order inside each group, so the result is laid out as
# [first rows of rating 1, first rows of rating 2, ...] without a Python-level
# loop over every row.
subset_frames = []
for _, rating_group in train_reviews.groupby("rating"):
    n_total = len(rating_group)
    n_subset = int(args.proportion_subset_of_train * n_total)
    # Keep the first n_subset reviews of this class (no shuffling here).
    subset_frames.append(rating_group.iloc[:n_subset])

# ignore_index=True gives the subset a fresh 0..N-1 index.
# pd.concat raises on an empty list, so fall back to an empty frame that
# still carries the column names when there are no groups at all.
if subset_frames:
    review_subset = pd.concat(subset_frames, ignore_index=True)
else:
    review_subset = train_reviews.iloc[:0].reset_index(drop=True)

In [22]:
# Preview the first rows of the down-sampled subset.
review_subset.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,1,I don't know what Dr. Goldberg was like before...
2,1,I'm writing this review to give you a heads up...
3,1,Wing sauce is like water. Pretty much a lot of...
4,1,Owning a driving range inside the city limits ...


In [28]:
# Number of raw training reviews per rating value.
train_reviews.rating.value_counts()

1    280000
2    280000
Name: rating, dtype: int64

In [13]:
# Distinct rating classes present in the subset.
set(review_subset.rating)

{1, 2}

In [14]:
# Assign every review of the subset to the train / val / test split.
# The split is done separately for each rating class so that all three splits
# keep the same class proportions.
final_list = []
np.random.seed(args.seed)  # fixed seed -> the shuffles below are reproducible

# True when the three proportions are meant to cover every row. In that case
# the test split absorbs the rows lost to int() truncation; otherwise those
# rows would end up without a 'split' value (NaN in the final frame).
proportions_cover_all = np.isclose(
    args.train_proportion + args.val_proportion + args.test_proportion, 1.0
)

# groupby yields the classes in sorted rating order, rows in original order.
for _, rating_group in review_subset.groupby("rating"):

    # One dict per review, so a 'split' key can be attached to each of them.
    item_list = rating_group.to_dict("records")
    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    if proportions_cover_all:
        n_test = n_total - n_train - n_val
    else:
        # Proportions deliberately sum to something else: rows past the test
        # slice are still appended below, but carry no 'split' value.
        n_test = int(args.test_proportion * n_total)

    # Tag each review with the split it belongs to.
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Collect this class's reviews into the final list.
    final_list.extend(item_list)

In [15]:
# Build a DataFrame from the per-review dicts collected in final_list.
final_reviews = pd.DataFrame(final_list)

In [16]:
# Number of reviews assigned to each split.
final_reviews.split.value_counts()

train    39200
val       8400
test      8400
Name: split, dtype: int64

In [17]:
# Preprocess the review text
def preprocess_text(text):
    """Normalise one review string.

    The text is lower-cased, each of the punctuation marks . , ! ? is padded
    with a space on both sides, and every run of characters other than ASCII
    letters and those four marks (digits, quotes, whitespace, ...) is
    replaced by a single space.

    Args:
        text: the raw review string.

    Returns:
        The normalised string. Leading/trailing spaces are not stripped.
    """
    lowered = text.lower()
    # Surround . , ! ? with spaces so they separate from neighbouring words.
    padded = re.sub(r"([.,!?])", r" \1 ", lowered)
    # Squash everything that is not a letter or one of those marks into one space.
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", padded)
    
# Normalise every review text with preprocess_text, overwriting the column.
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)

In [18]:
# Turn the numeric rating into a sentiment name (1 -> negative, 2 -> positive).
# `replace` leaves values that are not keys of the mapping untouched, so
# running this cell again after the column already holds the names is a no-op.
# Looking the values up with dict.get instead would turn every already-mapped
# name into None on a second run.
final_reviews['rating'] = final_reviews['rating'].replace({1: 'negative', 2: 'positive'})

In [19]:
# Preview the first rows of the processed reviews.
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


In [20]:
# Write the processed reviews to args.output_munged_csv; index=False leaves
# the DataFrame index out of the file.
final_reviews.to_csv(args.output_munged_csv, index=False)