In [5]:
import collections
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from argparse import Namespace

In [6]:
# Notebook-wide settings, collected on a single namespace object.
args = Namespace()

# Input / output locations.
args.raw_train_dataset_csv = "../data/yelp/raw_train.csv"
args.raw_test_dataset_csv = "../data/yelp/raw_test.csv"

# Proportions of the raw training file used for the train and validation splits.
args.train_proportion = 0.7
args.val_proportion = 0.3

args.output_munged_csv = "../data/yelp/reviews_with_splits_full.csv"

# Random seed.
args.seed = 1337

# Preprocess data

In [7]:
# Load the raw train and test CSVs. The files have no header row, so the two
# columns are named explicitly; rows whose review text is missing are dropped.
train_reviews = (
    pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])
    .dropna(subset=['review'])
)
test_reviews = (
    pd.read_csv(args.raw_test_dataset_csv, header=None, names=['rating', 'review'])
    .dropna(subset=['review'])
)

In [8]:
# Hold out `args.val_proportion` of the training rows for validation.
# `random_state=args.seed` pins the shuffle so the train/val assignment is the
# same on every Restart & Run All; without it the split changed on each run.
train_idx, val_idx = train_test_split(
    train_reviews.index,
    test_size=args.val_proportion,
    random_state=args.seed,
)

In [10]:
# Tag every row with the partition it belongs to: rows of the training file are
# labelled by the index sets from the train/val split, and all rows of the test
# file are labelled 'test'.
train_reviews.loc[val_idx, 'split'] = 'val'
train_reviews.loc[train_idx, 'split'] = 'train'
test_reviews['split'] = 'test'

# Stack both frames into a single table. Each frame keeps its own index labels,
# so labels may repeat between the train/val rows and the test rows.
final_reviews = pd.concat((train_reviews, test_reviews), axis=0)

In [11]:
# Preview the first five rows of the combined table.
final_reviews.head(n=5)

Unnamed: 0,rating,review,split
0,1,"Unfortunately, the frustration of being Dr. Go...",train
1,2,Been going to Dr. Goldberg for over 10 years. ...,train
2,1,I don't know what Dr. Goldberg was like before...,train
3,1,I'm writing this review to give you a heads up...,train
4,2,All the food is great here. But the best thing...,val


In [12]:
# Number of rows in each split.
final_reviews['split'].value_counts()

train    392000
val      168000
test      38000
Name: split, dtype: int64

In [18]:
# Preprocess the reviews using regular expressions (the `re` module).

def preprocess_text(text):
    """Normalize one review string.

    The text is lower-cased, each of the punctuation marks ``. , ! ?`` is
    padded with spaces so it stands apart from the neighbouring words, and
    every run of characters other than ASCII letters and those four marks
    (digits, apostrophes, whitespace, ...) is collapsed into a single space.

    Args:
        text: The raw review text.

    Returns:
        The normalized text. Leading and trailing spaces are not stripped.

    Raises:
        TypeError: If ``text`` is not a ``str`` (e.g. a NaN float produced by
            a missing CSV cell).
    """
    if not isinstance(text, str):
        # Fail with the offending value in the message instead of printing it
        # and then crashing with an AttributeError on `.lower()`.
        raise TypeError(
            f"preprocess_text expected a str, got {type(text).__name__}: {text!r}"
        )
    text = text.lower()
    # Surround sentence punctuation with spaces.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # Replace every run of other non-letter characters with one space.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
# Normalize every review in place of the raw text.
final_reviews['review'] = final_reviews['review'].apply(preprocess_text)

In [19]:
# Replace the numeric rating codes with text labels. `dict.get` returns None
# for any value that is not a key of the mapping.
rating_labels = {1: 'negative', 2: 'positive'}
final_reviews['rating'] = final_reviews['rating'].apply(rating_labels.get)

In [20]:
# Preview the first five rows after text normalization and rating relabelling.
final_reviews.head(n=5)

Unnamed: 0,rating,review,split
0,negative,"unfortunately , the frustration of being dr . ...",train
1,positive,been going to dr . goldberg for over years . i...,train
2,negative,i don t know what dr . goldberg was like befor...,train
3,negative,i m writing this review to give you a heads up...,train
4,positive,all the food is great here . but the best thin...,val


In [21]:
# Write the processed table to disk; the index is not written as a column.
final_reviews.to_csv(path_or_buf=args.output_munged_csv, index=False)