In [1]:
import pandas as pd
import numpy as np
import re
import collections

In [4]:
train = pd.read_csv("../data/raw/train.csv", header=None, names=["ratings", "reviews"])
test = pd.read_csv("../data/raw/test.csv", header=None, names=["ratings", "reviews"])

In [5]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"([.!?,])", r" \1 ", text)
    text = re.sub(r"[^A-Za-z?,.!]+", r" ", text)
    return text

In [6]:
by_rating = collections.defaultdict(list)
for _, row in train.iterrows():
    by_rating[row.ratings].append(row.to_dict())

In [7]:
train_size = 5000
val_size = 500
test_size = 1000
seed = 1337

np.random.seed(seed)

final_list = []

# First, split the data into training and validation sets
for _, item_list in by_rating.items():
    np.random.shuffle(item_list)
    
    n_total = len(item_list)
    
    if n_total > train_size + val_size:
        n_train = min(train_size, n_total)
        n_val = min(val_size, n_total - n_train)
    else:
        n_train = min(train_size, n_total)
        n_val = min(val_size, max(0, n_total - n_train))
    
    # Assign splits
    for item in item_list[:n_train]:
        item["split"] = "train"
    
    for item in item_list[n_train:n_train + n_val]:
        item["split"] = "val"
    
    final_list.extend(item_list[:n_train + n_val])

# Sampling test data
test_list = []
for _, row in test.iterrows():
    if len(test_list) < test_size:
        row_dict = row.to_dict()
        row_dict['split'] = 'test'
        test_list.append(row_dict)

final_list.extend(test_list[:test_size])

# Create DataFrame
final_reviews = pd.DataFrame(final_list)


In [8]:
final_reviews.split.value_counts()

split
train    10000
val       1000
test      1000
Name: count, dtype: int64

In [9]:
final_reviews.reviews = final_reviews.reviews.apply(preprocess_text)

In [10]:
final_reviews.ratings = final_reviews.ratings.apply({1: 'negative', 2: 'positive'}.get)

In [11]:
final_reviews.head()

Unnamed: 0,ratings,reviews,split
0,negative,the entrance was the impressive thing about th...,train
1,negative,"i m a mclover , and i had no problem nwith the...",train
2,negative,"less than good here , not terrible , but i see...",train
3,negative,i don t know if i can ever bring myself to go ...,train
4,negative,food was ok good but the service was terrible ...,train


In [13]:
final_reviews.to_csv("../data/processed/test_data_processed.csv", index=False)