# Data Preprocessing

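Extract each review's text and its score (out of 10) from the source `.docx` document and save them as a CSV file (columns `id`, `X` = review text, `Y` = score) for the downstream notebooks.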

In [15]:
import re
import subprocess
import pandas as pd
from pathlib import Path

## Extract reviews from .docx

In [9]:
# Input document and output CSV both live in the shared data directory,
# addressed relative to the notebook's working directory.
DATA_DIR = Path("../data")
DOCX_PATH = DATA_DIR / "sentiment-analysis-nlp-dataset.docx"
CSV_PATH = DATA_DIR / "reviews_extraidas.csv"

def read_docx(path):
    """Convert a document to plain text by shelling out to ``pandoc``.

    Parameters
    ----------
    path : str or pathlib.Path
        Location of the document to convert; passed to pandoc as ``str(path)``.

    Returns
    -------
    str
        The plain-text rendering pandoc wrote to stdout (``-t plain``).

    Raises
    ------
    subprocess.CalledProcessError
        If pandoc exits with a non-zero status (``check=True``).
    FileNotFoundError
        If the ``pandoc`` executable cannot be found.
    """
    # pandoc always emits UTF-8. Decode it explicitly rather than relying on
    # the locale's preferred encoding, which is what text=True alone uses and
    # which mis-decodes non-ASCII review text on non-UTF-8 systems.
    result = subprocess.run(
        ['pandoc', str(path), '-t', 'plain'],
        capture_output=True,
        text=True,
        encoding='utf-8',
        check=True,
    )
    return result.stdout

def extract_reviews(text):
    """Pull ``(body, score)`` pairs out of plain text.

    A review is recognised as ``<digits>/10`` followed by the shortest run of
    text (newlines included) that ends just before the next ``Helpful``.

    Parameters
    ----------
    text : str
        Plain text to scan.

    Returns
    -------
    list of tuple(str, int)
        One ``(body, score)`` tuple per match, in order of appearance. The
        body is stripped of surrounding whitespace and any run of three or
        more newlines is collapsed to a single blank line.
    """
    review_re = re.compile(r'(\d+)\s*/10\s*(.*?)\s*Helpful', re.DOTALL)
    return [
        (re.sub(r'\n{3,}', '\n\n', found.group(2).strip()), int(found.group(1)))
        for found in review_re.finditer(text)
    ]

# Convert the document to plain text, then parse it into (body, score) pairs.
text = read_docx(DOCX_PATH)
reviews = extract_reviews(text)

# Assemble the table in one step: a 1-based running id, the review body (X)
# and its score (Y).
df = pd.DataFrame(
    {
        'id': range(1, len(reviews) + 1),
        'X': [body for body, _ in reviews],
        'Y': [score for _, score in reviews],
    }
)

print(f"{len(df)} reviews extracted")
# Score distribution, ordered by score, as a quick sanity check.
df['Y'].value_counts().sort_index()

544 reviews extracted


Y
1     111
2      56
3      58
4      50
5      63
6      59
7      55
8      36
9      22
10     34
Name: count, dtype: int64

## Save to CSV

In [10]:
# Persist the extracted reviews without the DataFrame index; the `id` column
# already identifies each row.
df.to_csv(path_or_buf=CSV_PATH, index=False)
print(f"Saved to {CSV_PATH}")

Saved to ../data/reviews_extraidas.csv
