In [1]:
import os
import re
import numpy as np
import pandas as pd
from pathlib import Path
from imdb import Cinemagoer

In [2]:
# Base directory used by this notebook: the parent of the current working
# directory (despite its name, CWD is not the working directory itself).
CWD = Path.cwd().parent

# Relative locations of the raw dataset and of the generated CSV files.
DATA_DIR = Path('data/aclimdb/')
OUTPUT_LOC = Path('data/imdb_review')

# Review file names: two digit groups separated by "_" and followed by ".txt".
REVIEW_FILE_PATTERN = r"(\d+)_(\d+)\.txt"
# User-comments URL; the single capture group is the digits after "tt".
IMDB_URL_PATTERN = r"http://www.imdb.com/title/tt(\d+)/usercomments"

In [3]:
# Anchor both locations at the directory stored in CWD. Both right-hand sides
# are evaluated before either name is rebound.
DATA_DIR, OUTPUT_LOC = CWD / DATA_DIR, CWD / OUTPUT_LOC

In [4]:
imdb_url_pattern = re.compile(IMDB_URL_PATTERN)

def get_imdb_urls(url_list_file: Path) -> list:
    """Extract the title id from every URL listed in ``url_list_file``.

    The file is read as text, one URL per line. Blank lines are skipped.
    For each remaining line the digits captured by ``imdb_url_pattern``
    (the part following ``tt``) are collected, in file order.

    Args:
        url_list_file: path of the text file holding the URLs.

    Returns:
        A list of id strings, one per non-blank line.

    Raises:
        ValueError: if a non-blank line does not match ``imdb_url_pattern``.
    """
    movie_ids = []
    # Plain line iteration is used instead of np.genfromtxt so that a '#'
    # inside a URL is not treated as a comment marker and a file containing
    # a single line still yields a list rather than a bare string.
    with open(url_list_file, encoding='utf-8') as url_file:
        for line_no, line in enumerate(url_file, start=1):
            url = line.strip()
            if not url:
                continue
            match = imdb_url_pattern.search(url)
            if match is None:
                raise ValueError(
                    f"{url_list_file}:{line_no}: line does not match the "
                    f"expected IMDb URL pattern: {url!r}"
                )
            movie_ids.append(match.group(1))
    return movie_ids

In [5]:
comment_file_pattern = re.compile(REVIEW_FILE_PATTERN)

def get_sentiment_data(sentiment: str, data_path: Path) -> pd.DataFrame:
    """Load all reviews of one sentiment class into a DataFrame.

    Movie ids are read from ``data_path / 'urls_<sentiment>.txt'`` and the
    review texts from the files in ``data_path / sentiment`` whose names
    match ``comment_file_pattern`` (``<id>_<rating>.txt``). Files with any
    other name are ignored.

    Args:
        sentiment: name of the sub-directory to read; ``'pos'`` is labelled
            ``+1``, any other value ``-1``.
        data_path: directory containing the URL list and the sentiment
            sub-directory.

    Returns:
        A DataFrame indexed by review id (index name ``'ID'``), sorted by id,
        with the columns ``'Movie Id'``, ``'Rating'``, ``'Review'`` and
        ``'Sentiment'``. ``'Rating'`` is the digit string taken from the
        file name.

    Raises:
        IndexError: if a review id has no corresponding entry in the URL list.
    """
    comment_url_id = get_imdb_urls(data_path / f'urls_{sentiment}.txt')
    review_dir = data_path / sentiment
    label = +1 if sentiment == 'pos' else -1

    # Rows are gathered in a dict keyed by review id and turned into a frame
    # once at the end; growing a DataFrame row by row is quadratic.
    rows = {}
    # Sorted so the result does not depend on the order os.listdir returns.
    for comment_file in sorted(os.listdir(review_dir)):
        match = comment_file_pattern.fullmatch(comment_file)
        if match is None:
            # Not a review file (e.g. an OS metadata file) - skip it.
            continue
        review_id = int(match.group(1))
        rating = match.group(2)

        # Explicit encoding so the result does not depend on the platform default.
        with open(review_dir / comment_file, encoding='utf-8') as f:
            comment = f.read()

        # NOTE(review): assumes entry i (0-based) of the URL list belongs to
        # review id i - confirm against the dataset's README.
        rows[review_id] = [comment_url_id[review_id], rating, comment, label]

    ordered_ids = sorted(rows)
    return pd.DataFrame(
        [rows[review_id] for review_id in ordered_ids],
        columns=['Movie Id', 'Rating', 'Review', 'Sentiment'],
        index=pd.Index(ordered_ids, name='ID'),
    )

In [6]:
def load_data(data_type: str):
    """Return the reviews found under ``DATA_DIR / data_type`` as one DataFrame.

    The negative reviews are loaded first, then the positive ones, and the
    two frames are concatenated in that order.
    """
    split_dir = DATA_DIR / data_type
    frames = [get_sentiment_data(label, split_dir) for label in ('neg', 'pos')]
    return pd.concat(frames)

In [7]:
# Load both dataset splits; 'train' is read before 'test'.
train_data, test_data = load_data('train'), load_data('test')

In [8]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
OUTPUT_LOC.mkdir(parents=True, exist_ok=True)

# Write each split to its own CSV file; the 'ID' index is written as the first column.
train_data.to_csv(OUTPUT_LOC / 'train.csv')
test_data.to_csv(OUTPUT_LOC / 'test.csv')