In [1]:
from pathlib import Path
from tqdm import tqdm

import sys
import csv
import numpy as np
import pandas as pd
import dask.dataframe as dd

# Gzipped source CSV and the three CSV splits written from it, all under data/.
in_file = Path('data', 'cleaned_news_data.csv.gz')
train_file = Path('data', 'train_news_data.csv')
valid_file = Path('data', 'valid_news_data.csv')
test_file = Path('data', 'test_news_data.csv')

# Rows handed to pandas per chunk when the input is read in pieces.
chunksize = 10 ** 5

In [2]:
# https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
#
# Raise the csv module's per-field size limit as far as the platform allows.
# Start from sys.maxsize; whenever csv rejects the value with OverflowError,
# shrink it by a factor of ten and try again until a value is accepted.

maxInt = sys.maxsize
decrement = True

while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # still too large for this platform: retry with a tenth of the value
        maxInt = int(maxInt/10)
    else:
        # limit accepted, stop retrying
        decrement = False

In [3]:
# Stream the gzipped input in chunks and split every chunk three ways:
#   * rows whose 'category' is missing -> test set, written without that column
#   * rows with no missing value at all -> shuffled, then 80 % train / 20 % valid
# The first chunk (re)creates each output file with a header row and later chunks
# append to it, so re-running this cell rebuilds the files instead of appending
# a second copy of the data to files left over from a previous run.
split_seed = 42  # fixed seed so the shuffle, and therefore the split, is repeatable
rng = np.random.RandomState(split_seed)

reader = pd.read_csv(in_file, chunksize=chunksize, encoding='utf-8', engine='python', compression='gzip')

try:
    for i, chunk in enumerate(tqdm(reader)):

        # unlabelled rows become the test set; their empty label column is dropped
        # (DataFrame.append no longer exists in pandas 2.x, so select directly)
        test_df = chunk[chunk['category'].isnull()].drop(columns='category')

        # NOTE: dropna() discards rows with a missing value in *any* column,
        # not only 'category'; such rows end up in neither train nor valid.
        labelled = chunk.dropna()

        # train, valid split: shuffle, then cut at 80 % of the remaining rows
        shuffled = labelled.sample(frac=1, random_state=rng).reset_index(drop=True)
        cut = int(.8 * len(shuffled))
        train, valid = shuffled.iloc[:cut], shuffled.iloc[cut:]

        # header + overwrite for the first chunk, plain append afterwards
        first_chunk = i == 0
        for frame, out_file in ((train, train_file), (valid, valid_file), (test_df, test_file)):
            frame.to_csv(out_file, index=False,
                         mode='w' if first_chunk else 'a',
                         header=first_chunk)
finally:
    # release the input file handle even when a chunk fails part-way through
    reader.close()

86it [17:31, 11.79s/it]
