In [1]:
from pathlib import Path
from tqdm import tqdm

import sys
import csv
import pandas as pd
import dask.dataframe as dd

# Source CSV read by this notebook, and the two CSVs it writes.
in_file = Path('data/cleaned_news_data.csv')
out_file = Path('data/dropped_news_data.csv')
test_file = Path('data/test_news_data.csv')

# Integer label for every category name: the position of the name in the
# ordered tuple below (bias -> 0, ..., unreliable -> 11).
category_dict = {
    name: code
    for code, name in enumerate((
        'bias', 'clickbait', 'conspiracy', 'fake', 'hate', 'junksci',
        'political', 'reliable', 'rumor', 'satire', 'state', 'unreliable',
    ))
}

# Value handed to pd.read_csv(chunksize=...) when streaming the input file.
chunksize = 100_000

In [2]:
# Raise the csv module's per-field size limit as far as the platform allows.
# Approach taken from:
# https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072

maxInt = sys.maxsize
decrement = True

while decrement:
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        # The candidate limit was rejected: shrink it by a factor of 10
        # and try again on the next pass.
        maxInt = int(maxInt/10)
    else:
        # The limit was accepted, so no further shrinking is needed.
        decrement = False

In [3]:
# Split the cleaned news CSV into two files, one chunk at a time:
#   * out_file  - rows kept after dropna(), with 'category' remapped to its
#                 integer code from category_dict;
#   * test_file - rows whose 'category' is missing.
# Both files are rewritten from scratch on every run, so re-executing this
# cell never appends a second copy of the data to files left by an earlier run.
reader = pd.read_csv(in_file, chunksize=chunksize, encoding='utf-8', engine='python')

for i, chunk in enumerate(tqdm(reader)):

    # Rows without a category label are set aside for the test file.
    # (Boolean-mask selection replaces DataFrame.append, which was removed in
    # pandas 2.0.)
    test_df = chunk[chunk['category'].isnull()].reset_index(drop=True)

    # Keep only rows that have no missing value in any column.
    # NOTE(review): this also discards labelled rows with a null in some other
    # column; those rows end up in neither output file - confirm that is intended.
    chunk = chunk.dropna().copy()

    # Remap category names to numbers, failing loudly on a name that
    # category_dict does not know instead of an opaque NaN-to-int cast error.
    codes = chunk['category'].map(category_dict)
    unknown = chunk.loc[codes.isnull(), 'category'].unique().tolist()
    if unknown:
        raise ValueError(f'unmapped category values in chunk {i}: {unknown}')
    chunk['category'] = codes.astype('int32')

    # The first chunk creates/truncates both files and writes the header row;
    # every later chunk is appended without a header.
    is_first = i == 0
    write_mode = 'w' if is_first else 'a'
    chunk.to_csv(out_file, index=False, mode=write_mode, header=is_first)
    test_df.to_csv(test_file, index=False, mode=write_mode, header=is_first)

86it [14:54, 10.01s/it]
