### Preprocessing 

In [1]:
from os import listdir
from os.path import isfile, join

import nltk
import time
import unidecode
import ftfy

import pandas as pd

In [2]:
# Collect every regular file under the two yearly folders. os.listdir returns
# entries in arbitrary order, so the names are sorted to keep the article order
# (and therefore the row order of the CSVs written from article_list)
# reproducible across runs and machines.
files_2013 = ['2013/' + f for f in sorted(listdir('2013/')) if isfile(join('2013/', f))]
files_2014 = ['2014/' + f for f in sorted(listdir('2014/')) if isfile(join('2014/', f))]
all_files = files_2013 + files_2014

# A progress line is printed every PROGRESS_EVERY files.
PROGRESS_EVERY = 70

# article_list holds one entry per file: the list of cleaned sentences in it.
article_list = []
# clean and decode all files
start_time = time.time()
for index, file_name in enumerate(all_files):
    if index % PROGRESS_EVERY == 0:
        print("Percentage Completed: {0:.0%}".format(index / len(all_files)))
    # latin-1 maps every byte to a code point, so decoding cannot fail here;
    # ftfy.fix_text and unidecode are then applied to each sentence to repair
    # the text and transliterate it to ASCII.
    with open(file_name, encoding = "latin-1") as f:
        # Sentences are tokenized line by line, so a sentence never spans
        # a line break in the source file.
        flat_list = [
            unidecode.unidecode(ftfy.fix_text(sentence))
            for line in f
            for sentence in nltk.tokenize.sent_tokenize(line)
        ]
    article_list.append(flat_list)

print("Percentage Completed: 100%")
print("Finished!")
print("Time to Complete:", time.time() - start_time)

Percentage Completed: 0%
Percentage Completed: 10%
Percentage Completed: 19%
Percentage Completed: 29%
Percentage Completed: 38%
Percentage Completed: 48%
Percentage Completed: 58%
Percentage Completed: 67%
Percentage Completed: 77%
Percentage Completed: 86%
Percentage Completed: 96%
Percentage Completed: 100%
Finished!
Time to Complete: 120.12251377105713


In [3]:
# Persist two views of the cleaned corpus as CSV files:
#   articles_sentences.csv - one row per article (column 'Article' holds that
#                            article's list of sentences)
#   all_sentences.csv      - one row per sentence, all articles concatenated
article_to_sentence = pd.DataFrame({'Article': article_list})
article_to_sentence.to_csv("articles_sentences.csv", index=False)

# Flatten the per-article sentence lists into a single list of sentences.
all_articles_in_one = []
for article_sentences in article_list:
    all_articles_in_one.extend(article_sentences)

sentences = pd.DataFrame(all_articles_in_one)
sentences.to_csv("all_sentences.csv", index=False)


### Cleaning Percentages

In [4]:
# Read the file without treating any row as a header and keep the first column
# as a plain list. header=None is the supported way to say "no header row";
# negative values such as -1 are rejected by current pandas with a ValueError.
percentages = pd.read_csv("all/percentage.csv", encoding = "latin-1", header = None)[0].tolist()
# view head
percentages[:5]

['66%', '40%', '90%', '49%', '100%']

In [5]:
# Gather every distinct character that occurs anywhere in the raw percentage
# strings, to spot characters that need cleaning.
all_percentage_text = ''.join(percentages)
unique_characters = list(set(all_percentage_text))
# Unexpected characters noticed during inspection: '"', '*', '?'
unique_characters[:5]

['E', 'S', ',', 'H', '5']

In [6]:
def clean_percentage(percentage_string):
    """Normalize one raw percentage string.

    The text is transliterated with ``unidecode``, lower-cased, stripped of
    the noise characters ``" * ? , ( )`` and finally trimmed of leading and
    trailing whitespace.

    Parameters
    ----------
    percentage_string : str
        Raw value to clean.

    Returns
    -------
    str
        The cleaned value.
    """
    chars_to_remove = '"*?,()'
    # Strings are immutable: the original called .strip() up front and threw
    # the result away. The single strip on the return line does the trimming.
    normalized = unidecode.unidecode(percentage_string).lower()
    # translate() deletes all unwanted characters in one pass; no membership
    # check is needed because absent characters are simply left untouched.
    removal_table = str.maketrans('', '', chars_to_remove)
    return normalized.translate(removal_table).strip()

In [7]:
# Clean every raw percentage string and store the results as a one-column CSV.
cleaned_percentages = list(map(clean_percentage, percentages))
percents_df = pd.DataFrame(cleaned_percentages)
percents_df.to_csv("cleaned_percents.csv", index=False)