# Introduction
In this notebook, we preprocess the text data

# Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [47]:
import json
import re

import boto3
import sagemaker
import pandas as pd
import matplotlib.pyplot as plt

import nltk

from covid.config import Config

# Setup

In [7]:
config = Config('../config.json')

bucket = 'mleila-covid'
prefix = 'models/'
region_name = boto3.Session().region_name

sess = sagemaker.Session()
role = config.SM_ROLE

# Load Data

In [37]:
# download file
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file('raw/covid/articles.txt', 'articles.txt')

# read file
with open('articles.txt', 'r') as f:
    data = f.readlines()

# Processing

## Remove Stop Words

In [118]:
from nltk.corpus import stopwords

# first time, download stopwords
#nltk.download('stopwords');

stop_words = stopwords.words('english')
clean_data = [' '.join([w for w in sentence.split() if w not in stop_words]) for sentence in data]

In [119]:
print(data[0])
print(clean_data[0])

Merch vendors are selling novelty shirts that have to do with the new coronavirus or COVID-19. The respiratory infection has already driven face mask vendors to raise prices, and now shirt sellers see an opportunity for profit. 

Merch vendors selling novelty shirts new coronavirus COVID-19. The respiratory infection already driven face mask vendors raise prices, shirt sellers see opportunity profit.


## Lowercase everything

In [120]:
clean_data = [sentence.lower() for sentence in clean_data]

# Remove punctuation

In [121]:
def remove_punct(text: str):
    """
    Remove punctuations using regex rules.
    """
    # remove tbc dots
    text = re.sub('…', '', text)
    # remove urls
    text = re.sub('http\S+', '', text, flags=re.MULTILINE)
    # remove periods and commas, etc.
    text = (text
            .replace('.', '')
            .replace(',', '')
            .replace(':', '')
            .replace('#', '')
            .replace('"', '')
            .replace("'", '')
            .replace(")", '')
            .replace("(", '')
           ) 
    # remove single character words
    text = ' '.join([word for word in text.split() if len(word)>1])
    return text

In [122]:
clean_data = [remove_punct(sentence) for sentence in clean_data]

In [124]:
clean_data[30:32]

['apple google taking measures prevent spread coronavirus misinformation apps according report cnbc apple one rejecting coronavirus-related mobile apps government official health organizati',
 'timeline coronavirus covid-19 canada ctv news husband toronto woman returned iran ontarios sixth covid-19 case cp24 torontos breaking news husband ontarios fifth covid-19 patient also tests positive virus ctv news husband cov']

# Write Data back to S3

In [134]:
with open('clean_articles.txt', 'w') as f:
    for line in data:
        f.writelines(line)

In [135]:
s3.Bucket(bucket).upload_file('clean_articles.txt', 'processed/covid/articles.txt')