# Text Processing

##### The goal of this notebook is to clean up the documents via tokenization, lemmatization, stop word removal, removal of pronouns, and removal of punctuation, and finally to keep only the nouns that remain.

In [16]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
from string import punctuation

In [17]:
# Load spaCy's small English pipeline by its package name. The 'en' shortcut
# link only exists in spaCy 2.x, whereas the package name works wherever the
# model is installed.
# If the model is missing, install it with:
#     python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

In [18]:
# Start with an empty list that is meant to hold the finished words.
wordlist = list()


In [19]:
# Load the dataset that the previous notebook wrote out as a comma-separated file.
data = pd.read_csv(
    'articleSummaryandPublishDate.csv',
    encoding='utf-8',
    sep=',',
)

In [20]:
# Preview the first five rows; the text we want to clean is in the 'ArticleText' column.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,ArticleText,PublishDate
0,0,Our live coverage of the coronavirus outbreak ...,2020-02-01
1,1,Please click here to read more live updates on...,2020-02-02
2,2,Our live coverage of the Wuhan coronavirus out...,2020-02-03
3,3,Our live coverage of the coronavirus outbreak ...,2020-02-04
4,4,Our live coverage of the coronavirus outbreak ...,2020-02-05


In [21]:
# Column layout of the final dataframe: the two source columns plus the cleaned text.
finalcolumnnames = [
    'ArticleText',
    'PublishDate',
    'CleanedArticleText',
]

In [22]:
# Empty dataframe with the final column layout; it is a placeholder for the
# cleaned results produced by the cleaning cell below.
finaldf = pd.DataFrame(columns=finalcolumnnames)

In [23]:
# Words noticed throughout the dataset (page boilerplate and URL fragments)
# that should never reach the cleaned output. Built once, not once per article.
custom_remove_list = ['our', 'live', 'coverage', 'of', 'the', 'has', 'moved', 'here', '\\n', 'http', ':', '//']

# Individual punctuation characters from the 'string' library, as a set so that
# membership is tested per character rather than as a substring of one long string.
_PUNCTUATION_CHARS = set(punctuation)


def _is_punctuation_or_empty(word):
    """Return True when `word` is empty or made up solely of punctuation characters."""
    # all() over an empty string is True, so empty tokens (whitespace tokens
    # after .strip()) are dropped here as well.
    return all(char in _PUNCTUATION_CHARS for char in word)


def _clean_article_text(article_text):
    """Reduce one article to a list of lower-cased noun lemmas.

    Steps: tokenize with the spaCy pipeline `nlp`, lemmatize every token, drop
    pronouns, stop words, the custom boilerplate words and punctuation, then
    keep only the words the model tags as nouns.

    Parameters
    ----------
    article_text : object
        Raw article text; it is passed through str() before processing.

    Returns
    -------
    list of str
        The surviving noun strings, in document order.
    """
    # apply the spaCy model to tokenize the document
    doc = nlp(str(article_text))

    # Lemmatize each token and drop pronouns. "-PRON-" is the placeholder lemma
    # spaCy 2.x assigns to pronouns; on pipelines that do not emit it, pronouns
    # are still caught by the stop-word filter below.
    lemmas = [
        token.lemma_.lower().strip()
        for token in doc
        if token.lemma_ != "-PRON-"
    ]

    # Drop spaCy stop words, the custom boilerplate words and punctuation.
    # Membership (`not in`) is required here: comparing a token to the whole
    # stop-word collection with `!=` is always True and filters nothing.
    candidate_words = [
        lemma
        for lemma in lemmas
        if lemma not in STOP_WORDS
        and lemma not in custom_remove_list
        and not _is_punctuation_or_empty(lemma)
    ]

    # Keep only the nouns: each remaining word is tagged on its own by the model.
    # NOTE(review): this runs the full pipeline once per word, which is slow on
    # long articles; tagging in context during the first pass would be faster
    # but can change which words are kept.
    nouns = []
    for word in candidate_words:
        for tagged in nlp(word):
            if tagged.pos_ == 'NOUN':
                nouns.append(tagged.text)
    return nouns


# Collect one record per article and build the dataframe once at the end;
# growing a dataframe row by row is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
cleaned_records = []
for article_text, publish_date in zip(data["ArticleText"], data["PublishDate"]):
    cleaned_records.append({
        "ArticleText": article_text,
        "PublishDate": publish_date,
        "CleanedArticleText": _clean_article_text(article_text),
    })

finaldf = pd.DataFrame(cleaned_records, columns=finalcolumnnames)

In [24]:
# Preview the first five rows of the cleaned results.
finaldf.head(n=5)

Unnamed: 0,ArticleText,PublishDate,CleanedArticleText
0,Our live coverage of the coronavirus outbreak ...,2020-02-01,"[end, today, people, mainland, end, country, h..."
1,Please click here to read more live updates on...,2020-02-02,"[update, field, departure, assistance, affairs..."
2,Our live coverage of the Wuhan coronavirus out...,2020-02-03,"[hospital, statement, isolation, home, health,..."
3,Our live coverage of the coronavirus outbreak ...,2020-02-04,"[health, people, death, toll, people, death, t..."
4,Our live coverage of the coronavirus outbreak ...,2020-02-05,"[number, case, increase, end, day, health, num..."


In [26]:
# Write the cleaned dataframe out as a comma-separated, UTF-8 encoded CSV file.
# NOTE(review): the row index is written as an extra column, and the list in
# 'CleanedArticleText' is stored as text - confirm that is what downstream
# notebooks expect.
finaldf.to_csv(
    'articleCleanedText.csv',
    encoding='utf-8',
    sep=',',
)

##### I did things this way because I wanted to get all the articles cleaned at once without having to separate the work into different notebooks. It was tough: I originally thought that storing the data as a list inside the dataframe would be smart, because I could then just extract it as a list for future use. That turned out not to be the case, and it means I have to clean the list again every time I need to use it. I left it as is because it still shows how to repeat the process if necessary.