In [None]:
import re
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

In [None]:
# Load the article dataset from a CSV in the working directory; the cells below
# read and progressively rewrite its 'Article' column.
df = pd.read_csv('npr.csv')

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

## Data Cleaning

In [None]:
### Lowercasing ###
# Normalise case so that downstream counts treat e.g. "The" and "the" as one word.
df['Article'] = df['Article'].str.lower()

In [None]:
### Using correct apostrophe ###
def replace_apostrophe(text):
    """Return ``text`` with every typographic apostrophe (’) replaced by the ASCII one (').

    Only that single character is rewritten; everything else in ``text``
    is passed through unchanged.
    """
    return re.sub(r"’", r"'", text)

# Normalise apostrophes in every article (the function is passed directly; no wrapper needed).
df['Article'] = df['Article'].map(replace_apostrophe)

In [None]:
### Removing hashtag words: (#brexit, #dumpsterfyre) ###  
# Compiled once: '#' followed by one or more word characters. ``\w`` covers
# letters (including non-ASCII ones), digits and the underscore.
_HASHTAG_RE = re.compile(r"#\w+")


def remove_hash_words(text):
    """Delete hashtag words such as ``#brexit`` from ``text``.

    A hashtag is a ``#`` immediately followed by one or more word characters.
    Underscores and non-ASCII letters are part of the tag, so ``#black_friday``
    is removed in full rather than leaving ``_friday`` behind. A lone ``#`` with
    nothing after it is left alone. Whitespace around a removed tag is not
    collapsed.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        ``text`` with every hashtag word removed.
    """
    return _HASHTAG_RE.sub("", text)

# Strip hashtag words from every article.
df['Article'] = df['Article'].map(remove_hash_words)

In [None]:
### Remove Stopwords ###
# English stopword list from NLTK, held as a set for fast membership tests.
# NOTE: stopwords.words() needs the NLTK 'stopwords' corpus to be available locally
# (e.g. after nltk.download('stopwords')); otherwise it raises LookupError.
stop = set(stopwords.words("english"))

def remove_stopwords(text, stop_words=None):
    """Drop stopwords from ``text`` and return the remaining words, lowercased.

    ``text`` is split on whitespace. Each word is lowercased and kept unless it
    is a stopword. The stopword comparison ignores punctuation attached to the
    edges of a word, so ``"the,"`` or ``"(and)"`` are recognised as stopwords
    and removed; without this they would slip through and turn into bare
    stopwords once punctuation is stripped in a later step. Words that are kept
    retain their punctuation.

    Parameters
    ----------
    text : str
        The text to filter.
    stop_words : collection of str, optional
        Words to remove. Defaults to the module-level ``stop`` set.

    Returns
    -------
    str
        The surviving lowercased words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop

    kept = []
    for word in text.split():
        lowered = word.lower()
        # Only edge punctuation is ignored, so contractions such as "don't"
        # are still compared in full.
        if lowered.strip(string.punctuation) not in stop_words:
            kept.append(lowered)

    return " ".join(kept)

# Filter stopwords out of every article.
df['Article'] = df['Article'].map(remove_stopwords)

In [None]:
### Remove Punctuation ###
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so ``"well-known"``
    becomes ``"wellknown"``. ``string.punctuation`` is ASCII-only, so
    typographic marks such as curly quotes or long dashes are left in place.
    """
    # Mapping each punctuation code point to None tells str.translate to drop it.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

# Strip ASCII punctuation from every article.
df['Article'] = df['Article'].map(remove_punct)

## Creating Corpus

In [None]:
%%time
def into_tokens(text):
    """Tokenize ``text`` with a freshly constructed NLTK ``TweetTokenizer`` and return the tokens."""
    tokenizer = TweetTokenizer()
    return tokenizer.tokenize(text)

# Replace each article string with its tokenized form.
df['Article'] = df['Article'].map(into_tokens)

In [None]:
def create_corpus(df):
    """Flatten the per-row token sequences in ``df['Article']`` into one list.

    Rows are visited in order and the tokens of each row keep their order,
    so the result is the concatenation of all rows' tokens.
    """
    return [token for article_tokens in df['Article'] for token in article_tokens]

In [None]:
# Flatten all articles' tokens into one list for corpus-wide statistics.
corpus = create_corpus(df)

In [None]:
# Peek at the first few tokens.
corpus[:5]

### Vocabulary and Word Frequency

In [None]:
# Count occurrences of every token in the corpus.
fdist = nltk.FreqDist(corpus)

In [None]:
# Report the number of distinct tokens and the 100 most frequent ones.
print("Vocabulary Size: ", len(fdist))
print("Most frequent words: ", fdist.most_common(100))