### 1. Cleaning Function

- lower case
- removal of punctuation
- removal of stopwords

In [None]:
import string 
from nltk.corpus import stopwords 
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to
# the result of `.words("english")`, so `stopwords.words(...)` can no longer
# be called after this line. `_clean` below reads this module-level name.
stopwords = stopwords.words("english")

def _clean(txt):
    txt = txt.lower()
    txt = "".join(x for x in txt if x not in string.punctuation)
    words = txt.split()
    words = [w for w in words if w not in stopwords]
    txt = " ".join(words)
    return txt

# Derive the cleaned column by mapping _clean over every raw tweet,
# then show the raw and cleaned text side by side.
tweets["cleaned"] = tweets["text"].apply(_clean)
tweets[["text", "cleaned"]]

    - 'tweets' is a dataframe
    - 'text' is a direct column from DF
    - 'cleaned' is a derived column after cleaning the text

### 2. Fetching Top 100 Most Frequent Words

In [None]:
## Keyword Analysis 
from collections import Counter
# Concatenate every raw tweet into one document and normalise it in one pass.
complete_text = " ".join(text for text in tweets["text"])
clean_text = _clean(complete_text)

# Tally whitespace-separated tokens and show the 100 most frequent ones.
Counter(clean_text.split()).most_common(n=100)

### 3. Fetching Top 100 Mentions (Personalities)

In [None]:
## Top Mentions
import re

# Tokens such as "@user:" or "@user," carry trailing punctuation, so counting
# raw tokens spreads one handle over several keys. For every token that starts
# with "@", keep only the handle itself (letters, digits, underscore).
mentions = [
    m.group()
    for m in (re.match(r"@[A-Za-z0-9_]+", w) for w in complete_text.split())
    if m
]
Counter(mentions).most_common(100)

### 4. Fetching Top 100 Hashtags

In [None]:
import re

# Tokens such as "#tag," or "#tag!" carry trailing punctuation, so counting
# raw tokens spreads one hashtag over several keys. For every token that
# starts with "#", keep only the leading run of word characters.
htags = [
    m.group()
    for m in (re.match(r"#\w+", w) for w in complete_text.split())
    if m
]
# Drop hashtags containing "demo" (case-insensitive).
htags = [w for w in htags if "demo" not in w.lower()]
Counter(htags).most_common(100)

### 5. Fetching Top 100 URLs

In [None]:
# Collect URL tokens under their own name so the hashtag list built in the
# previous cell is not silently overwritten with URLs.
urls = [w for w in complete_text.split() if w.startswith("http")]
# Drop URLs containing "demon" (case-insensitive).
urls = [w for w in urls if "demon" not in w.lower()]
Counter(urls).most_common(100)

### 6. Fetching Top 100 Bigrams

In [None]:
from nltk import ngrams

# Build bigrams tweet by tweet. Pairing words over the concatenation of all
# tweets would also count the last word of one tweet with the first word of
# the next, which is not a real bigram.
bigrams = Counter(
    pair
    for cleaned in tweets["cleaned"]
    for pair in ngrams(cleaned.split(), 2)
)
bigrams.most_common(100)

### 7. Extracting Named Entities (NER)

In [None]:
## NER 
import nltk
from nltk import word_tokenize, pos_tag 
from nltk.chunk import tree2conlltags

# Print every named-entity chunk labelled GPE or ORGANIZATION.
for text in tweets["text"]:
    entities = nltk.ne_chunk(pos_tag(word_tokenize(text)))
    for chunk in entities:
        # Items without a `label` attribute are not entity chunks; skip them.
        # Compare the label itself rather than searching str(chunk): a
        # substring test also matches tokens whose text contains "GPE" or
        # "ORGANIZATION", and can print the same chunk twice.
        if hasattr(chunk, "label") and chunk.label() in ("GPE", "ORGANIZATION"):
            print(chunk)

In [None]:
# self-written while studying

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

# Print every labelled chunk that the NE chunker finds in each tweet.
for text in tweets["text"]:
    entities = ne_chunk(pos_tag(word_tokenize(text)))
    for entity in entities:
        # Skip items that carry no `label` attribute.
        if not hasattr(entity, "label"):
            continue
        print(entity)

### 8. Topic Modelling

In [None]:
## Topic Modelling 
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np 

def generate_topic_models(text):
    """Fit a 10-topic LDA model on a bag-of-words view of ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents to model; passed straight to ``CountVectorizer.fit_transform``.

    Returns
    -------
    topic_word : ndarray
        ``lda_model.components_`` — one row of term weights per topic.
    vocab : list of str
        Vocabulary terms, aligned with the columns of ``topic_word``.
    """
    cvectorizer = CountVectorizer(min_df=4, max_features=2000)
    cvz = cvectorizer.fit_transform(text)

    lda_model = LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=20, random_state=42)
    # Only the fitted components are needed; the per-document topic matrix
    # was computed and discarded before, so fit without transforming.
    lda_model.fit(cvz)

    topic_word = lda_model.components_
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer get_feature_names_out() and fall back for old installs.
    if hasattr(cvectorizer, "get_feature_names_out"):
        vocab = list(cvectorizer.get_feature_names_out())
    else:
        vocab = cvectorizer.get_feature_names()
    return topic_word, vocab

n_top_words = 10
topic_word, vocab = generate_topic_models(tweets["cleaned"].values)

# For each topic, list its n_top_words highest-weighted terms, best first.
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)[::-1][:n_top_words]]
    print(f"Topic {i + 1}: {' | '.join(topic_words)}\n")

### 9. Extracting top 30 words visually

In [None]:
def freq_words(x, terms=30):
    """Draw a bar chart of the ``terms`` most frequent words in ``x``.

    ``x`` is an iterable of strings; they are joined with spaces and split on
    whitespace before counting. The figure is shown and nothing is returned.
    """
    # Merge all documents into one string and tokenise on whitespace.
    all_words = ' '.join(x).split()

    # Word -> frequency table as a two-column frame.
    fdist = nltk.FreqDist(all_words)
    words_df = pd.DataFrame({
        'word': list(fdist.keys()),
        'count': list(fdist.values()),
    })

    # Keep only the `terms` rows with the largest counts.
    top_words = words_df.nlargest(n=terms, columns='count')

    # Bar chart of the selected words.
    plt.figure(figsize=(20, 5))
    ax = sns.barplot(x='word', y='count', data=top_words)
    ax.set(ylabel='Count')
    plt.show()
    

In [None]:
# `clean_articles` is not defined in this notebook; the cleaned tweet text
# lives in the "cleaned" column created in section 1.
freq_words(tweets["cleaned"])