## Connect to drive

In [None]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so that the
# notebook can read project files stored on the drive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pwd
# Change directory
%cd ../content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
!pwd

/content
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0


# Setup

In [None]:
import pandas as pd

from pprint import pprint
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Train

## Load data

In [None]:
# Load the sentence-level dataset from the data/ folder, relative to the
# current working directory.
sent = pd.read_csv("data/sent.csv") # sentences ds
# Alternative datasets, currently not loaded.
# NOTE(review): a later cell reads `words_stp`, which only exists if the next
# line is uncommented — confirm whether it should be loaded here.
#words_stp = pd.read_csv("data/words_stp.csv") # words ds in long format
#word_w = pd.read_csv("data/word_w.csv") # words ds in wide format
#word_ste_w = pd.read_csv("data/word_ste_w.csv") # stemmed words (roots) ds in wide format

In [None]:
# Replace missing cleaned tweets with empty strings so the column holds no NaN.
sent['clean_tweet'] = sent.clean_tweet.fillna("")
# Sanity check: display any row whose clean_tweet is still NaN (expected: none).
sent[sent['clean_tweet'].isna()]

Unnamed: 0.1,Unnamed: 0,text,clean_tweet,retweeted,mentioned,hashtags


## Model

In [None]:
# https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
def format_topics_sentences(ldamodel, corpus, texts):
    """Return the dominant topic of every document next to its original text.

    Parameters
    ----------
    ldamodel : object
        Topic model that supports ``ldamodel[corpus]`` (an iterable with one
        entry per document), a ``per_word_topics`` attribute and
        ``show_topic(topic_num)`` returning ``(word, weight)`` pairs.
        This looks like the gensim ``LdaModel`` interface.
    corpus : object
        Corpus handed to ``ldamodel[corpus]`` unchanged.
    texts : sequence
        Original text of each document, in corpus order.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals) and ``Topic_Keywords`` (comma-separated words of the
        dominant topic), followed by an unnamed column ``0`` holding ``texts``.
        A document for which the model reports no topic gets missing values,
        so that rows stay aligned with ``texts``.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    records = []
    keywords_by_topic = {}  # show_topic() result per topic, computed once

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first element
        # is the list of (topic, probability) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Placeholder keeps the positional alignment with `texts`.
            records.append((float('nan'), float('nan'), None))
            continue

        # Dominant topic = highest probability; max() returns the first
        # maximal pair, the same one a stable descending sort would put first.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=columns)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

### LDA Option 1 - input term-count matrix

In [None]:
# Convert text into matrix of token counts using CountVectorizer.
# A token is a run of word characters, a dollar amount such as $1.50, or any
# other run of non-whitespace characters. max_df=0.9 drops tokens present in
# more than 90% of the documents, min_df=25 drops tokens present in fewer than 25.
# The pattern is a raw string so the regex escapes are not parsed as (invalid)
# Python string escapes.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# extract matrix of counts
tf = vectorizer.fit_transform(sent['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents.
# get_feature_names() was removed in scikit-learn 1.2; .tolist() keeps the
# result a plain list of str as before.
tf_feature_names = vectorizer.get_feature_names_out().tolist()



In [None]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of each topic of a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` array, one row of word weights per topic.
    feature_names : sequence mapping a column index of ``components_`` to its word.
    no_top_words : number of words to list per topic.

    Returns
    -------
    pandas.DataFrame
        For every topic ``k`` two columns, ``"Topic k words"`` and
        ``"Topic k weights"`` (weights formatted with one decimal), ordered
        from the largest weight to the smallest.
    """
    table = {}
    for topic_idx, weights in enumerate(model.components_):
        # Indices of the no_top_words largest weights, largest first.
        top_positions = weights.argsort()[:-no_top_words - 1:-1]
        words_key = "Topic %d words" % (topic_idx)
        weights_key = "Topic %d weights" % (topic_idx)
        table[words_key] = ['{}'.format(feature_names[pos]) for pos in top_positions]
        table[weights_key] = ['{:.1f}'.format(weights[pos]) for pos in top_positions]
    return pd.DataFrame(table)

In [None]:
# Fit a two-topic LDA model on the token-count matrix `tf`.
# random_state is fixed so that repeated runs give the same topics.
number_of_topics = 2
model = LDA(n_components=number_of_topics, random_state=0)

model.fit(tf)

LatentDirichletAllocation(n_components=2, random_state=0)

In [None]:
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights
0,fire,363.2,amp,343.4
1,û,345.0,like,323.3
2,bomb,227.5,get,309.1
3,via,219.7,go,196.8
4,new,184.6,peopl,193.8
5,video,174.3,one,193.5
6,crash,164.4,burn,153.3
7,disast,159.6,day,150.1
8,kill,156.1,emerg,144.8
9,bodi,155.4,love,139.7


In [None]:
# Shape of the fitted topic-word weight matrix: (number of topics, vocabulary size).
model.components_.shape

(2, 595)

### LDA Option 2 - input tf-idf matrix

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import numpy as np

In [None]:
# Keep only the rows whose clean_tweet is not NaN.
# NOTE(review): an earlier cell already replaces NaN in clean_tweet with "",
# so when the cells run in order this filter keeps every row (empty strings
# are not NaN) — confirm whether empty tweets should be dropped instead.
sent_new = sent[sent.clean_tweet.notna()]
#sent[4495:4498]

In [None]:
# Cleaned tweets as a plain Python list, one entry per row of sent_new.
documents_list = sent_new.clean_tweet.tolist()

In [None]:
# Initialize regex tokenizer: every run of word characters becomes one token
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF on lower-cased unigrams, dropping
# scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize,
                        # The custom tokenizer replaces the default token_pattern;
                        # passing None avoids scikit-learn's "token_pattern will
                        # not be used" UserWarning.
                        token_pattern=None)

In [None]:
# Fit and Transform the documents
# Learns the vocabulary and IDF weights from documents_list and returns the
# document-term TF-IDF matrix.
train_data = tfidf.fit_transform(documents_list) 

In [None]:
# Define the number of topics or components
num_components = 2

# Create LDA object. random_state is fixed so that re-running the notebook
# reproduces the same topics.
model = LDA(n_components=num_components, random_state=0)

# Fit the LDA model on the TF-IDF matrix and get the per-document topic
# distribution (one row per document, one column per topic).
lda_matrix = model.fit_transform(train_data)

# Get Components: topic-term weight matrix, one row per topic
lda_components = model.components_

In [None]:
# Print the topics with their terms
n_top_terms = 7  # how many of the heaviest terms to show per topic

# get_feature_names() was removed in scikit-learn 1.2; .tolist() yields plain
# str objects so the printed lists look the same as before.
terms = tfidf.get_feature_names_out().tolist()

for index, component in enumerate(lda_components):
    # Pair every vocabulary term with its weight in this topic, heaviest first.
    ranked = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _ in ranked[:n_top_terms]]
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['like', 'emerg', 'disast', 'love', 'feel', 'attack', 'obliter']
Topic 1:  ['bomb', 'û', 'burn', 'scream', 'amp', 'year', 'like']




In [None]:
# Fit the model with gensim
# This cell only prepares the gensim inputs (dictionary and bag-of-words
# corpus); no gensim model is trained here.
import gensim

# NOTE(review): `tst` is assigned only in a later cell of this notebook, so on
# a fresh kernel run top to bottom this cell raises NameError — confirm the
# intended cell order.
# Create Dictionary
id2word = corpora.Dictionary(tst)
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in tst]

In [None]:
# Build one feature vector per document: 20 topic probabilities followed by
# the document's real_counts value and the length of its text.
# NOTE(review): this cell does not match the objects defined in the visible cells:
#   - `model` was last assigned an sklearn LatentDirichletAllocation, while
#     get_document_topics looks like the gensim LdaModel API — confirm which
#     model is meant;
#   - `documents_list` is a plain list (built with .tolist()), so `.iloc` is not
#     available — presumably a DataFrame was intended; verify;
#   - range(20) assumes 20 topics, but the models above are fit with 2.
train_vecs = []
for i in range(len(documents_list)):
    top_topics = (
        model.get_document_topics(documents_list[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(20)]
    topic_vec.extend([documents_list.iloc[i].real_counts])
    topic_vec.extend([len(documents_list.iloc[i].text)])
    train_vecs.append(topic_vec)

In [None]:
# Token lists per document: tst[i] holds the word_ste values of the rows whose
# idx equals i, in their original row order (an empty list when no row has
# that idx).
# NOTE(review): words_stp is not assigned in the visible cells — its read_csv
# line in the "Load data" cell is commented out; load it before running this.
# A single groupby replaces one full boolean scan of the frame per document.
words_by_idx = words_stp.groupby('idx')['word_ste'].agg(list).to_dict()
# + 1 so that the document with the largest idx is included (range() excludes
# its stop value).
tst = [words_by_idx.get(i, []) for i in range(words_stp.idx.max() + 1)]

In [None]:
# Preview the first few token lists instead of rendering the whole corpus
# in the cell output.
tst[:5]

In [None]:
# Table with the dominant topic, its contribution and its keywords for every document.
# NOTE(review): format_topics_sentences indexes the model with the corpus and
# uses per_word_topics / show_topic, which looks like the gensim LdaModel
# interface, whereas `model` was last assigned an sklearn
# LatentDirichletAllocation and train_data is a TF-IDF matrix — confirm that a
# gensim model and its bag-of-words corpus should be passed here instead.
format_topics_sentences(model, train_data, documents_list)