## Connect to Google Drive

In [12]:
# Mount Google Drive inside the Colab VM at /content/gdrive; the project
# directory and the data files used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [13]:
!pwd
# Change directory
%cd ../content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
!pwd

/content/gdrive/My Drive/Colab Notebooks/Kaggle/NLP0
[Errno 2] No such file or directory: '../content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0'
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
/content/gdrive/My Drive/Colab Notebooks/Kaggle/NLP0


# Setup

In [14]:
import pandas as pd

from pprint import pprint
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Train

## Load data

In [15]:
# Load the sentence-level dataset. The path is relative, so it resolves
# against the working directory selected by the %cd cell above.
sent = pd.read_csv("data/sent.csv") # sentences dataset
# Alternative inputs, currently disabled:
# words_stp = pd.read_csv("data/words_stp.csv") # words dataset in long format
# word_w = pd.read_csv("data/word_w.csv") # words dataset in wide format
# word_ste_w = pd.read_csv("data/word_ste_w.csv") # stemmed words (roots) dataset in wide format

In [16]:
# Replace missing cleaned tweets with empty strings so every row holds a str.
sent["clean_tweet"] = sent["clean_tweet"].fillna("")

# Sanity check: list any rows still missing a cleaned tweet (expected: none).
sent.loc[sent["clean_tweet"].isna()]

Unnamed: 0.1,Unnamed: 0,text,clean_tweet,retweeted,mentioned,hashtags


## Model

In [17]:
# Convert text into a matrix of token counts using CountVectorizer.
# max_df / min_df prune the vocabulary by document frequency (see the
# CountVectorizer documentation for the exact float-vs-int semantics).
# The pattern is a raw string: it is full of regex escapes (\w, \$, \d, \S)
# that are invalid *string* escapes and raise warnings on recent Python
# versions when written in a plain literal. The regex itself is unchanged.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# Extract the matrix of counts as a dense array.
tf = vectorizer.fit_transform(sent['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    # list(...) keeps the same container type the old API returned.
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = vectorizer.get_feature_names()



In [18]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        ``components_`` is iterated row by row, one row per topic; each row
        must support ``argsort()`` and integer indexing (e.g. a NumPy array).
    feature_names : sequence
        Word for each column index of ``components_``.
    no_top_words : int
        How many of the largest-weight words to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <i> words"`` and
        ``"Topic <i> weights"``, ordered from the largest weight down.
        Weights are strings formatted with one decimal place.
    """
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        # Rank the topic once and reuse it for both columns: argsort is
        # ascending, so the reversed slice yields the indices of the
        # `no_top_words` largest weights, biggest first.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        topic_dict["Topic %d words" % (topic_idx)] = [
            '{}'.format(feature_names[i]) for i in top_indices
        ]
        topic_dict["Topic %d weights" % (topic_idx)] = [
            '{:.1f}'.format(topic[i]) for i in top_indices
        ]
    return pd.DataFrame(topic_dict)

In [19]:
# Fit a two-topic LDA on the token-count matrix `tf`; the fixed random_state
# keeps repeated fits reproducible.
number_of_topics = 2
model = LDA(n_components=number_of_topics, random_state=0).fit(tf)

# Echo the fitted estimator as the cell's output.
model

LatentDirichletAllocation(n_components=2, random_state=0)

In [None]:
# Show the 10 highest-weighted words (with their weights) of each topic as a table.
no_top_words = 10
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights
0,fire,363.2,amp,343.4
1,û,345.0,like,323.3
2,bomb,227.5,get,309.1
3,via,219.7,go,196.8
4,new,184.6,peopl,193.8
5,video,174.3,one,193.5
6,crash,164.4,burn,153.3
7,disast,159.6,day,150.1
8,kill,156.1,emerg,144.8
9,bodi,155.4,love,139.7


In [23]:
# Dimensions of the fitted model's topic-word weight matrix.
# NOTE(review): scikit-learn documents components_ as (n_components, n_features),
# so the recorded (7613, 6) output does not fit the 2-topic model fitted in this
# notebook and looks stale (out-of-order execution?) — re-run on a fresh kernel to confirm.
model.components_.shape

(7613, 6)