## Connect to drive

In [8]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so the notebook can reach files stored there.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
!pwd
# Change directory
%cd ../content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
!pwd

/content
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0
/content/gdrive/MyDrive/Colab Notebooks/Kaggle/NLP0


# Setup

In [21]:
import pandas as pd

from pprint import pprint
import gensim
import gensim.corpora as corpora
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Train

## Load data

In [34]:
# Read data/sent.csv (path is relative to the current working directory) into
# `sent`; the modelling cells below use its 'clean_tweet' column.
sent = pd.read_csv("data/sent.csv")
# Other CSV inputs, currently disabled:
# words_stp = pd.read_csv("data/words_stp.csv")
# word_w = pd.read_csv("data/word_w.csv")
# word_ste_w = pd.read_csv("data/word_ste_w.csv")

## Model

In [38]:
# Replace missing values in 'clean_tweet' with empty strings.
sent['clean_tweet'] = sent['clean_tweet'].fillna("")
# Sanity check: show any rows whose 'clean_tweet' is still missing
# (an empty frame is the expected result after the fill above).
sent[sent.clean_tweet.isna()]

Unnamed: 0.1,Unnamed: 0,text,clean_tweet


In [39]:
# The vectorizer object will be used to transform text to vector form.
# max_df=0.9 ignores terms that occur in more than 90% of the documents;
# min_df=25 ignores terms that occur in fewer than 25 documents.
# The token pattern is a raw string so the regex escapes (\w, \$, \d, \S)
# reach the regex engine untouched instead of being parsed as (invalid)
# Python string escapes; the resulting pattern is unchanged.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# Apply the transformation. fit_transform returns a sparse matrix; it is
# converted to a dense array here, which later cells receive as `tf`.
tf = vectorizer.fit_transform(sent['clean_tweet']).toarray()

# tf_feature_names tells us what word each column in the matrix represents.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on old versions that lack it.
# list(...) keeps tf_feature_names a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    tf_feature_names = list(vectorizer.get_feature_names_out())
else:
    tf_feature_names = list(vectorizer.get_feature_names())



In [40]:
# Number of topics the LDA model is asked to find.
number_of_topics = 10
# random_state is pinned so repeated runs use the same seed.
model = LDA(n_components=number_of_topics, random_state=0)

# Fit on the term-count matrix `tf`; as the last expression, the fitted
# estimator's repr becomes the cell output.
model.fit(tf)

LatentDirichletAllocation(random_state=0)

In [23]:
def display_topics(model, feature_names, no_top_words):
    """Tabulate the highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row holds one topic's per-term weights and
        must support ``argsort()`` and integer indexing.
    feature_names : sequence
        Term for each column of ``model.components_``, indexed by position.
    no_top_words : int
        How many of the largest-weight terms to list per topic.

    Returns
    -------
    pandas.DataFrame
        Two columns per topic, ``"Topic <k> words"`` and
        ``"Topic <k> weights"``, ordered from largest weight to smallest.
        Weights are strings formatted to one decimal place.
    """
    columns = {}
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so a reversed slice from the end yields the
        # positions of the largest weights first.
        ranked = weights.argsort()[:-no_top_words - 1:-1]

        words = []
        scores = []
        for pos in ranked:
            words.append('{}'.format(feature_names[pos]))
            scores.append('{:.1f}'.format(weights[pos]))

        columns["Topic %d words" % (topic_idx)] = words
        columns["Topic %d weights" % (topic_idx)] = scores
    return pd.DataFrame(columns)

In [41]:
# How many of the highest-weighted words to list for each topic.
no_top_words = 10
# Last expression of the cell, so the resulting DataFrame is rendered as the output.
display_topics(model, tf_feature_names, no_top_words)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,fire,287.7,burn,125.7,bodi,130.6,amp,340.6,emerg,164.1,via,156.2,bomb,237.8,time,146.5,like,376.2,go,145.2
1,live,92.0,build,108.7,bag,118.1,û,128.3,wreck,95.5,u,122.4,suicid,120.1,new,105.0,video,174.1,destroy,90.1
2,mani,67.2,would,101.4,news,103.8,flood,115.6,car,94.7,see,122.3,nuclear,103.1,storm,90.5,look,139.9,good,86.6
3,fuck,64.4,peopl,89.8,attack,82.4,get,102.1,plan,93.1,collaps,108.4,hiroshima,95.1,may,88.1,crash,87.1,get,84.4
4,reddit,63.1,day,83.2,wildfir,82.1,disast,88.3,murder,90.1,scream,99.0,gt,94.1,hijack,80.5,year,75.8,let,82.0
5,set,63.1,riot,78.0,train,79.4,w,85.0,fear,77.1,famili,97.7,one,87.7,drown,64.5,delug,65.1,run,68.8
6,quarantin,59.1,dead,77.6,home,78.1,devast,81.1,crash,77.1,fatal,94.3,kill,83.8,think,62.4,feel,61.0,say,60.5
7,forest,57.6,mh,74.1,derail,73.5,rt,75.6,mass,76.1,rescu,82.1,surviv,83.3,casualti,59.1,today,60.9,desol,59.1
8,updat,56.9,us,67.3,explod,69.2,fire,75.5,man,73.1,way,68.2,deton,81.3,oil,54.1,thank,60.6,polic,59.1
9,save,54.5,miss,67.2,fall,69.1,watch,70.0,warn,70.1,two,64.0,armi,72.1,earthquak,53.1,charg,53.1,want,59.1
