In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw tweet export (latin1-encoded) and keep only the tweet text column.
data = pd.read_csv(
    '../input/elon-musks-tweets/data_elonmusk.csv',
    encoding='latin1',
)[['Tweet']]
data.head()


Unnamed: 0,Tweet
0,@MeltingIce Assuming max acceleration of 2 to ...
1,RT @SpaceX: BFR is capable of transporting sat...
2,@bigajm Yup :)
3,Part 2 https://t.co/8Fvu57muhM
4,Fly to most places on Earth in under 30 mins a...


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# Build `corpus`: one cleaned, lemmatized, space-joined string per tweet.
#
# The English stop-word set and the lemmatizer do not change between tweets,
# so they are created once here instead of once per token inside the loop.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

corpus = []

# Iterate over the column values directly; indexing with data['Tweet'][i] is a
# label lookup and only works while the index is the default 0..n-1 range.
for raw_tweet in data['Tweet']:
    # Replace every character that is not an ASCII letter with a space
    # (this also breaks URLs and @mentions into plain letter runs).
    tweet = re.sub('[^a-zA-Z]', ' ', raw_tweet)
    tweet = word_tokenize(tweet.lower())
    # First pass: drop stop words and lemmatize with the lemmatizer's default
    # part of speech.
    tweet = [lemmatizer.lemmatize(w) for w in tweet if w not in stop_words]
    # Second pass: lemmatize as verbs (pos='v'), again skipping any token
    # that is a stop word after the first pass.
    tweet = [lemmatizer.lemmatize(w, pos='v') for w in tweet if w not in stop_words]
    corpus.append(' '.join(tweet))

In [5]:
# Attach the cleaned text next to the raw tweets so both can be compared.
data = data.assign(**{'Cleaned Tweets': corpus})
data.head()

Unnamed: 0,Tweet,Cleaned Tweets
0,@MeltingIce Assuming max acceleration of 2 to ...,meltingice assume max acceleration g comfortab...
1,RT @SpaceX: BFR is capable of transporting sat...,rt spacex bfr capable transport satellite orbi...
2,@bigajm Yup :),bigajm yup
3,Part 2 https://t.co/8Fvu57muhM,part http co fvu muhm
4,Fly to most places on Earth in under 30 mins a...,fly place earth min anywhere cost per seat htt...


In [6]:
# Keep only the cleaned text; the raw 'Tweet' column is not used past this point.
data = data.drop(columns=['Tweet'])
data.head()

Unnamed: 0,Cleaned Tweets
0,meltingice assume max acceleration g comfortab...
1,rt spacex bfr capable transport satellite orbi...
2,bigajm yup
3,part http co fvu muhm
4,fly place earth min anywhere cost per seat htt...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the cleaned tweets into a TF-IDF matrix, with the vocabulary
# capped at 1000 features via max_features.
tfIdfVectorizer = TfidfVectorizer(
    max_features=1000,
)
tfIdf = tfIdfVectorizer.fit_transform(corpus)

In [8]:
# Show the matrix dimensions followed by its sparse (row, col) -> weight listing.
print(tfIdf.shape, tfIdf, sep='\n')

(3218, 1000)
  (0, 148)	0.16284148880681937
  (0, 408)	0.16043505292758106
  (0, 484)	0.3474697540107758
  (0, 300)	0.5062179655600014
  (0, 4)	0.5776220108847048
  (0, 524)	0.4869334628558575
  (1, 547)	0.29911899674811776
  (1, 542)	0.2656301669214715
  (1, 160)	0.2927309626103846
  (1, 813)	0.2524300982776499
  (1, 799)	0.2405704364533285
  (1, 124)	0.32540869416061585
  (1, 188)	0.3120031218593433
  (1, 597)	0.27314000439218766
  (1, 738)	0.2705093144348583
  (1, 897)	0.3294085538455777
  (1, 119)	0.34967028144973716
  (1, 802)	0.17648202084174694
  (1, 730)	0.14049630630811794
  (2, 998)	1.0
  (3, 612)	0.906379746379614
  (3, 148)	0.3009424503625335
  (3, 408)	0.29649518870062724
  (4, 748)	0.37673353904317436
  (4, 617)	0.36731064205525843
  :	:
  (3214, 408)	0.3576071782769709
  (3215, 75)	0.3595619360187262
  (3215, 133)	0.6422651466908904
  (3215, 264)	0.33677150505187164
  (3215, 352)	0.2171284259384925
  (3215, 741)	0.5041579266628073
  (3215, 727)	0.20850822652889206
  (321

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-topic LDA model with the online learning method and a fixed seed,
# then keep the per-tweet topic distribution returned by fit_transform.
# NOTE(review): the model is fit on TF-IDF weights with a single iteration
# (max_iter=1); confirm this is intended rather than raw term counts and
# more iterations.
lda_settings = {
    'n_components': 10,
    'learning_method': 'online',
    'random_state': 42,
    'max_iter': 1,
}
LDA_model = LatentDirichletAllocation(**lda_settings)
lda_top = LDA_model.fit_transform(tfIdf)


In [10]:
# Show the shape of the tweet-by-topic matrix, then the matrix itself.
print(lda_top.shape, lda_top, sep='\n')

(3218, 10)
[[0.03085005 0.03086529 0.03084997 ... 0.72228122 0.03085247 0.03084988]
 [0.02208678 0.02208708 0.02208749 ... 0.11035964 0.14496409 0.02208678]
 [0.05       0.05       0.05       ... 0.05       0.05       0.05      ]
 ...
 [0.03059609 0.03059723 0.03059608 ... 0.03059801 0.5115445  0.24367715]
 [0.02626797 0.02627319 0.23657458 ... 0.2417908  0.3377445  0.0262677 ]
 [0.02473022 0.02473162 0.02473011 ... 0.40912179 0.39302788 0.0247302 ]]


In [11]:
# Print the topic mixture of the first tweet, one topic per line, as percentages.
first_tweet_mix = lda_top[0]
print("Tweet 0: ")
for topic_idx in range(len(first_tweet_mix)):
    share = first_tweet_mix[topic_idx]
    print("Topic ", topic_idx, ": ", share*100, "%")

Tweet 0: 
Topic  0 :  3.0850051612788354 %
Topic  1 :  3.086529176801379 %
Topic  2 :  3.084996695078226 %
Topic  3 :  3.0850792638059414 %
Topic  4 :  3.0898532865211936 %
Topic  5 :  3.085142935823786 %
Topic  6 :  3.085036584530033 %
Topic  7 :  72.22812175350876 %
Topic  8 :  3.085247398010872 %
Topic  9 :  3.084987744640991 %


Topic 7 is clearly the dominant topic in the first tweet (Tweet 0), with a weight of about 72%; every other topic sits at roughly 3%.

In [12]:
# Show the fitted topic-by-term weight matrix, then its shape.
topic_term_weights = LDA_model.components_
print(topic_term_weights, topic_term_weights.shape, sep='\n')

[[0.13604461 0.13507133 0.13342439 ... 0.13871575 0.13071592 3.70264297]
 [0.13843829 0.13762983 1.01048858 ... 0.13453394 0.13576778 0.31360089]
 [0.13122908 0.13400842 0.13687301 ... 0.139168   0.131691   0.13351702]
 ...
 [6.36484877 0.67131585 3.07161612 ... 0.17045915 0.13906939 0.16321635]
 [0.21313864 2.34862899 2.44643931 ... 0.1648857  0.13279309 1.42044855]
 [0.1305678  0.13111178 0.13944794 ... 0.1637143  0.12800788 0.13399043]]
(10, 1000)


In [13]:
# Rank the vocabulary for every topic, from highest to lowest component weight.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older installs.
if hasattr(tfIdfVectorizer, 'get_feature_names_out'):
    vocab = list(tfIdfVectorizer.get_feature_names_out())
else:
    vocab = tfIdfVectorizer.get_feature_names()

columns = []  # column labels: "Topic 0", "Topic 1", ...
words = []    # words[i] is the vocabulary ordered by weight in topic i

for i, comp in enumerate(LDA_model.components_):
    # sorted() is stable, so terms with equal weight keep vocabulary order.
    ranked = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)
    columns.append("Topic " + str(i))
    # Store a plain list of words rather than a dict_keys view.
    words.append([word for word, _weight in ranked])


In [14]:
# One column per topic, rows ordered from most to least relevant word;
# display the top 20 rows.
topics_relevant_words = (
    pd.DataFrame(words)
    .transpose()
    .rename(columns=dict(enumerate(columns)))
)
topics_relevant_words.head(20)


Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9
0,plus,co,trip,short,great,thank,yes,http,http,break
1,show,talk,definitely,co,little,elonmusk,space,co,co,picture
2,data,http,possible,http,news,article,station,exactly,rt,call
3,bill,super,many,lot,co,http,mission,model,launch,day
4,latest,life,need,spacex,move,co,dragon,drive,rocket,sale
5,teslaroadtrip,get,problem,work,http,actually,nasa,tesla,spacex,europe
6,almost,one,could,team,newscientist,charge,course,rt,land,tax
7,wish,good,road,appreciate,youtube,tesla,video,like,falcon,judge
8,believe,climate,piece,well,million,name,true,car,teslamotors,bbc
9,giant,yeah,confirm,hyperloop,article,rt,head,teslamotors,tesla,vehicle
