In [None]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


# Third-party distributions (pip names) this notebook relies on.
required = {
    'spacy', 'scikit-learn', 'numpy', 'pandas',
    'torch', 'matplotlib', 'wordcloud',
}

# Names of everything pkg_resources can see in the current environment.
installed = set(dist.key for dist in pkg_resources.working_set)
missing = required.difference(installed)

# Install whatever is absent with the same interpreter that runs this kernel;
# pip's normal stdout is silenced, errors still raise CalledProcessError.
if len(missing) > 0:
    python = sys.executable
    subprocess.check_call(
        [python, '-m', 'pip', 'install'] + list(missing),
        stdout=subprocess.DEVNULL,
    )

import spacy
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from spacy.lang.en import English
!python -m spacy download en_core_web_md
import en_core_web_md
# Bare English language object from spacy.lang.en.
# NOTE(review): `en` is not referenced by any cell visible here — confirm it is used later.
en = English()
# Load the downloaded en_core_web_md model package; `nlp` becomes the default
# `model` argument of tokenize() defined further down.
nlp = en_core_web_md.load()

Collecting en_core_web_md==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.2.5/en_core_web_md-2.2.5.tar.gz (96.4MB)
[K     |████████████████████████████████| 96.4MB 1.2MB/s 
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-md: filename=en_core_web_md-2.2.5-cp36-none-any.whl size=98051305 sha256=d653e77c6a71984164b9ed3ef0ad7a3aba375fe80a0e97d52d222a1ec41c5735
  Stored in directory: /tmp/pip-ephem-wheel-cache-y6dkf_v9/wheels/df/94/ad/f5cf59224cea6b5686ac4fd1ad19c8a07bc026e13c36502d81
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


Trying to explore what the tweets are about

In [None]:
# Load the pickled tweet DataFrame from the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# open pickle files that come from a trusted source.
with open('clean_tweets_10k.pkl', 'rb') as f:
    df_tweet = pickle.load(f)
# Quick inspection: first rows, per-column dtypes / non-null counts, row count.
print(df_tweet.head())
df_tweet.info()
print(len(df_tweet))
# NOTE(review): the original comment here read "clean out any rows with null
# values", but no statement in this cell drops rows. Check the non-null counts
# reported by info() above before assuming the frame has been cleaned.

# Last expression of the cell, so the summary statistics are what gets displayed.
df_tweet.describe()

   Target                                               text
0       4  @gypsy_sunday :O omg oh yes, I always forget a...
1       4  at least i know @sarahtondryk and family will ...
2       0      Yesterday it was sunny and today its raining 
3       4  Going to see Chelsea Art College today.. shoul...
4       0  woken up 'early' 3days in a row and im exhaust...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Target  10000 non-null  int64 
 1   text    10000 non-null  object
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
10000


Unnamed: 0,Target
count,10000.0
mean,1.9944
std,2.000092
min,0.0
25%,0.0
50%,0.0
75%,4.0
max,4.0


In [None]:
def tokenize(text, model=nlp, nostopwds=True, lemma=False):
    """Turn ``text`` into a list of cleaned word tokens using a spaCy pipeline.

    Each token produced by ``model(text)`` is filtered in this order:

    1. stop words are dropped when ``nostopwds`` is true;
    2. URL-like tokens are replaced by the placeholder string ``'URL'``
       (they are not removed);
    3. any remaining token that is not purely alphabetic (punctuation,
       numbers, mixed tokens such as ``U.S.``) is dropped;
    4. surviving tokens are emitted as their lemma when ``lemma`` is true,
       otherwise as their lower-cased text.

    Args:
        text: The string to tokenize.
        model: Callable spaCy pipeline that returns an iterable of tokens.
            The default is the module-level ``nlp`` object as it exists when
            this function is defined.
        nostopwds: If true, skip tokens flagged as stop words.
        lemma: If true, emit ``token.lemma_`` instead of ``token.lower_``.

    Returns:
        list[str]: The cleaned tokens, in document order.
    """
    tokens = []
    for tok in model(text):
        # The stop-word test runs first, so a stop word never reaches the
        # URL / alphabetic checks below.
        if nostopwds and tok.is_stop:
            continue
        if tok.like_url:
            tokens.append('URL')
            continue
        if not tok.is_alpha:
            continue
        tokens.append(tok.lemma_ if lemma else tok.lower_)
    return tokens


def display_components(model, word_features, top_display=5):
    """Print the highest-weighted words of every component of a topic model.

    Args:
        model: Fitted model exposing ``components_``; iterating it yields one
            weight vector per topic, and each vector must support ``argsort()``.
        word_features: Sequence mapping a feature index to its word.
        top_display: How many words to print per topic, heaviest first.

    Returns:
        None. For each topic, prints a ``Topic <n>:`` header line followed by
        one line with the selected words separated by single spaces.
    """
    for topic_idx, weights in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort is ascending, so reverse it before keeping the leading entries.
        heaviest_first = weights.argsort()[::-1]
        leaders = heaviest_first[:top_display]
        print(" ".join(word_features[i] for i in leaders))
# Smoke test for tokenize(): a messy sample string with punctuation, an
# abbreviation and a contraction. Stop words are kept (nostopwds=False) so the
# displayed list shows every alphabetic token that survives the filters.
text= "Lol, th? oh you got &amp friend for the d?g ?.. U.S. I'm at a  buffet... "
tokenize(text,nostopwds=False)

['lol',
 'th',
 'oh',
 'you',
 'got',
 'amp',
 'friend',
 'for',
 'the',
 'i',
 'at',
 'a',
 'buffet']

In [None]:
# Tokenize every tweet with tokenize()'s defaults (stop words removed, no
# lemmatization); str() guards against non-string cells in the column.
tokenlist = [tokenize(str(d)) for d in  df_tweet['text'] ]
print(len(tokenlist))
# The documents are already token lists, so the vectorizer's tokenizer is the
# identity function and its own lower-casing step is switched off (tokenize()
# already emits lower-cased tokens, apart from the 'URL' placeholder).
cv = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
# NOTE(review): .toarray() turns the sparse count matrix into a dense
# documents-by-vocabulary array, which is memory-hungry. The only use of
# vec_cv visible here is LDA fitting, which presumably accepts the sparse
# matrix — confirm no later cell needs a dense array before removing it.
vec_cv = cv.fit_transform(tokenlist).toarray()

10000


In [None]:
# Number of distinct tokens the CountVectorizer learned from the tokenized tweets.
print(len(cv.vocabulary_))

12574


In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation
# Number of latent topics to extract.
n_components = 20
# LDA fitting is stochastic: without a fixed seed the topic numbering and the
# top words change on every run, so any prose that refers to "Topic N" would
# silently go stale after a Restart & Run All.
RANDOM_SEED = 42
lda = LatentDirichletAllocation(n_components=n_components, random_state=RANDOM_SEED)
# fit_transform returns the per-document topic weights: one row per tweet,
# one column per topic.
lda_vecs = lda.fit_transform(vec_cv)

In [None]:

# components_ holds one weight vector per topic over the whole vocabulary.
print(type(lda.components_),lda.components_.shape)

print('Topic List')
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); older releases only have the former. Use
# whichever exists so this cell runs on both.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
display_components(lda, feature_names)

<class 'numpy.ndarray'> (20, 12574)
Topic List
Topic 0:
day na gon rain today
Topic 1:
love URL cool omg hey
Topic 2:
day bed movie suck tomorrow
Topic 3:
want m bored like URL
Topic 4:
work nice like ya going
Topic 5:
wo outside URL love tonight
Topic 6:
x thank nt working today
Topic 7:
sunday haha URL welcome twitter
Topic 8:
know time great good u
Topic 9:
URL sorry going day got
Topic 10:
good morning know lol right
Topic 11:
sleep looking tired today new
Topic 12:
u nt s love goodnight
Topic 13:
days like amp night sweet
Topic 14:
thanks follow yeah good found
Topic 15:
got love bad home lol
Topic 16:
happy birthday miss wait amp
Topic 17:
sick wish good feel feeling
Topic 18:
fun u like watching time
Topic 19:
URL gone game like head


Judging by the top words of each topic, these are very general, conversational tweets.

In [None]:
# Row indices of the five largest values in column 1 of lda_vecs (argsort is
# ascending, so the last five entries are the largest).
# NOTE(review): this value is neither assigned nor the last expression of the
# cell, so the notebook never shows it — assign it or move it to its own cell
# if the result is wanted.
np.argsort(lda_vecs[:, 1])[-5:]
# Last expression of the cell: displayed as the cell output.
lda_vecs.shape

(10000, 20)

In [None]:
# One label per LDA component ('Topic 0', 'Topic 1', ...), used as column
# names for the per-tweet topic-weight table.
topiclist = ['Topic ' + str(topic_idx) for topic_idx in range(len(lda.components_))]
print(topiclist)

['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3', 'Topic 4', 'Topic 5', 'Topic 6', 'Topic 7', 'Topic 8', 'Topic 9', 'Topic 10', 'Topic 11', 'Topic 12', 'Topic 13', 'Topic 14', 'Topic 15', 'Topic 16', 'Topic 17', 'Topic 18', 'Topic 19']


In [None]:
# Top 10 topics by number of tweets.
# Wrap the per-tweet topic weights in a DataFrame with one labelled column per topic.
tweet_topic = pd.DataFrame(lda_vecs, columns=topiclist)
# NOTE(review): not the last expression of the cell, so this preview is never displayed.
tweet_topic.head()
# For each tweet take the topic with the highest weight, then count how many
# tweets fall under each topic and keep the first ten entries of that count.
tweet_topic.idxmax(axis=1).value_counts()[:10]

Topic 10    705
Topic 0     574
Topic 3     572
Topic 9     560
Topic 15    544
Topic 11    538
Topic 8     515
Topic 18    515
Topic 2     511
Topic 14    504
dtype: int64

Topic 10 is the most prevalent topic; its top words are "good", "morning", "know", "lol" and "right".