# Import Data (replies have already been excluded)

In [150]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords, wordnet
import pandas as pd

# Load the two CSV exports; only `text2` (the transcripts file) is counted below.
text = pd.read_csv('Jan6(excludes replies).csv')
text2 = pd.read_csv('Frey transcripts.csv')
# NOTE(review): the label says "tweets" but the value printed is len(text2),
# i.e. the row count of 'Frey transcripts.csv' - confirm the wording is intended.
print('The number of tweets (excludes replies) before data pre-processing:',len(text2))

The number of tweets (excludes replies) before data pre-processing: 6501


In [161]:
# Discard rows with any missing value, then rename the source columns
# ('tweets', 'Company') to the transcript-specific names used from here on.
# text2 = text2.drop(columns="url")
text2 = text2.dropna().rename(
    columns={'tweets': 'transcripts', 'Company': 'transcript_name'}
)

In [163]:
# Show the frame after dropna/rename to confirm the new column names.
text2

Unnamed: 0.1,Unnamed: 0,transcripts,date,transcript_name
0,0.0,Scene 4\n\nLars: [00:15:35] 35. [00:15:30]\n\n...,1/5/21 15:59,transcript 1
1,1.0,scene_2.wav\nElizabeth: [00:00:01] This is my ...,12/30/20 23:14,transcript 2
2,2.0,scene_3.wav\nLindsey: [00:00:03] Before I was ...,12/23/20 19:00,transcript 3
3,4.0,Scene 4\n\nLars: [00:15:35] 35. [00:15:30]\n\n...,12/16/20 21:26,transcript 4
4,5.0,Scene 5\n\nElizabeth: [00:21:32] So [00:21:30]...,12/15/20 20:16,transcript 5
5,6.0,Scene 6\n\nMaynard: [00:26:28] I noticed water...,12/10/20 0:24,transcript 6
6,8.0,"Scene 7\n\nLars: [00:28:45] Hey, [00:28:30] gu...",12/8/20 0:13,transcript 7
7,9.0,Scene 8\n\nLindsey: [00:31:10] I [00:31:00] ju...,12/5/20 14:13,transcript 8
8,10.0,Scene 9\n\nLindsey: [00:32:51] Probably. Just ...,12/3/20 23:32,transcript 9
9,11.0,"Scene 10\n\nLindsey: [00:36:15] Oh, yeah.\n\nL...",11/26/20 22:05,transcript 10


In [4]:
from lda import guidedlda

# Data Cleaning
1. tokenization
2. remove @users, hashtag symbols, URLs, special symbols (e.g., '&amp'), non-alphabetic characters, and words that have fewer than 3 characters
3. remove stopwords
4. lowercase transformation
5. stemming

In [165]:
import gensim
import gensim.corpora as corpora
import re
porter = PorterStemmer()
stop_words = stopwords.words('english')

# Set copy of the stopword list: membership tests are O(1) instead of a list scan per token.
_STOP_WORD_SET = set(stop_words)
# Anything that is not an ASCII letter is turned into a space (so "don't" -> "don t").
_NON_ALPHA = re.compile(r'[^a-zA-Z]')
# A whitespace-delimited chunk containing any of these is dropped entirely:
# @mentions, URLs and HTML-escaped ampersands.
_DROP_MARKERS = ('@', 'http', '&amp')


def clean_transcript(raw_text):
    """Return the cleaned, stemmed form of one transcript as a space-joined string.

    Steps, in order:
      1. split on whitespace (tokenization);
      2. strip '#' symbols, then drop chunks containing '@', 'http' or '&amp';
      3. replace non-alphabetic characters with spaces and lowercase;
      4. drop tokens shorter than 3 characters and English stopwords;
      5. Porter-stem what is left.

    Parameters
    ----------
    raw_text : object
        The raw transcript; it is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        Stems joined by single spaces (no trailing space); ``''`` when nothing survives.
    """
    stems = []
    for chunk in str(raw_text).split():
        chunk = chunk.replace('#', '')
        if any(marker in chunk for marker in _DROP_MARKERS):
            continue
        # Replacing punctuation with spaces can split one chunk into several tokens,
        # so the length/stopword filters are applied per resulting token.
        for token in _NON_ALPHA.sub(' ', chunk).lower().split():
            if len(token) > 2 and token not in _STOP_WORD_SET:
                stems.append(porter.stem(token))
    return ' '.join(stems)


text2['processed'] = text2['transcripts'].map(clean_transcript)

# Exclude documents that are empty after cleaning or that contain a token from
# the non-English list below.
# NOTE(review): the list was written for tweets; entries such as 'dan', 'erik' or
# 'para' could presumably also match names/stems in English transcripts - confirm
# that dropping those transcripts is intended.
non_english_list = ['temiz','rkiy','erik','nda','konu','dan','da','ba','al','viand','para','na','dann','uft','laboratorio','dieser','kalbimi',
                   'restoranda','evento','komo','ind','tica','futuro','sonra','yla','cre','ili','daki',
                   'zaman']
_non_english_tokens = set(non_english_list)


def _is_excluded(processed):
    """Return True when a cleaned document is empty or contains a listed non-English token."""
    return len(processed) == 0 or not _non_english_tokens.isdisjoint(processed.split())


# Boolean mask aligned with the rows of text2. The previous version collected index
# *labels* and then used them as *positions* (text2.index[labels]), which targets the
# wrong rows (or raises IndexError) as soon as the index has gaps, e.g. after dropna().
drop_mask = text2['processed'].map(_is_excluded).to_numpy(dtype=bool)
index_axis = list(text2.index[drop_mask])  # labels of the dropped rows, kept for inspection
text2 = text2[~drop_mask].copy()
print("number of tweets after cleaning:",len(text2))

number of tweets after cleaning: 14


In [167]:
# Rows per transcript name, largest groups first.
(
    text2
    .groupby('transcript_name')['transcripts']
    .count()
    .sort_values(ascending=False)
)

transcript_name
transcript 1     1
transcript 10    1
transcript 11    1
transcript 12    1
transcript 13    1
transcript 14    1
transcript 2     1
transcript 3     1
transcript 4     1
transcript 5     1
transcript 6     1
transcript 7     1
transcript 8     1
transcript 9     1
Name: transcripts, dtype: int64

# Randomly Select 20% of the Dataset as Our Training Set

In [168]:
# Draw a reproducible 20% sample of the cleaned rows (fixed random_state).
# With the 14 cleaned transcripts reported above this yields only 3 documents.
text_random_20percent = text2.sample(frac=0.2, random_state=2022)

In [170]:
# Rows per transcript name within the 20% sample, largest groups first.
(
    text_random_20percent
    .groupby('transcript_name')['transcripts']
    .count()
    .sort_values(ascending=False)
)

transcript_name
transcript 10    1
transcript 4     1
transcript 6     1
Name: transcripts, dtype: int64

# Generate Bigrams

In [171]:
from gensim.models import CoherenceModel
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw), deacc=True)
        for raw in sentences
    )
# Tokenize the sampled, already-cleaned documents into lists of words.
data_words = list(sent_to_words(text_random_20percent['processed']))

# min_count: two unigrams must co-occur at least this many times to be considered.
# threshold: Phrases computes a phrase score; it decides whether two unigrams
#            are regarded as a bigram.
bigram = gensim.models.Phrases(data_words, min_count=1, threshold=1)

# Phraser wrapper built from the trained Phrases model.
bigram_mod = gensim.models.phrases.Phraser(bigram)
def make_bigrams(texts):
    """Merge detected bigrams in each tokenized document.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents.

    Returns
    -------
    list
        One transformed token list per input document, in input order.

    Notes
    -----
    Uses the exported ``bigram_mod`` phraser (previously built but never used)
    instead of the full trainable ``bigram`` model; the exported object is the
    one intended for applying already-learned phrases.
    """
    return [bigram_mod[doc] for doc in texts]
# Apply the bigram model, then build the gensim dictionary and the
# bag-of-words corpus from the resulting token lists.
data_words_bigrams = make_bigrams(data_words)
texts = data_words_bigrams
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))


def compute_coherence_values(dictionary, corpus, texts, limit, start, step, random_state=2022):
    """Train one LDA model per topic count and score each with u_mass coherence.

    Parameters
    ----------
    dictionary : gensim dictionary
        Id-to-word mapping; used both to train each model and to score it.
        (Previously the global ``id2word`` was used for training and this
        argument was ignored there.)
    corpus : list
        Bag-of-words corpus passed to ``LdaModel``.
    texts : list of list of str
        Tokenized documents passed to ``CoherenceModel``.
    limit, start, step : int
        Topic counts tried are ``range(start, limit, step)`` (``limit`` exclusive).
    random_state : int, optional
        Seed forwarded to ``LdaModel``; defaults to the value that was hard-coded before.

    Returns
    -------
    (list, list)
        The trained models and their coherence scores, in the same order as the
        topic counts tried.
    """
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            random_state=random_state,
        )
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='u_mass')
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values
# Topic counts to try: range(start, limit, step), i.e. 10 through 20.
start, limit, step = 10, 21, 1
model_list, coherence_values = compute_coherence_values(
    dictionary=id2word,
    corpus=corpus,
    texts=texts,
    start=start,
    limit=limit,
    step=step,
)
# Show graph
# import matplotlib.pyplot as plt
# x = range(start, limit, step)
# plt.plot(x, coherence_values,label='20% random tweets')
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(loc='best')
# plt.xticks(range(start,limit,step))
# plt.show()

INFO:gensim.models.phrases:collecting all words and their counts
INFO:gensim.models.phrases:PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO:gensim.models.phrases:collected 791 token types (unigram + bigrams) from a corpus of 518 words and 3 sentences
INFO:gensim.models.phrases:merged Phrases<791 vocab, min_count=1, threshold=1, max_vocab_size=40000000>
INFO:gensim.utils:Phrases lifecycle event {'msg': 'built Phrases<791 vocab, min_count=1, threshold=1, max_vocab_size=40000000> in 0.01s', 'datetime': '2022-07-18T19:45:57.546822', 'gensim': '4.2.0', 'python': '3.9.7 (default, Sep 16 2021, 08:50:36) \n[Clang 10.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}
INFO:gensim.models.phrases:exporting phrases from Phrases<791 vocab, min_count=1, threshold=1, max_vocab_size=40000000>
INFO:gensim.utils:FrozenPhrases lifecycle event {'msg': 'exported FrozenPhrases<22 phrases, min_count=1, threshold=1> from Phrases<791 vocab, min_count=1, threshold=1, max_v

# GuidedLDA Model
Latent topics are identified from the bigrams shown above.

In [172]:
# Seed vocabulary for GuidedLDA: one list of seed terms per topic. The position
# of each inner list is the topic id assigned by the seeding loop further down.
# Terms are written as stems so they can match the stemmed `processed` text.
# NOTE(review): several entries look misspelled ('ideaolog', 'masectomi',
# 'carcigen', 'migran', 'thougt', 'forethougt', 'redifin') - confirm the intended stems.
# NOTE(review): entries containing spaces or punctuation ('big c', 'self car',
# 'mental health', 'm.d.', 'c.t.', 'well-b', 'self-esteem') presumably can never
# match, because cleaning replaces non-alphabetic characters and the vocabulary
# comes from a default CountVectorizer - confirm.
# NOTE(review): some stems appear under more than one topic (e.g. 'focu', 'posit',
# 'confid', 'bodi', 'heal', 'heartbeat'); the seeding loop stores one topic id per
# word, so the last topic listing a word wins.
seed_words = [['god','hope','pray','believ','trust','church','convict','optim','religion','ideaolog','confid','group','relationship',
      'mass','synagogu','anticip','futur','strong','lord','readi','wait','focu','posit','plan','support',
      'desir','accept','loyal','truth','assent','assur','constant','credenc','credul',
      'depend','fealti','relianc','sure','sureti','troth'],#0 Faith
      ['tumor','radiolog','chemotherapi','benign','invas','masectomi','surgeri','malign','metastasi','melanoma',
       'carcigen','cancer','precancer','survivor','diseas','sick','ill','spread','lump','cell','grow','neck','bodi',
       'organ','lung','heal','emerg','oncologist','prescript','m.d.','test','pass','prognosi','migran','seizur','c.t.',
       'scan','imag','heartbeat','diagnosi','biopsi','remov','carcinoma','big c'],#1 Cancer
      ['earth','sun','sky','star','tree','flower','land','sea','ocean','lake','river','rain','storm','thunder',
       'lighten','snow','sunset','sunris','outsid','leaf','trek','wilder','anim','stream','rock','bolt','flood',
       'weather','water','hurrican','morn','night','world','life','environ','landscap','view','cosmo','countri','forest',
       'macrocosm','outdoor','sceneri','seascap',
       'set','univers','natur'],#2 Nature
      ['reflect','care','self car','health','consider','conscienti','regard','thought','thougt','well',
       'well-b','feel','mental health','introspect','meditat','center','concentr','breath','relax','sit','focu',
       'yoga','bodi','hardship','redifin','experi','sign','know','heal',
       'posit','mind','deep','bodi','sens','tension','paus','notic','heartbeat',
       'heart','check','self-esteem','confid','alert','care',
       'concern','direct','forethougt',
       'head','heed','interest','pain','regard']]#3 Mindfulness

In [173]:


from sklearn.feature_extraction.text import CountVectorizer
from lda import guidedlda as guidedlda
import numpy as np
# Guided LDA with 4 topics, matching the 4 seed lists in `seed_words`.
model = guidedlda.GuidedLDA(n_topics=4,n_iter=1000,random_state=2022,refresh=10,alpha=0.01,eta=0.01)

# Document-term count matrix over the cleaned, stemmed transcripts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(text2['processed'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old releases that do not have it yet.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# Map each seed word found in the vocabulary to its topic id. When a word is
# listed under several topics the last listing wins (one id per word).
word2id = {term: idx for idx, term in enumerate(vocab)}
seed_topics = {}
skipped_seeds = []
for t_id, st in enumerate(seed_words):
    for word in st:
        if word in word2id:
            seed_topics[word2id[word]] = t_id
        else:
            skipped_seeds.append(word)
# One summary line instead of one printed line per missing seed word.
if skipped_seeds:
    print(len(skipped_seeds), 'seed words not in the vocabulary were skipped:', ', '.join(skipped_seeds))

model.fit(X.toarray(),seed_topics=seed_topics,seed_confidence=0.7) #set seed confidence to 0.7
topic_word = model.topic_word_
n_top_words = 20
vocab = tuple(vocab)

for i, topic_dist in enumerate(topic_word): #Print out results
    print('\n')
    print('Topic:',i)
    # Indices of the n_top_words most probable words, most probable first;
    # computed once per topic rather than re-sorting for every printed word.
    top_word_ids = np.argsort(topic_dist)[:-(n_top_words+1):-1]
    for word_id in top_word_ids:
        print(round(topic_dist[word_id],4),'*',vocab[word_id],sep='',end='  ')

INFO:lda:n_documents: 14
INFO:lda:vocab_size: 732
INFO:lda:n_words: 2250
INFO:lda:n_topics: 4
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -20833
INFO:lda:<10> log likelihood: -17102
INFO:lda:<20> log likelihood: -16983
INFO:lda:<30> log likelihood: -16908
INFO:lda:<40> log likelihood: -16839
INFO:lda:<50> log likelihood: -16830
INFO:lda:<60> log likelihood: -16765


believ  skipped
trust  skipped
church  skipped
convict  skipped
optim  skipped
religion  skipped
ideaolog  skipped
mass  skipped
synagogu  skipped
anticip  skipped
lord  skipped
accept  skipped
loyal  skipped
truth  skipped
assent  skipped
assur  skipped
constant  skipped
credenc  skipped
credul  skipped
depend  skipped
fealti  skipped
relianc  skipped
sure  skipped
sureti  skipped
troth  skipped
radiolog  skipped
benign  skipped
invas  skipped
masectomi  skipped
malign  skipped
metastasi  skipped
melanoma  skipped
carcigen  skipped
precancer  skipped
survivor  skipped
diseas  skipped
sick  skipped
ill  skipped
spread  skipped
lump  skipped
cell  skipped
organ  skipped
heal  skipped
m.d.  skipped
migran  skipped
c.t.  skipped
carcinoma  skipped
big c  skipped
sky  skipped
tree  skipped
sea  skipped
ocean  skipped
lake  skipped
storm  skipped
thunder  skipped
lighten  skipped
snow  skipped
sunset  skipped
leaf  skipped
trek  skipped
wilder  skipped
anim  skipped
stream  skipped
bolt  sk

INFO:lda:<70> log likelihood: -16716
INFO:lda:<80> log likelihood: -16690
INFO:lda:<90> log likelihood: -16729
INFO:lda:<100> log likelihood: -16693
INFO:lda:<110> log likelihood: -16721
INFO:lda:<120> log likelihood: -16665
INFO:lda:<130> log likelihood: -16581
INFO:lda:<140> log likelihood: -16576
INFO:lda:<150> log likelihood: -16526
INFO:lda:<160> log likelihood: -16543
INFO:lda:<170> log likelihood: -16424
INFO:lda:<180> log likelihood: -16445
INFO:lda:<190> log likelihood: -16427
INFO:lda:<200> log likelihood: -16490
INFO:lda:<210> log likelihood: -16416
INFO:lda:<220> log likelihood: -16406
INFO:lda:<230> log likelihood: -16360
INFO:lda:<240> log likelihood: -16357
INFO:lda:<250> log likelihood: -16391
INFO:lda:<260> log likelihood: -16384
INFO:lda:<270> log likelihood: -16371
INFO:lda:<280> log likelihood: -16357
INFO:lda:<290> log likelihood: -16377
INFO:lda:<300> log likelihood: -16385
INFO:lda:<310> log likelihood: -16347
INFO:lda:<320> log likelihood: -16319
INFO:lda:<330> 



Topic: 0
0.0631*right  0.057*yeah  0.0306*guid  0.0285*lar  0.0285*phil  0.0285*good  0.0285*come  0.0265*go  0.0244*guy  0.0224*one  0.0183*first  0.0163*okay  0.0143*put  0.0122*hope  0.0122*well  0.0122*let  0.0122*spot  0.0102*god  0.0102*start  0.0102*look  

Topic: 1
0.0581*elizabeth  0.0291*breath  0.0218*kid  0.017*take  0.017*part  0.017*move  0.0145*call  0.0145*justin  0.0121*huge  0.0121*nurs  0.0121*agenc  0.0121*week  0.0121*adopt  0.0121*kind  0.0097*last  0.0097*help  0.0097*sam  0.0097*keep  0.0097*saw  0.0097*long  

Topic: 2
0.0335*know  0.0287*like  0.0192*get  0.0192*year  0.0192*think  0.018*cancer  0.018*deni  0.0168*one  0.0144*life  0.0132*two  0.0132*anoth  0.0132*back  0.012*four  0.012*feel  0.012*lot  0.012*lar  0.012*scene  0.012*thing  0.0108*could  0.0108*diagnos  

Topic: 3
0.0519*lindsey  0.039*realli  0.0315*see  0.0278*would  0.026*like  0.0167*talk  0.0167*roy  0.0149*come  0.0149*peopl  0.0149*love  0.0149*mirror  0.0149*give  0.013*chang  0.013*

In [116]:
# import lda.datasets as gldad
# Y = gldad.load_reuters()
# vocab_y = gldad.load_reuters_vocab()

# word2id_y = dict((z, idy) for idy, z in enumerate(vocab_y))
# print(Y[:10])

# Y = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
# vocab_y = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)

In [124]:
# type(vocab_y)

In [125]:
# type(model)

In [115]:
# for t_id, st in enumerate(bigramseed):
#     print(t_id,st)

In [114]:
# print(X)

In [113]:
# for t_id, st in enumerate(bigramseed):
#     for word in st:
#         try:
#             seed_topics[word2id[word]] = t_id
#             print(seed_topics)
#         except:
#             print(word, "skipped")

In [112]:
# for idx, v in enumerate(vocab):
#     print("index is %d and value is %s" % (idx, v))

In [126]:
# print (list(enumerate(bigramseed)))

# Proportion of Each Topic

In [174]:
# Document-topic distribution for every transcript. It is computed here so this
# cell does not rely on a later cell having been executed first (the only other
# assignment of `doc_topic` is further down the notebook).
doc_topic = model.transform(X)

# Dominant topic per document and the probability of that topic.
topic_number = list(doc_topic.argmax(axis=1))
topic_probability = list(doc_topic.max(axis=1))
# Constant marker column, kept as the string '1' as before, used for counting.
number = ['1'] * len(topic_number)

data = pd.DataFrame({'topic_number': topic_number, 'number': number})
# number_of_tweets = pd.DataFrame(data.groupby('topic_number')['number'].count())
# number_of_tweets['proportion'] = [str(round(i/len(text)*100,2))+'%' for i in number_of_tweets['number']]
# number_of_tweets['Topic'] = ['Manufacturing process','Seafood','Meat product','Sustainability',
#                                  'Alternative protein','Animal welfare','Health and nutrition','Industry and market',
#                                  'Fundraising','Event promotion and media release','Hiring information',
#                                  'Regulation','Unseeded topic1','Unseeded topic2']
# number_of_tweets

# Company-Topic Heatmap
Based on proportion of each topic for each company

In [175]:
# Infer the document-topic distribution and record each transcript's
# most probable topic id in a new 'topic number' column.
doc_topic = model.transform(X)
topic_number1 = list(doc_topic.argmax(axis=1))
text2['topic number'] = topic_number1
# topic_author = text2.groupby(['topic number','Company'])['tweets'].count()
# topic_author_3d = topic_author.unstack()
# topic_author_3d = topic_author_3d[['Memphis Meats','biftek.co 🔬👩‍🔬🐄🥗','Aleph Farms','SuperMeat',
#                                   'Finless Foods','shiokmeats','BlueNalu','New Age Meats','CUBIQ FOODS',
#                                   'Mosa Meat','Wildtype','Meatable','Future Fields','Vow',
#                                   'FutureMeat','Balletic Foods','LabFarmFoods','Avant Meats','Mission Barns']]
# topic = ['Manufacturing process and supplies','Seafood product','Meat product','Sustainability',
#         'Animal welfare','Alternative protein','Health and nutrition','Regulation','Industry and market','Fundraising',
#         'Hiring Information','Event promotion and media release','Unseeded topic 1','Unseeded topic 2']

# company = ['UPSIDE Foods','Biftek.co','Aleph Farms','SuperMeat','Fineless Foods','Shiok Meats','BlueNalu',
#           'New Age Meats','Cubiq Foods','Mosa Meat','Wild Type','Meatable','Future Fields','Vow','Future Meat',
#            'Balletic Foods','Lab Farm Foods','Avant Meats','Mission Barns']
# topic_author_3d = topic_author_3d.fillna(0)#replace NaN by 0
# topic_author_3d = topic_author_3d.reindex([0,1,2,3,5,4,6,11,7,8,10,9,12,13])# reindex
# import matplotlib.pyplot as plt
# plt.figure(figsize=(11,8))
# plt.imshow(topic_author_3d.div(topic_author_3d.sum(axis=0),axis=1),cmap="Blues")
# plt.colorbar().ax.set_ylabel('Proportion of each topic for each company')
# plt.xticks(range(len(company)), company,rotation=90)
# plt.yticks(range(len(topic)), topic)
# plt.xlabel('Company')
# plt.ylabel('Topic')
# plt.show()

In [127]:
# topic_number1

In [176]:
# Show the frame with the added 'processed' and 'topic number' columns.
text2

Unnamed: 0.1,Unnamed: 0,transcripts,date,transcript_name,processed,topic number
0,0.0,Scene 4\n\nLars: [00:15:35] 35. [00:15:30]\n\n...,1/5/21 15:59,transcript 1,scene lar lar know bag everyth yeah yeah one t...,2
1,1.0,scene_2.wav\nElizabeth: [00:00:01] This is my ...,12/30/20 23:14,transcript 2,scene wav elizabeth case run realli fast away ...,1
2,2.0,scene_3.wav\nLindsey: [00:00:03] Before I was ...,12/23/20 19:00,transcript 3,scene wav lindsey diagnos still colleg found t...,1
3,4.0,Scene 4\n\nLars: [00:15:35] 35. [00:15:30]\n\n...,12/16/20 21:26,transcript 4,scene lar lar know bag everyth yeah yeah one t...,2
4,5.0,Scene 5\n\nElizabeth: [00:21:32] So [00:21:30]...,12/15/20 20:16,transcript 5,scene elizabeth saw hematologist oncologist fi...,0
5,6.0,Scene 6\n\nMaynard: [00:26:28] I noticed water...,12/10/20 0:24,transcript 6,scene maynard notic water reflect light onto s...,3
6,8.0,"Scene 7\n\nLars: [00:28:45] Hey, [00:28:30] gu...",12/8/20 0:13,transcript 7,scene lar hey guy good job welcom camp right r...,0
7,9.0,Scene 8\n\nLindsey: [00:31:10] I [00:31:00] ju...,12/5/20 14:13,transcript 8,scene lindsey see make feel big yet small roy ...,3
8,10.0,Scene 9\n\nLindsey: [00:32:51] Probably. Just ...,12/3/20 23:32,transcript 9,scene lindsey probabl tri build fire get done ...,3
9,11.0,"Scene 10\n\nLindsey: [00:36:15] Oh, yeah.\n\nL...",11/26/20 22:05,transcript 10,scene lindsey yeah lindsey went trip septemb s...,3
