In [5]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer


# Step 1 - Extract embeddings
# Sentence encoder used later to embed the tweets and passed to BERTopic.
# The two commented-out lines are alternative checkpoints kept for reference;
# only the last assignment is active.
# NOTE(review): the cell output reports "No sentence-transformers model found ...
# Creating a new one with MEAN pooling" for this checkpoint, so the embeddings
# come from a mean-pooling wrapper built on load — confirm this is intended.
#embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
#embedding_model = SentenceTransformer("vinai/bertweet-base")
embedding_model = SentenceTransformer('digio/Twitter4SSE')


No sentence-transformers model found with name /Users/jpcointet/.cache/torch/sentence_transformers/digio_Twitter4SSE. Creating a new one with MEAN pooling.


In [6]:
import pandas as pd


In [7]:
# Column -> dtype mapping handed to pd.read_csv: every identifier column is read as str.
dtype_dict = {
    column: str
    for column in ('user_id', 'retweeted_id', 'retweeted_user_id', 'id')
}


In [8]:
df=pd.read_csv('../data/tweets_of_oncologist_and_advocacy.csv',dtype=dtype_dict)

In [9]:
dico_user_id_2_status=dict(zip(df.user_id,df.User_status))

In [10]:
df['retweeted_user_status']=df['retweeted_user_id'].map(dico_user_id_2_status.get)

In [11]:
df['retweeted_user_status'].value_counts()

Oncologist    206305
Advocacy       81894
Name: retweeted_user_status, dtype: int64

In [12]:
df['retweeted_id'].isna().sum()/len(df)

0.6091005822333654

In [13]:
df['User_status'].value_counts()

Oncologist    2208196
Advocacy      1719267
Name: User_status, dtype: int64

In [14]:
df_adv=df[df['User_status']=='Advocacy']#.sample(50000)
df_onc=df[df['User_status']=='Oncologist']#.sample(50000)

In [15]:
#df_adv.retweeted_user_status.value_counts()

In [16]:
df[['User_status','retweeted_user_status']]

Unnamed: 0,User_status,retweeted_user_status
0,Advocacy,
1,Advocacy,
2,Advocacy,
3,Advocacy,Oncologist
4,Advocacy,
...,...,...
3927458,Advocacy,
3927459,Advocacy,
3927460,Advocacy,
3927461,Advocacy,


In [17]:
print(len(df[(df['User_status']=='Advocacy')]))
df_adv=df[(df['User_status']=='Advocacy')].sample(100000,random_state=23)# | (df['retweeted_user_status']=='Advocacy')].sample(50000)


1719267


In [18]:
df_adv=df_adv[df_adv.retweeted_user_id.isna()].sample(50000,random_state=23)

In [19]:
df_adv.User_status.value_counts()

Advocacy    50000
Name: User_status, dtype: int64

In [20]:
onc_retweeted_ids=df[(df['retweeted_user_status']=='Advocacy') & (df['User_status']=='Oncologist')]['retweeted_id'].values


In [21]:
len(df_onc),len(onc_retweeted_ids)

(2208196, 20894)

In [22]:
# Lookup table: id of every retweeted tweet collected in onc_retweeted_ids -> True.
dico_id = dict.fromkeys(onc_retweeted_ids, True)

In [23]:
df_onc.id

576727     1624476782610055170
576728     1624373977400459265
576729     1624080441094426624
576730     1623639743287230465
576731     1619359531527704582
                  ...         
3538700     773612485198831616
3538701     773458402173394944
3538702     773456920086777856
3538703     773456485783375873
3538704     773455874866151425
Name: id, Length: 2208196, dtype: object

In [24]:
df_adv[df_adv['id'].isin(onc_retweeted_ids)]

Unnamed: 0,user_id,user_screen_name,text,User_status,retweeted_id,retweeted_user_id,user_description,sum_nb_biom,id,timestamp_utc,...,EXON,EGFR,KRAS,NTRK,BRAF,MET,RET,HER2,nb_of_biomarker,retweeted_user_status
3892201,49416874,JFreemanDaily,Reminder to #ASCO22 tweeps: pls include #lcsm ...,Advocacy,,,"Writer, speaker, science geek. Lung cancer pat...",414.0,1533088158849462282,1654351675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3873873,14262217,stales,cc: @jodyms @drattai we are official now!!! (@...,Advocacy,,,"Co-founder of #BCSM, POWER TWEETER, eternal op...",289.0,253178426947690496,1349197415,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2902597,73436353,itsnot_pink,Now THIS would make more sense! « ogcancerpati...,Advocacy,,,She/Her. Living w #MetastaticBC since '13 & ad...,11.0,1395088550253711372,1621450004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2879227,32530883,MyelomaTeacher,Follow @MMSMChats for updates on upcoming #Mye...,Advocacy,,,Retired Teacher- Dx with Myeloma 2008- Patient...,10.0,435998673021833216,1392785158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3563770,24341364,bonniejaddario,A very successful Living Room in Mexico! Hoste...,Advocacy,,,"Activist, advocate, educator and change agent ...",53.0,1184931492725583872,1571344657,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893285,49416874,JFreemanDaily,Targeted Therapies in Lung Cancer (#TTLC20) 20...,Advocacy,,,"Writer, speaker, science geek. Lung cancer pat...",414.0,1230478945331666946,1582204016,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
2875697,32530883,MyelomaTeacher,Listen to @theMMRF Highlights of #ASH14 teleco...,Advocacy,,,Retired Teacher- Dx with Myeloma 2008- Patient...,10.0,562991600184619010,1423062632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3065634,588584827,1111linno,And I was that patient who connected with @DrM...,Advocacy,,,"Animal, activist, artist. Becoming a revolutio...",18.0,887858003965132800,1500516818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
3921312,793207603291500544,ivybelkins,"Dave Messi, @IASLC CEO, talks about the #LungA...",Advocacy,,,"Co-Founder, @EGFRResisters LC Patient Group; w...",841.0,1170392449048621064,1567878279,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,


In [25]:
df_adv['star']=df_adv.id.map(dico_id.get)

In [26]:
df_adv.star.value_counts()

True    772
Name: star, dtype: int64

In [27]:
len(df_adv)

50000

In [58]:
df_adv=df_adv.sample(10000,random_state=23)

In [59]:
df_adv_s=df_adv.drop_duplicates(subset=['text'])
len(df_adv_s)

9991

In [60]:
dfl=df.sample(4000,random_state=23)

In [61]:
from bertopic import BERTopic


In [62]:
import cleantext
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords 
# create a function for the tweet tokenizer from NLTK
# The tokenizer and the mention pattern are built once here instead of on every
# call: tok() is applied to each tweet and is also used as a CountVectorizer
# tokenizer, so it runs once per document.
_TWEET_TOKENIZER = TweetTokenizer()
_MENTION_PATTERN = re.compile("@[A-Za-z0-9_]+")


def tok(text):
    """Tokenize one tweet into a list of tokens.

    URLs are removed with ``cleantext.replace_urls``, ``@mentions`` are
    stripped, and the remainder is split with NLTK's ``TweetTokenizer``.
    Only tokens whose length is strictly between 2 and 30 characters are kept.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        The retained tokens, in their original order.
    """
    without_urls = cleantext.replace_urls(text, replace_with="")
    without_mentions = _MENTION_PATTERN.sub("", without_urls)
    return [token for token in _TWEET_TOKENIZER.tokenize(without_mentions)
            if 2 < len(token) < 30]


In [63]:
import re
# Strip only the '#' character; the word that follows it is kept.
def remove_hashtag_sign(text):
    """Return ``text`` with every '#' character removed (e.g. '#bcsm' -> 'bcsm')."""
    return re.compile(r'#').sub('', text)

#dfl['text_clean'] = dfl['text'].apply(lambda x:remove_hashtag_sign(x))
#Remove extra white spaces, punctuation and apply lower casing
#dfl['text_clean'] = dfl['text_clean'].str.lower().str.replace('[^\w\s]',' ').str.replace('\s\s+', ' ')
dfl['text_clean']=dfl['text'].apply(tok)
dfl.head()



Unnamed: 0,user_id,user_screen_name,text,User_status,retweeted_id,retweeted_user_id,user_description,sum_nb_biom,id,timestamp_utc,...,EGFR,KRAS,NTRK,BRAF,MET,RET,HER2,nb_of_biomarker,retweeted_user_status,text_clean
3315684,3281500212,BreastAdvocate,Are breast self exams still worth my time? ht...,Advocacy,,,Free #BreastCancer #Surgery #SharedDecisionMak...,32.0,1611467691432214528,1673038811,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"[Are, breast, self, exams, still, worth, time,..."
932198,320434896,IanJPereira,RT @amcunningham: @waisunchan no gender or soc...,Oncologist,4.611681628309545e+17,14925532.0,Resident #RadOnc | World Med Association Junio...,3.0,461168766404276224,1398786176,...,,,,,,,,,,"[gender, socio, economic, differences, PQA, pe..."
3252467,23389575,jodyms,"Safe travels Liza Bernstein @itsthebunk Hi, wo...",Advocacy,,,"Writer, blogger. Optimist. Cancer Advocate. Wo...",28.0,367089272546787328,1376355877,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"[Safe, travels, Liza, Bernstein, wonderful, #b..."
3648398,904861723,abcdiagnosis,Well done Charlie! Hope all went well 😊🚴🏼@pati...,Advocacy,,,Founder of After Breast Cancer Diagnosis & liv...,60.0,754949405090910208,1468828939,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"[Well, done, Charlie, Hope, all, went, well]"
3785501,54608984,bjork5,@THEACTUALDANCE happy birthday Sam!,Advocacy,,,Patient Advocate. Lung Cancer Survivor. Resear...,107.0,623315344254476288,1437444933,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,"[happy, birthday, Sam]"


In [64]:
dfl['len']=dfl.text_clean.apply(len)

In [65]:
dfl=dfl[dfl.len>=9]
len(dfl)

2959

In [66]:
#df_adv.text.values

In [67]:
df_adv_s=dfl.dropna(subset='text_clean')#df_adv_s.dropna(subset='text')
corpus= list(df_adv_s.text.values)

len(corpus)

2959

In [68]:
#df_adv_s_export=df_adv_s[df_adv_s['text'].isin(docs)]

In [69]:
#df_adv_s_export.to_csv('sample_10k_text_espadon_advocacy.csv')
df_adv_s.to_csv('sample_5k_text_espadon_advocacy_.csv')

In [70]:
dico_doc_2_author=dict(zip(df_adv_s.text,df_adv_s.user_screen_name))

In [71]:
#topic_model = BERTopic(embedding_model=embedding_model)
#topic_model.fit_transform(corpus[:])
len(corpus)

2959

In [88]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the object imported on the
# previous line to the list of English stop words. Any later re-run of a
# `from nltk.corpus import stopwords` line (the tokenizer cell has one) rebinds it
# back, after which `stop_words=stopwords` arguments would no longer receive the list.
stopwords = stopwords.words('english') 

# Define sub-models
vectorizer = CountVectorizer(ngram_range=(1,1),stop_words=stopwords)#,vocabulary=vocab_filtered)
#tfidf = TfidfVectorizer(vocabulary=vocab_filtered)
#tfidf = TfidfVectorizer(vocabulary=vocab_filtered, ngram_range=(1,2))#, tokenizer=textblob_tokenizer)
#tfidf = TfidfVectorizer( ngram_range=(1,2))

umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=12, min_samples=2, metric='euclidean', prediction_data=True,cluster_selection_method='eom')
#cluster_model = KMeans(n_clusters=150)

# Train our topic model with BERTopic

In [73]:
#docs=corpus[:10000]
docs=corpus

In [74]:

#trump.text = trump.apply(lambda row: re.sub(r"http\S+", "", row.text).lower(), 1)
#trump.text = trump.apply(lambda row: " ".join(filter(lambda x:x[0]!="@", row.text.split())), 1)
#trump.text = trump.apply(lambda row: " ".join(re.sub("[^a-zA-Z]+", " ", row.text).split()), 1)


In [75]:
embeddings = embedding_model.encode(docs,show_progress_bar=True)


Batches:   0%|          | 0/93 [00:00<?, ?it/s]

In [76]:
#stsb-distilbert-base

In [89]:
vectorizer = CountVectorizer(ngram_range=(1,2))
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer).fit(docs, embeddings)

In [90]:
#from scipy.cluster import hierarchy as sch
#linkage_function = lambda x: sch.linkage(x, 'single')#, optimal_ordering=True)
#hierarchical_topics = topic_model.hierarchical_topics(docs,linkage_function=linkage_function)
#topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics,linkage_function=linkage_function)#,topics=topic_model.topics_)


In [91]:
#topic_model.reduce_topics(docs, nr_topics=100)#"auto")


In [92]:
legend=topic_model.get_topic_info()
legend

Unnamed: 0,Topic,Count,Name
0,-1,964,-1_the_https_to_com
1,0,146,0_in_of_http_al
2,1,112,1_de_la_el_que
3,2,102,2_bcsm_to_hcldr_it
4,3,99,3_chat_join_gyncsm_patientchat
...,...,...,...
61,60,12,60_teampatchett_non fiction_fiction_they don
62,61,12,61_ick_travel_000 hrs_ick ick
63,62,12,62_doctorow_nature_status 1388616913211695109_...
64,63,12,63_dustinpenner25_was_markwarner_1495473199018...


In [94]:
 
vectorizer_model2 = CountVectorizer(ngram_range=(1,2),tokenizer=tok,stop_words=stopwords,min_df=10)
topic_model.update_topics(docs, vectorizer_model=vectorizer_model2)

In [95]:
# Generate nicer looking labels and set them in our model
topic_labels = topic_model.generate_topic_labels(nr_words=10,
                                                 topic_prefix=True,
                                                 word_length=20,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)

In [96]:

legend=topic_model.get_topic_info()
legend

Unnamed: 0,Topic,Count,Name,CustomName
0,-1,964,-1_cancer_patients_research_...,"-1, cancer, patients, research, ..., risk, #ca..."
1,0,146,0_#mmsm_multiple_#myeloma_myeloma,"0, #mmsm, multiple, #myeloma, myeloma, study, ..."
2,1,112,1_per_via_hospital_...,"1, per, via, hospital, ..., 2021, #covid19, re..."
3,2,102,2_#bcsm_young_need_better,"2, #bcsm, young, need, better, would, diagnosi..."
4,3,99,3_join_#bcsm_twitter_learn,"3, join, #bcsm, twitter, learn, tomorrow, comm..."
...,...,...,...,...
61,60,12,60_hard_find_world_read,"60, hard, find, world, read, share, people, ev..."
62,61,12,61_one_guidelines_absolutely_trying,"61, one, guidelines, absolutely, trying, would..."
63,62,12,62_change_guidelines_hey_bad,"62, change, guidelines, hey, bad, whether, bas..."
64,63,12,63_looks_met_news_question,"63, looks, met, news, question, said, let, twi..."


In [97]:
topicnb_2_name=dict(zip(legend['Topic'],legend['CustomName']))


In [98]:
# Bar charts (10 words each) for topic ids 0 .. n_rows-2, written to an HTML
# file whose name carries the number of rows of the topic table.
_topic_table = topic_model.get_topic_info()
_n_rows = len(_topic_table)
_barchart_topics = list(range(_n_rows - 1))
fig = topic_model.visualize_barchart(topics=_barchart_topics, n_words=10)
fig.write_html('../image/barcharts' + str(_n_rows) + '.html')

In [99]:
nb_topic=len(legend)

In [100]:
# Generate nicer looking labels and set them in our model
# Each label is the topic number followed by up to 10 words (each cut to
# 20 characters), joined with ", ".
# NOTE(review): this repeats the label-generation cell In [95] with the same
# arguments; no visible cell changes the topics in between, so it looks redundant.
topic_labels = topic_model.generate_topic_labels(nr_words=10,
                                                 topic_prefix=True,
                                                 word_length=20,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)

In [101]:
topic_labels

['-1, cancer, patients, research, ..., risk, #cancer, thanks, care, patient, people',
 '0, #mmsm, multiple, #myeloma, myeloma, study, inhibitors, cell, therapy, chemo, trial',
 '1, per, via, hospital, ..., 2021, #covid19, real, big, happy, years',
 '2, #bcsm, young, need, better, would, diagnosis, support, understand, help, think',
 '3, join, #bcsm, twitter, learn, tomorrow, community, breast cancer, breast, #mmsm, follow',
 '4, trial, phase, paper, pts, nsclc, results, clinical, response, molecular, #lcsm',
 '5, breast, breast cancer, #breastcancer, #bcsm, cancer, women, day, via, diagnosis, metastatic',
 '6, proud, work, thank, excited, stop, happy, never, team, new, women',
 '7, #lungcancer, #lcsm, support, lung cancer, thank, lung, research, community, able, month',
 '8, looking forward, forward, colleagues, 2020, looking, open, online, meeting, please, free',
 '9, care, access, health, medical, healthcare, change, hospital, ask, part, colleagues',
 '10, drug, effective, cost, case

In [102]:
info_doc=topic_model.get_document_info(docs)
dico_doc_2_topic=dict(zip(info_doc.Document,info_doc.Topic))

In [103]:
#dico_doc_2_topic

In [104]:
#info_doc3=info_doc[info_doc['Topic']==3]
#info_doc3[info_doc3.Document.str.contains('@katie23085')].Document.values

In [105]:
import textwrap

# Show every real topic in the document map. The ids are read from the topic
# table instead of range(nb_topic): nb_topic = len(legend) also counts the -1
# outlier row, so range(nb_topic) ran one past the last existing topic id.
topics_of_interest = [int(topic) for topic in legend['Topic'] if topic != -1]

def add_br(example, N=100):
    """Wrap every line of ``example`` to at most ``N`` columns and join the pieces with '<br>'.

    Empty lines produce no piece, so they disappear from the result.
    """
    pieces = []
    for raw_line in example.splitlines():
        pieces.extend(textwrap.wrap(raw_line, width=N))
    return '<br>'.join(pieces)
    
    

# One HTML hover label per document: topic number and custom name, the wrapped
# tweet text, then the author's screen name.
# Topic and author are paired with each document by position rather than looked
# up by tweet text: several tweets can share the same text (e.g. retweets), and a
# text-keyed dict would give all of them the last author seen.
# docs was built from df_adv_s.text, so the three sequences share the same order.
adjusted_docs = [
    "<b>" + "Topic:" + str(topic) + " - " + topicnb_2_name[topic] + "</b><br>"
    + add_br(doc) + "<br><b>by " + author + "</b>"
    for doc, topic, author in zip(docs, topic_model.topics_, df_adv_s.user_screen_name)
]

# Visualize documents
fig=topic_model.visualize_documents(
    adjusted_docs, 
    embeddings=embeddings, 
    hide_annotations=True, 
    topics=topics_of_interest,
    custom_labels=True,
    width = 2000,
    height = 1300
)
fig.write_html('../image/embedding_'+str(nb_topic)+'s.html')

In [106]:
df_adv_s['text_clean']

3315684    [Are, breast, self, exams, still, worth, time,...
3252467    [Safe, travels, Liza, Bernstein, wonderful, #b...
165571     [Powerful, opening, remarks, introducing, this...
1178174    [Many, have, asked, whether, this, week, chang...
2331945    [join, #mmsm, Twitter, chat, can, search, #mms...
                                 ...                        
1169794    [This, interesting, idea, the, Crick, Institut...
3288673    [Look, who, will, coming, for, Here, Voice, Se...
1237356    [#RESILIENCEnews, The, new, Clinical, Practice...
3204522    [Welcoming, participation, #bcsm, chat, helped...
1884658    [Therapeutic, approaches, for, the, management...
Name: text_clean, Length: 2959, dtype: object

In [107]:
dfl['topic_nb']=dfl['text'].map(dico_doc_2_topic.get)
dfl['topic_label']=dfl['topic_nb'].map(topicnb_2_name.get)

In [139]:
# One dict per non-outlier tweet (topic_nb != -1); these are the records passed
# to atlas.map_embeddings as `data`.
# to_dict('records') builds the list row by row, without transposing the frame
# (which casts everything to object and silently drops rows if index labels repeat).
documentsm1 = dfl.loc[
    dfl.topic_nb != -1,
    ['topic_label', 'user_screen_name', 'User_status', 'text'],
].to_dict('records')
#documents

In [136]:
dfl['topic_nb']

3315684     5
3252467     3
165571     -1
1178174    16
2331945     3
           ..
1169794    -1
3288673     3
1237356    -1
3204522     3
1884658    -1
Name: topic_nb, Length: 2959, dtype: int64

In [137]:
from nomic import atlas


In [134]:
# Keep only the embedding rows of tweets that were assigned a topic (topic_nb != -1).
# NOTE(review): the boolean mask is applied by position, so this assumes the rows
# of `embeddings` are in the same order as the rows of `dfl` — confirm.
embedm1=embeddings[dfl.topic_nb!=-1]
# NOTE(review): the TypeError shown in this cell's output ("list indices must be
# integers or slices, not Series") matches the commented-out line below, which
# indexes a list with a Series — presumably a stale output from an earlier run.
#documentsm1=documents[dfl.topic_nb!=-1]

TypeError: list indices must be integers or slices, not Series

In [141]:
response = atlas.map_embeddings(embeddings=embedm1,
                                data=documentsm1,
                                colorable_fields=['topic_label'],
                                name="older#sm",add_datums_if_exists = True, build_topic_model=False)
                                #description="An example of building a text map with a huggingface model.")

print(response)

[32m2023-06-04 19:53:11.495[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `older#sm` in organization `jeanphilippe.cointet`[0m
[32m2023-06-04 19:53:13.097[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m100[0m - [1mUploading embeddings to Atlas.[0m
1it [00:04,  4.47s/it]
[32m2023-06-04 19:53:17.597[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-06-04 19:53:17.598[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_embeddings[0m:[36m119[0m - [1mEmbedding upload succeeded.[0m
[32m2023-06-04 19:53:19.073[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `older#sm` in project `older#sm`: https://atlas.nomic.ai/map/4234dbdf-9cf8-4827-9e99-e7c1ac7d5924/91543781-e817-4701-85c2-eb7e946907f0[0m
[32m2023-06-04 19:53:19.076[0m | [1mINFO    [0m | [36mnomic.atlas[0m

older#sm: https://atlas.nomic.ai/map/4234dbdf-9cf8-4827-9e99-e7c1ac7d5924/91543781-e817-4701-85c2-eb7e946907f0


In [115]:
#build_topic_model

In [116]:
# `documents` is not defined in any visible cell, so this cell only worked with
# leftover kernel state (NameError on a fresh kernel). Rebuild it here from dfl.
# NOTE(review): assumes the text map covers every row of dfl (outliers included)
# with the same four fields used for documentsm1 — confirm.
documents = dfl[['topic_label', 'user_screen_name', 'User_status', 'text']].to_dict('records')

# Upload the tweets to Atlas as a text map indexed on 'text' and colorable by topic label.
response = atlas.map_text(#embeddings=embeddings,
                                data=documents,
                                indexed_field='text',
                                colorable_fields=['topic_label'],
                                multilingual=True,
                                name="newest#sm#text",add_datums_if_exists = True, build_topic_model=True)
                                #description="An example of building a text map with a huggingface model.")

print(response)

[32m2023-06-04 19:31:31.737[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_create_project[0m:[36m965[0m - [1mCreating project `newest#sm#text` in organization `jeanphilippe.cointet`[0m
[32m2023-06-04 19:31:37.743[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_text[0m:[36m212[0m - [1mUploading text to Atlas.[0m
1it [00:07,  7.51s/it]
[32m2023-06-04 19:31:45.286[0m | [1mINFO    [0m | [36mnomic.project[0m:[36m_add_data[0m:[36m1577[0m - [1mUpload succeeded.[0m
[32m2023-06-04 19:31:45.287[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_text[0m:[36m228[0m - [1mText upload succeeded.[0m
[32m2023-06-04 19:31:47.127[0m | [1mINFO    [0m | [36mnomic.project[0m:[36mcreate_index[0m:[36m1282[0m - [1mCreated map `newest#sm#text` in project `newest#sm#text`: https://atlas.nomic.ai/map/a6ffda81-1175-4add-9ca6-ae56ac628327/f80b5bf5-c79d-4212-ac3b-e5df7f037ac0[0m
[32m2023-06-04 19:31:47.129[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36

newest#sm#text: https://atlas.nomic.ai/map/a6ffda81-1175-4add-9ca6-ae56ac628327/f80b5bf5-c79d-4212-ac3b-e5df7f037ac0


In [127]:
# NOTE(review): incomplete expression (trailing dot, probably left over from
# tab-completion); executing this cell as written is a SyntaxError. Replace it
# with the intended attribute or remove the cell.
response.

topic_label: string
user_screen_name: string
User_status: string
text: string
id_: string