In [None]:
from bertopic import BERTopic
import pandas as pd
import csv
import re
import string
import datetime
import scipy
import numpy
from scipy import sparse
import sys   
import unicodedata
import nltk 
import numpy as np   
import hdbscan
import time     
from scipy.sparse import csr_matrix, csc_matrix 
from umap import UMAP
from IPython.display import clear_output

In [None]:
# Document-level embedding class from flair, imported next to its only use.
from flair.embeddings import TransformerDocumentEmbeddings
# Embedding backend built from the 'roberta-base' checkpoint name; this single object is
# handed to every BERTopic(...) call in the notebook as `embedding_model`.
roberta = TransformerDocumentEmbeddings('roberta-base')

In [None]:
def text_clean(x):
    """Normalise one raw post into lowercase ASCII text for topic modelling.

    Steps, in order: lowercase; drop non-ASCII characters; remove links;
    strip apostrophe suffixes ('s, 't, ...); remove every token that contains
    a digit; remove stand-alone punctuation marks; remove @mentions and
    #hashtags; turn remaining punctuation into spaces; drop single letters /
    digits that stand alone between spaces.

    Parameters
    ----------
    x : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed or stripped after the
        last steps, so runs of spaces can remain in the result.
    """

    ### Light
    x = x.lower()  # lowercase everything
    x = x.encode('ascii', 'ignore').decode()  # drop every non-ASCII character
    x = re.sub(r'https*\S+', ' ', x)  # remove links
    # second pass: anything starting with "htt" followed by at least one more
    # character, e.g. a bare "http" that the pattern above cannot match
    x = re.sub(r'http*\S+', ' ', x)
    # cleaning up text
    x = re.sub(r'\'\w+', '', x)      # apostrophe + letters ("don't" -> "don")
    x = re.sub(r'\w*\d+\w*', '', x)  # any token containing a digit
    x = re.sub(r'\s{2,}', ' ', x)    # collapse whitespace runs
    # A lone punctuation mark between two whitespace characters becomes a single
    # space; replacing it with '' would fuse the two neighbouring words.
    x = re.sub(r'\s[^\w\s]\s', ' ', x)

    ### Heavy
    # `\S+` removes the whole handle; `\S` alone would only remove '@' plus the
    # first character and leave the rest of the username in the text.
    x = re.sub(r'@\S+', '', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
    # remove single letters and numbers surrounded by space
    x = re.sub(r'\s[a-z]\s|\s[0-9]\s', ' ', x)

    return x

In [None]:
# Load the training corpus and keep only rows that have text.
nan_value = float("NaN")
train = (
    pd.read_csv('bert_russia_train_062222.csv', encoding="ISO-8859-1", engine='python')
    # cells that are exactly "" or " " (in any column) count as missing
    .replace(["", " "], nan_value)
    .dropna(subset=["text"])
    # these two columns are not referenced anywhere below; only `text` is used
    .drop(columns=['Unnamed: 0', 'index'])
)
train.info()
train.head()

In [None]:
#trainsample = train.sample(frac=0.01, replace=True, random_state=1)
#nan_value = float("NaN")
#trainsample.replace(" ", nan_value, inplace=True)
#trainsample.dropna(subset = ["text"], inplace=True)
#trainsample.info()
#trainsample.head()

In [None]:
# Clean every post with text_clean and store the result next to the raw text.
train['cleaned_text'] = [text_clean(raw_post) for raw_post in train['text']]
# Plain list of cleaned strings; this is what the first BERTopic model is trained on.
traintext = train['cleaned_text'].tolist()

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=42) # to fix the bertopic for replication
topic_model = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities = True).fit(traintext)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
probs = hdbscan.all_points_membership_vectors(topic_model.hdbscan_model)
probs = topic_model._map_probabilities(probs, original_topics=True)
topics, probs = topic_model.fit_transform(traintext)
df = pd.DataFrame(probs)
topic_model.save("Bert_Model_Outputs/modelrussiaroberta_v1_091322") 
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
df.to_csv("russia_state_ukr_topics_roberta_probs_091322.csv")

In [None]:
docs = topic_model.get_representative_docs()
freq = topic_model.get_topic_info()
doc_panda = pd.DataFrame(list(docs.items()),columns = ['Topic','Docs']) 

In [None]:
freq.to_csv("russia_state_ukr_topics_roberta_v1_091322.csv")
doc_panda.to_csv("russia_state_ukr_topics_roberta_representative_docs_v1_091322.csv")

In [None]:
freq

In [None]:
# Per-document topic table. `topics` is the result of the fit_transform call in the
# training cells above, i.e. the same run whose model was saved and whose topic info was
# exported; calling fit_transform again here would retrain the whole model once more.
df_topics = pd.DataFrame({'topic': topics, 'document': traintext})
df_topics.info()
df_topics.to_csv("russia_state_ukr_doc_topics_v1_091322.csv")

In [None]:
len(df_topics['topic'].unique().tolist())

In [None]:
df_topics['topiclabel'] = df_topics['topic'] 
df_topics.info()

In [None]:
dict1=freq.set_index('Topic').to_dict()['Name']
df_topics=df_topics.replace({"topiclabel": dict1})
df_topics = df_topics[['topic', 'topiclabel', 'document']]
df_topics.head()

In [None]:
df_topics.to_csv("russia_state_ukr_doc_topics_v1_091322.csv")

In [None]:
df_topics_outlier = df_topics[df_topics.topic == -1]
df_topics_outlier.info()
len(df_topics_outlier)

In [None]:
df_topics_outlier.to_csv("russia_state_ukr_outliers_v2_091322.csv")

In [None]:
#clear output

In [None]:
df_topics_outlier=pd.read_csv('russia_state_ukr_outliers_v2_091322.csv')
df_topics_outlier.dropna(subset=['document'])
nan_value = float("NaN")
df_topics_outlier.replace("", nan_value, inplace=True)
df_topics_outlier.dropna(subset = ["document"], inplace=True)
df_topics_outlier.replace(" ", nan_value, inplace=True)
df_topics_outlier.dropna(subset = ["document"], inplace=True)
df_topics_outlier.info()
df_topics_outlier.head()

In [None]:
df_topics_outlier['cleaned_text'] = df_topics_outlier.document.apply(text_clean)
df_topics_outlier_text = df_topics_outlier.cleaned_text.to_list()
len(df_topics_outlier_text)

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=42)
topic_model2 = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities = True).fit(df_topics_outlier_text)
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
probs2 = hdbscan.all_points_membership_vectors(topic_model2.hdbscan_model)
probs2 = topic_model2._map_probabilities(probs2, original_topics=True)
topics, probs = topic_model2.fit_transform(df_topics_outlier_text)
df2 = pd.DataFrame(probs2)
topic_model2.save("Bert_Model_Outputs/modelrussiaroberta_v2_091322") 
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
df2.to_csv("russia_state_ukr_topics_roberta_probs_v2_091322.csv")

In [None]:
docs2 = topic_model2.get_representative_docs()
freq2 = topic_model2.get_topic_info()
doc_panda2 = pd.DataFrame(list(docs2.items()),columns = ['Topic','Docs']) 

In [None]:
freq2.to_csv("russia_state_ukr_topics_roberta_v2_091322.csv")
doc_panda2.to_csv("russia_state_ukr_topics_roberta_representative_docs_v2_091322.csv")

In [None]:
freq2

In [None]:
df_topics2 = pd.DataFrame({'topic': topics, 'document': df_topics_outlier_text})
df_topics2.to_csv("russia_state_ukr_doc_topics_v2_091322.csv")

In [None]:
len(df_topics2['topic'].unique().tolist())

In [None]:
df_topics2['topiclabel'] = df_topics2['topic'] 
df_topics2.head()

In [None]:
dict2=freq2.set_index('Topic').to_dict()['Name']
df_topics2=df_topics2.replace({"topiclabel": dict2})
df_topics2 = df_topics2[['topic', 'topiclabel', 'document']]
df_topics2.head()

In [None]:
df_topics2.to_csv("russia_state_ukr_doc_topics_v2_091322.csv")

In [None]:
df_topics_outlier2 = df_topics2[df_topics2.topic == -1]
df_topics_outlier2.info()

In [None]:
df_topics_outlier2.to_csv("russia_state_ukr_outliers_v3_091322.csv")

In [None]:
# clean output

In [None]:
df_topics_outlier2=pd.read_csv('russia_state_ukr_outliers_v3_091322.csv')
df_topics_outlier2.dropna(subset=['document'])
nan_value = float("NaN")
df_topics_outlier2.replace("", nan_value, inplace=True)
df_topics_outlier2.dropna(subset = ["document"], inplace=True)
df_topics_outlier2.replace(" ", nan_value, inplace=True)
df_topics_outlier2.dropna(subset = ["document"], inplace=True)
df_topics_outlier2.info()
df_topics_outlier2.head()

In [None]:
df_topics_outlier2['cleaned_text'] = df_topics_outlier2.document.apply(text_clean)

In [None]:
df_topics_outlier2 = df_topics_outlier2.cleaned_text.to_list()

In [None]:
len(df_topics_outlier2)

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5, 
                  min_dist=0.0, metric='cosine', random_state=42)
topic_model3 = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities = True).fit(df_topics_outlier2)
topic_model3.save("Bert_Model_Outputs/modelrussiaroberta_v3_091322") 
print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
start_time = time.time()
probs3 = hdbscan.all_points_membership_vectors(topic_model3.hdbscan_model)
probs3 = topic_model3._map_probabilities(probs3, original_topics=True)
topics, probs = topic_model3.fit_transform(df_topics_outlier2)
df3 = pd.DataFrame(probs3)

In [None]:
df3.to_csv("russia_state_ukr_topics_roberta_probs_v3_091322.csv")

In [None]:
docs3 = topic_model3.get_representative_docs()
freq3 = topic_model3.get_topic_info()
doc_panda3 = pd.DataFrame(list(docs3.items()),columns = ['Topic','Docs']) 

In [None]:
freq3.to_csv("russia_state_ukr_topics_roberta_v3_091322.csv")
doc_panda3.to_csv("russia_state_ukr_topics_roberta_representative_docs_v3_091322.csv")

In [None]:
freq3

In [None]:
df_topics3 = pd.DataFrame({'topic': topics, 'document': df_topics_outlier2})
df_topics3.to_csv("russia_state_ukr_doc_topics_v3_091322.csv")

In [None]:
len(df_topics3['topic'].unique().tolist())

In [None]:
df_topics3['topiclabel'] = df_topics3['topic'] 
df_topics3.head()

In [None]:
dict3=freq3.set_index('Topic').to_dict()['Name']
df_topics3=df_topics3.replace({"topiclabel": dict3})
df_topics3 = df_topics3[['topic', 'topiclabel', 'document']]
df_topics3.head()

In [None]:
df_topics3.to_csv("russia_state_ukr_doc_topics_v3_091322.csv")

In [None]:
df_topics_outlier3 = df_topics3[df_topics3.topic == -1]
df_topics_outlier3.head()

In [None]:
df_topics_outlier3.to_csv("russia_state_ukr_outliers_v4_091322.csv")

In [None]:
# clear output

In [None]:
df_topics_outlier3=pd.read_csv('russia_state_ukr_outliers_v4_091322.csv')
df_topics_outlier3.dropna(subset=['document'])
nan_value = float("NaN")
df_topics_outlier3.replace("", nan_value, inplace=True)
df_topics_outlier3.dropna(subset = ["document"], inplace=True)
df_topics_outlier3.replace(" ", nan_value, inplace=True)
df_topics_outlier3.dropna(subset = ["document"], inplace=True)
df_topics_outlier3.info()
df_topics_outlier3.head()

In [None]:
df_topics_outlier3['cleaned_text'] = df_topics_outlier3.document.apply(text_clean)
df_topics_outlier3 = df_topics_outlier3.cleaned_text.to_list()
len(df_topics_outlier3)

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)
topic_model4 = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities=True)
# Train exactly once: fit_transform fits the model and returns the per-document topics,
# so the saved model, the membership vectors and `topics` all come from the same run.
topics, probs = topic_model4.fit_transform(df_topics_outlier3)
topic_model4.save("Bert_Model_Outputs/modelrussiaroberta_v4_091322")
print("--- %s seconds ---" % (time.time() - start_time))
# membership vectors of the training points, taken from the fitted HDBSCAN model
probs4 = hdbscan.all_points_membership_vectors(topic_model4.hdbscan_model)
# NOTE(review): _map_probabilities is a private BERTopic method - confirm it exists
# with this signature in the installed version
probs4 = topic_model4._map_probabilities(probs4, original_topics=True)
df4 = pd.DataFrame(probs4)
df4.to_csv("russia_state_ukr_topics_roberta_probs_v4_091322.csv")

In [None]:
docs4 = topic_model4.get_representative_docs()
freq4 = topic_model4.get_topic_info()
doc_panda4 = pd.DataFrame(list(docs4.items()),columns = ['Topic','Docs']) 
freq4.to_csv("russia_state_ukr_topics_roberta_v4_091322.csv")
doc_panda4.to_csv("russia_state_ukr_topics_roberta_representative_docs_v4_091322.csv")

In [None]:
freq4

In [None]:
df_topics4 = pd.DataFrame({'topic': topics, 'document': df_topics_outlier3})
df_topics4.to_csv("russia_state_ukr_doc_topics_v4_091322.csv")

In [None]:
len(df_topics4['topic'].unique().tolist())

In [None]:
df_topics4['topiclabel'] = df_topics4['topic'] 
df_topics4.head()

In [None]:
dict4=freq4.set_index('Topic').to_dict()['Name']
df_topics4=df_topics4.replace({"topiclabel": dict4})
df_topics4 = df_topics4[['topic', 'topiclabel', 'document']]
df_topics4.head()

In [None]:
df_topics4.to_csv("russia_state_ukr_doc_topics_v4_091322.csv")

In [None]:
df_topics_outlier4 = df_topics4[df_topics4.topic == -1]
df_topics_outlier4.head()

In [None]:
df_topics_outlier4.to_csv("russia_state_ukr_outliers_v5_091322.csv")

In [None]:
# clear outputs

In [None]:
df_topics_outlier4=pd.read_csv('russia_state_ukr_outliers_v5_091322.csv')
df_topics_outlier4.dropna(subset=['document'])
nan_value = float("NaN")
df_topics_outlier4.replace("", nan_value, inplace=True)
df_topics_outlier4.dropna(subset = ["document"], inplace=True)
df_topics_outlier4.replace(" ", nan_value, inplace=True)
df_topics_outlier4.dropna(subset = ["document"], inplace=True)
df_topics_outlier4.info()
df_topics_outlier4.head()

In [None]:
df_topics_outlier4['cleaned_text'] = df_topics_outlier4.document.apply(text_clean)
df_topics_outlier4 = df_topics_outlier4.cleaned_text.to_list()
len(df_topics_outlier4)

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)
topic_model5 = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities=True)
# Train exactly once: fit_transform fits the model and returns the per-document topics,
# so the saved model, the membership vectors and `topics` all come from the same run.
topics, probs = topic_model5.fit_transform(df_topics_outlier4)
topic_model5.save("Bert_Model_Outputs/modelrussiaroberta_v5_091322")
print("--- %s seconds ---" % (time.time() - start_time))
# membership vectors of the training points, taken from the fitted HDBSCAN model
probs5 = hdbscan.all_points_membership_vectors(topic_model5.hdbscan_model)
# NOTE(review): _map_probabilities is a private BERTopic method - confirm it exists
# with this signature in the installed version
probs5 = topic_model5._map_probabilities(probs5, original_topics=True)
df5 = pd.DataFrame(probs5)
df5.to_csv("russia_state_ukr_topics_roberta_probs_v5_091322.csv")

In [None]:
docs5 = topic_model5.get_representative_docs()
freq5 = topic_model5.get_topic_info()
doc_panda5 = pd.DataFrame(list(docs5.items()),columns = ['Topic','Docs']) 
freq5.to_csv("russia_state_ukr_topics_roberta_v5_091322.csv")
doc_panda5.to_csv("russia_state_ukr_topics_roberta_representative_docs_v5_091322.csv")

In [None]:
freq5

In [None]:
df_topics5 = pd.DataFrame({'topic': topics, 'document': df_topics_outlier4})
df_topics5.to_csv(" russia_state_ukr_doc_topics_v5_091322.csv")

In [None]:
len(df_topics5['topic'].unique().tolist())

In [None]:
df_topics5['topiclabel'] = df_topics5['topic'] 
df_topics5.head()

In [None]:
dict5=freq5.set_index('Topic').to_dict()['Name']
df_topics5=df_topics5.replace({"topiclabel": dict5})
df_topics5 = df_topics5[['topic', 'topiclabel', 'document']]
df_topics5.head()

In [None]:
df_topics5.to_csv("russia_state_ukr_doc_topics_v5_091322.csv")

In [None]:
df_topics_outlier5 = df_topics5[df_topics5.topic == -1]
df_topics_outlier5.head()

In [None]:
df_topics_outlier5.to_csv("russia_state_ukr_outliers_v6_091322.csv")

In [None]:
# clear output

In [None]:
df_topics_outlier5=pd.read_csv('russia_state_ukr_outliers_v6_091322.csv')
df_topics_outlier5.dropna(subset=['document'])
nan_value = float("NaN")
df_topics_outlier5.replace("", nan_value, inplace=True)
df_topics_outlier5.dropna(subset = ["document"], inplace=True)
df_topics_outlier5.replace(" ", nan_value, inplace=True)
df_topics_outlier5.dropna(subset = ["document"], inplace=True)
df_topics_outlier5.info()
df_topics_outlier5.head()

In [None]:
df_topics_outlier5['cleaned_text'] = df_topics_outlier5.document.apply(text_clean)
df_topics_outlier5 = df_topics_outlier5.cleaned_text.to_list()
len(df_topics_outlier5)

In [None]:
start_time = time.time()
umap_model = UMAP(n_neighbors=15, n_components=5,
                  min_dist=0.0, metric='cosine', random_state=42)
topic_model6 = BERTopic(umap_model=umap_model, embedding_model=roberta, nr_topics="auto", calculate_probabilities=True)
# Train exactly once, on the round-6 model itself: `topics` must come from topic_model6
# (not topic_model5) so it lines up with the topic info read from topic_model6 below.
topics, probs = topic_model6.fit_transform(df_topics_outlier5)
topic_model6.save("Bert_Model_Outputs/modelrussiaroberta_v6_091322")
print("--- %s seconds ---" % (time.time() - start_time))
# membership vectors of the training points, taken from the fitted HDBSCAN model
probs6 = hdbscan.all_points_membership_vectors(topic_model6.hdbscan_model)
# Map this round's vectors (probs6, not the previous round's probs5).
# NOTE(review): _map_probabilities is a private BERTopic method - confirm it exists
# with this signature in the installed version
probs6 = topic_model6._map_probabilities(probs6, original_topics=True)
df6 = pd.DataFrame(probs6)
df6.to_csv("russia_state_ukr_topics_roberta_probs_v6_091322.csv")

In [None]:
docs6 = topic_model6.get_representative_docs()
freq6 = topic_model6.get_topic_info()
doc_panda6 = pd.DataFrame(list(docs6.items()),columns = ['Topic','Docs']) 
freq6.to_csv("russia_state_ukr_topics_roberta_v6_091322.csv")
doc_panda6.to_csv("russia_state_ukr_topics_roberta_representative_docs_v6_091322.csv")

In [None]:
freq6

In [None]:
df_topics6 = pd.DataFrame({'topic': topics, 'document': df_topics_outlier5})
df_topics6.to_csv("russia_state_ukr_doc_topics_v6_091322.csv")

In [None]:
len(df_topics6['topic'].unique().tolist())

In [None]:
df_topics6['topiclabel'] = df_topics6['topic'] 
df_topics6.head()

In [None]:
dict6=freq6.set_index('Topic').to_dict()['Name']
df_topics6=df_topics6.replace({"topiclabel": dict6})
df_topics6 = df_topics6[['topic', 'topiclabel', 'document']]
df_topics6.head()

In [None]:
df_topics6.to_csv("russia_state_ukr_doc_topics_v6_091322.csv")

In [None]:
df_topics_outlier6 = df_topics6[df_topics6.topic == -1]
df_topics_outlier6.head()

In [None]:
df_topics_outlier6.to_csv("russia_state_ukr_outliers_v7_091322.csv")

In [None]:
# clear output