In [None]:
# Mount Google Drive at /content/drive; every data, model and figure path
# used by this notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install BERTopic
!pip install kaleido
!pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2
from google.colab.patches import cv2_imshow
from PIL import Image

from bertopic import BERTopic

from wordcloud import WordCloud

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [None]:
#Load the dataset from Drive; the first CSV column becomes the index.

data = pd.read_csv(
    '/content/drive/MyDrive/Topic Modeling-GBM/data.csv',
    index_col=0,
    encoding='latin1',
    low_memory=False,
)

In [None]:
#Pull the text column and the metadata columns out as plain Python lists.

# All four lists keep the row order of `data`, so they stay aligned position by position.
tiab, citation_quartiles, journals, years = (
    data[column].tolist()
    for column in ('Title + Abstract', 'Citation Quartiles', 'Journal', 'Year')
)

In [None]:
#Load the model.

# Location of the previously saved BERTopic model on the mounted Drive.
model_path = "/content/drive/MyDrive/Topic Modeling-GBM/topic_model"
# NOTE(review): depending on the format the model was saved in, loading may
# unpickle the file — only load models from a trusted source; confirm.
topic_model = BERTopic.load(model_path)

In [None]:
#Attach custom labels to the topics of the loaded model.

# The index of representative_docs.csv supplies the labels, in file order.
representative_docs = pd.read_csv(
    '/content/drive/MyDrive/Topic Modeling-GBM/representative_docs.csv',
    index_col=0,
    encoding='latin1',
    low_memory=False,
)
topic_labels = representative_docs.index.tolist()
topic_model.set_topic_labels(topic_labels)

# Show the topic overview table as the cell output.
topic_model.get_topic_info()

In [None]:
# Preview the per-document table the model returns for the input texts
# (the same call is repeated in the next cell, where the result is stored).
topic_model.get_document_info(tiab)

In [None]:
#Get document info with topic and save the spreadsheet.

# The metadata lists are attached by position, so each must have exactly one
# entry per document passed to the model.
assert len(tiab) == len(citation_quartiles) == len(years) == len(journals), \
    "metadata lists must have one entry per document"

# Select the columns of interest and add the metadata with .assign(), which
# returns a new frame. Assigning columns directly onto a column subset is what
# triggers pandas' SettingWithCopyWarning.
doc_info = topic_model.get_document_info(tiab)[
    ['Document', 'CustomName', 'Probability', 'Representative_document']
].assign(**{
    'Citation Quartile': citation_quartiles,
    'Year': years,
    'Journal': journals,
})

# Drop the documents whose topic label is 'Outliers'.
doc_info = doc_info[doc_info['CustomName'] != 'Outliers']

doc_info.to_csv('/content/drive/MyDrive/Topic Modeling-GBM/doc_info.csv')

In [None]:
#Get top 10 topics for visualization.

# value_counts() orders the labels from most to least frequent; head(10) keeps
# only the ten largest so the list actually matches its name. With ten or fewer
# topics the result is the same as taking every label.
top10_topics = doc_info['CustomName'].value_counts().head(10).index.tolist()

In [None]:
#Word cloud analysis.

def create_wordcloud(topic_model, topic):
    """Build a word cloud from the (word, value) pairs of one topic.

    Args:
        topic_model: Model exposing ``get_topic(topic)``, which is iterated as
            (word, value) pairs.
        topic: Topic identifier passed through to ``get_topic``.

    Returns:
        The generated ``WordCloud`` object.
    """
    word_weights = dict(topic_model.get_topic(topic))
    cloud = WordCloud(
        background_color="white",
        max_words=1000,
        width=600,
        height=400,
        margin=1,
        colormap='plasma',
    )
    cloud.generate_from_frequencies(word_weights)
    return cloud

def save_wordcloud_image(topic_model, topic):
    """Render the word cloud of one topic and save it as a JPEG on Drive.

    The figure title is read from the module-level ``topic_labels`` at position
    ``topic + 1`` and the file is written as ``wordcloud_topic{topic+1}.jpg``
    inside the output directory, which is created if it does not exist.

    Args:
        topic_model: Model forwarded to ``create_wordcloud``.
        topic: Topic identifier; also used to derive the title and file name.
    """
    cloud = create_wordcloud(topic_model, topic)

    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(f"{topic_labels[topic+1]}", fontsize=20, y=1.05, fontweight='heavy')

    output_dir = "/content/drive/MyDrive/Topic Modeling-GBM/"
    os.makedirs(output_dir, exist_ok=True)
    target_file = os.path.join(output_dir, f"wordcloud_topic{topic+1}.jpg")
    fig.savefig(target_file, format="jpg", dpi=300, pad_inches=0.3)

    # Clear and close the figure so repeated calls do not accumulate open figures.
    fig.clf()
    plt.close(fig)

def crop_image(image, crop_percent):
    """Trim the same percentage of the width from the left and right edges.

    The full height is kept. The margin removed on each side is
    ``width * crop_percent // 100`` pixels.

    Args:
        image: Array indexed as ``[row, column, ...]`` (e.g. an image loaded
            with ``cv2.imread``).
        crop_percent: Percentage of the width to remove from EACH side; may be
            an int or a float, and must satisfy ``0 <= crop_percent < 50``.

    Returns:
        A slice (view) of ``image`` without the left and right margins.

    Raises:
        ValueError: If ``image`` is None (for example because the file could
            not be read) or ``crop_percent`` is outside ``[0, 50)``.
    """
    if image is None:
        raise ValueError("crop_image received None; was the image file read successfully?")
    if not 0 <= crop_percent < 50:
        raise ValueError(f"crop_percent must be in [0, 50), got {crop_percent!r}")

    width = image.shape[1]
    # int() keeps the slice bounds integral even when crop_percent is a float.
    margin = int(width * crop_percent // 100)
    return image[:, margin:width - margin]

def _write_image(path, image, params=None):
    """Write ``image`` with cv2.imwrite and raise if OpenCV reports failure."""
    if params is None:
        written = cv2.imwrite(path, image)
    else:
        written = cv2.imwrite(path, image, params)
    if not written:
        raise OSError(f"Failed to write image: {path}")


# Render one word-cloud JPEG per topic id 0 .. len(topic_labels) - 2.
# NOTE(review): the first entry of topic_labels is presumably the outlier
# label, which is why one fewer image than labels is produced — confirm.
for topic in range((len(topic_labels)-1)):
    save_wordcloud_image(topic_model, topic)

base_path = '/content/drive/MyDrive/Topic Modeling-GBM/'
crop_percent = 7
images_per_row = 5

# Reload the saved images and trim their left/right margins.
wc_images = []
for i in range(1, len(topic_labels)):
    img_path = os.path.join(base_path, f'wordcloud_topic{i}.jpg')
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None.
        raise FileNotFoundError(f"Could not read word cloud image: {img_path}")
    cropped_img = crop_image(img, crop_percent)
    wc_images.append(cropped_img)

if not wc_images:
    raise ValueError("No word cloud images were produced; nothing to assemble.")

# Tile the images into rows of `images_per_row` and save every row.
num_rows = int(np.ceil(len(wc_images) / images_per_row))
wc_rows = []

for i in range(num_rows):
    start_index = i * images_per_row
    end_index = start_index + images_per_row
    wc_row = cv2.hconcat(wc_images[start_index:end_index])
    wc_rows.append(wc_row)
    _write_image(os.path.join(base_path, f'wordclouds_row{i+1}.jpg'), wc_row, [cv2.IMWRITE_JPEG_QUALITY, 100])

# Pad a shorter last row with white so all rows share the first row's width.
# When the last row is already full, skip this so no zero-width array is
# handed to OpenCV.
last_row = wc_rows[-1]
height, width, _ = last_row.shape
desired_width = wc_rows[0].shape[1]
if width < desired_width:
    padding = np.full((height, desired_width - width, 3), 255, dtype=np.uint8)
    padded_last_row = cv2.hconcat([last_row, padding])
else:
    padded_last_row = last_row
wc_rows[-1] = padded_last_row

wc = cv2.vconcat(wc_rows)

# Add a 200-pixel white band above the grid.
height, width, _ = wc.shape
padding_top = np.full((200, width, 3), 255, dtype=np.uint8)
wc = cv2.vconcat([padding_top, wc])

#cv2_imshow(wc)
_write_image(os.path.join(base_path, 'wordclouds.jpg'), wc, [cv2.IMWRITE_JPEG_QUALITY, 100])
# The JPEG quality flag applies to JPEG output only, so it is not passed for the TIFF.
_write_image(os.path.join(base_path, 'wordclouds.tiff'), wc)

In [None]:
#Stacked horizontal bars: number of papers per topic within each citation quartile.

per_citation_quartile = doc_info

# One row per quartile and one column per topic, restricted to top10_topics;
# rows are sorted by quartile in descending order.
quartile_topic_sizes = per_citation_quartile.groupby(['Citation Quartile', 'CustomName']).size()
grouped = quartile_topic_sizes.unstack(fill_value=0)[top10_topics]
grouped = grouped.sort_values(by='Citation Quartile', ascending=False)

ax = grouped.plot(kind='barh', stacked=True, figsize=(18, 8), colormap='tab20')
ax.legend(labels=top10_topics, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12)

axis_label_style = {'fontsize': 16, 'fontweight': 'heavy', 'labelpad': 16}
ax.set_xlabel('Number of Papers', **axis_label_style)
ax.set_ylabel('Citation Quartile', **axis_label_style)
for axis_name in ("y", "x"):
    ax.tick_params(axis=axis_name, direction="out", labelsize=12)

# Print the count inside every non-empty bar segment; zero-count segments get no label.
for segment_group in ax.containers:
    segment_counts = [int(value) if value > 0 else "" for value in segment_group.datavalues]
    ax.bar_label(segment_group, labels=segment_counts, label_type="center", fontsize=10, color='white', weight = 'bold', rotation=90)

for figure_path in (
    '/content/drive/MyDrive/Topic Modeling-GBM/per_citation_quartile.jpg',
    '/content/drive/MyDrive/Topic Modeling-GBM/per_citation_quartile.tiff',
):
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
#Visualize topics per journal.

# Rows whose Journal value is 'Other' are left out of this chart.
per_journal = doc_info[doc_info['Journal'] != 'Other']

journal_topic_counts = per_journal.groupby(['Journal', 'CustomName']).size().unstack(fill_value=0)

# reindex selects the top topics in order and adds an all-zero column for any
# topic that has no papers left after the filter above.
grouped = journal_topic_counts.reindex(columns=top10_topics, fill_value=0)

# Order journals by their total paper count (ascending). Sorting on a separate
# Series avoids adding a temporary 'total' column to a column-subset frame,
# which is what triggers pandas' SettingWithCopyWarning.
journal_order = grouped.sum(axis=1).sort_values(ascending=True).index
grouped = grouped.loc[journal_order]

ax = grouped.plot(kind='barh', stacked=True, figsize=(16, 8), colormap='tab20')
ax.legend(labels=top10_topics, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=12)
ax.set_xlabel('Number of Papers', fontsize=16, fontweight='heavy', labelpad=16)
ax.set_ylabel('Journal', fontsize=16, fontweight='heavy', labelpad=16)
ax.tick_params(axis="y", direction="out", labelsize=12)
ax.tick_params(axis="x", direction="out", labelsize=12)

# Only segments with more than 30 papers get a count label.
for bar in ax.containers:
    labels = [int(v) if v > 30 else "" for v in bar.datavalues]
    ax.bar_label(bar, labels=labels, label_type="center", fontsize=10, color='white', weight = 'bold')

plt.savefig('/content/drive/MyDrive/Topic Modeling-GBM/per_journal.jpg', dpi=300, bbox_inches='tight')
plt.savefig('/content/drive/MyDrive/Topic Modeling-GBM/per_journal.tiff', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
#Classify topics as hot or cold from the trend of their yearly mean probability.

# Only documents with Year > 2019 take part in the trend analysis.
doc_info_after2020 = doc_info[doc_info['Year'] > 2019]

doc_topic_probs = doc_info_after2020['Probability'].values
doc_years = doc_info_after2020['Year'].values
doc_topics = doc_info_after2020['CustomName'].values

# Mean assignment probability for every (year, topic) pair.
yearly_frame = pd.DataFrame({'Year': doc_years, 'Topic': doc_topics, 'Probability': doc_topic_probs})
topic_probs_by_year = yearly_frame.groupby(['Year', 'Topic']).mean().reset_index()

unique_years = topic_probs_by_year['Year'].unique()
unique_topics = topic_probs_by_year['Topic'].unique()


def _fit_yearly_trend(topic_rows):
    """Fit a linear regression of mean Probability on Year for one topic's rows."""
    year_feature = topic_rows['Year'].values.reshape(-1, 1)
    mean_probability = topic_rows['Probability'].values
    return LinearRegression().fit(year_feature, mean_probability)


lin_reg_models = {
    topic: _fit_yearly_trend(topic_probs_by_year[topic_probs_by_year['Topic'] == topic])
    for topic in unique_topics
}

# A positive slope marks a hot topic, a negative slope a cold one;
# a slope of exactly zero lands in neither list.
topic_slopes = [(topic, lin_reg_models[topic].coef_[0]) for topic in unique_topics]
hot_topics = [entry for entry in topic_slopes if entry[1] > 0]
cold_topics = [entry for entry in topic_slopes if entry[1] < 0]


def _slope_magnitude(entry):
    """Return the absolute slope of a (topic, slope) pair."""
    return abs(entry[1])


# Steepest trends first within each group.
hot_topics_sorted = sorted(hot_topics, key=_slope_magnitude, reverse=True)
cold_topics_sorted = sorted(cold_topics, key=_slope_magnitude, reverse=True)

print(hot_topics_sorted)
print(cold_topics_sorted)

In [None]:
#Visualize hot and cold topics (after 2020).

# Order topics from the most negative to the most positive slope.
all_topics_sorted = sorted(hot_topics + cold_topics, key=lambda x: x[1], reverse=False)
if not all_topics_sorted:
    raise ValueError("No topics with a non-zero slope to plot.")
topics, slopes = zip(*all_topics_sorted)

# One Normalize instance drives both the bar colours and the colorbar. Unlike a
# manual (x - min) / (max - min), it does not divide by zero when every slope
# is identical (e.g. a single topic).
slope_norm = plt.Normalize(vmin=min(slopes), vmax=max(slopes))
normalized_slopes = np.asarray(slope_norm(np.array(slopes)))

colors = plt.cm.plasma(normalized_slopes)

fig, ax = plt.subplots(figsize=(6, 8))
ax.barh(topics, slopes, color=colors)
ax.set_xlabel('Slope Values', fontsize = 12, fontweight = 'heavy', labelpad = 8)

sm = plt.cm.ScalarMappable(cmap='plasma', norm=slope_norm)
# The mappable is not attached to any Axes, so the Axes to take space from is
# passed explicitly; recent Matplotlib releases raise an error without it.
colorbar = fig.colorbar(sm, ax=ax)
colorbar.set_label('Hotness Spectrum', fontsize=12, fontweight='heavy', labelpad=8)

fig.savefig('/content/drive/MyDrive/Topic Modeling-GBM/hot_topics_after2020.jpg', dpi=300, bbox_inches='tight', pad_inches=0.5)
fig.savefig('/content/drive/MyDrive/Topic Modeling-GBM/hot_topics_after2020.tiff', dpi=300, bbox_inches='tight', pad_inches=0.5)
plt.show()