In [6]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import nltk
from nltk.corpus import stopwords

In [19]:
# Load your dataset (from CSV or Excel)
df = pd.read_csv("scopus.csv")  

# Check available column names
print("Columns in dataset:", df.columns)

# Ensure necessary columns exist
if "Title" not in df.columns or "Abstract" not in df.columns:
    raise ValueError("CSV file must contain 'Title' and 'Abstract' columns.")
    

Columns in dataset: Index(['Title', 'Link', 'Abstract'], dtype='object')


In [20]:
# Preprocess Text (Lowercasing & Removing Punctuation)
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Download stopwords (if not already downloaded)
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Apply text cleaning to Title and Abstract
df["Title"] = df["Title"].apply(clean_text)
df["Abstract"] = df["Abstract"].apply(clean_text)

# Combine Title & Abstract
df["text"] = df["Title"] + ". " + df["Abstract"]

# Remove missing values
df.dropna(subset=["text"], inplace=True)

# Preview data
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MasseiM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Title,Link,Abstract,text
0,microcredit pricing model microfinance institu...,https://www.scopus.com/inward/record.uri?eid=2...,purpose purpose research propose tool designin...,microcredit pricing model microfinance institu...
1,sdcl framework secure distributed collaborativ...,https://www.scopus.com/inward/record.uri?eid=2...,future electric grids undergoing remarkable tr...,sdcl framework secure distributed collaborativ...
2,digital technologies ai contribute achieving h...,https://www.scopus.com/inward/record.uri?eid=2...,enhancing global health stands pivotal objecti...,digital technologies ai contribute achieving h...
3,navigating green wave urban climate adaptation...,https://www.scopus.com/inward/record.uri?eid=2...,climate mitigation climate adaptation two main...,navigating green wave urban climate adaptation...
4,deepesn neural networks industrial predictive ...,https://www.scopus.com/inward/record.uri?eid=2...,optimizing energy consumption important aspect...,deepesn neural networks industrial predictive ...


In [21]:
# Load Pre-trained BERT Model for Embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings
embeddings = model.encode(df["text"].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/14 [00:00<?, ?it/s]

In [22]:
from umap import UMAP
# Reduce outliers by adjusting UMAP
umap_model = UMAP(n_neighbors=5, min_dist=0.1, n_components=5)

# Apply to BERTopic
topic_model = BERTopic(umap_model=umap_model)

# Fit BERTopic using precomputed embeddings
topics, probs = topic_model.fit_transform(df["text"].tolist(), embeddings)

In [23]:
# Display Key Topics and Their Distribution
topic_info = topic_model.get_topic_info()
print(topic_info)

   Topic  Count                                     Name  \
0     -1     60             -1_risk_data_using_financial   
1      0    244       0_data_system_model_sustainability   
2      1     32              1_risk_esg_credit_financial   
3      2     28           2_climate_financial_risk_banks   
4      3     22  3_financial_bankruptcy_companies_models   
5      4     19            4_extreme_stock_events_market   
6      5     15       5_fiscal_economic_growth_countries   

                                      Representation  \
0  [risk, data, using, financial, analysis, model...   
1  [data, system, model, sustainability, risk, st...   
2  [risk, esg, credit, financial, learning, machi...   
3  [climate, financial, risk, banks, carbon, chan...   
4  [financial, bankruptcy, companies, models, mod...   
5  [extreme, stock, events, market, financial, mo...   
6  [fiscal, economic, growth, countries, deficit,...   

                                 Representative_Docs  
0  [sovereign c

In [28]:
# Identify Representative Documents for Each Topic
for topic in topic_info["Topic"].unique():
    if topic == -1:  # Skip outliers
        continue
    print(f"\nTopic {topic}: {topic_info[topic_info['Topic'] == topic]['Name'].values[0]}")
    topic_docs = df[topics == topic]["text"].head(3)  # Get top 3 representative documents
    print("Representative Documents:")
    for doc in topic_docs:
        print(f"- {doc[:200]}...")  # Print first 200 characters


Topic 0: 0_data_system_model_sustainability
Representative Documents:
- sdcl framework secure distributed collaborative learning smart grids. future electric grids undergoing remarkable transformation driven increasing adoption emerging technologies notably artificial int...
- digital technologies ai contribute achieving healthrelated sdgs. enhancing global health stands pivotal objective within united nations sustainable development goals sdgs wake ongoing digital transfor...
- deepesn neural networks industrial predictive maintenance anomaly detection production energy data. optimizing energy consumption important aspect industrial competitiveness directly impacts operation...

Topic 1: 1_risk_esg_credit_financial
Representative Documents:
- towards estimation esg ratings machine learning approach using balance sheet ratios. despite persistence methodological inconsistency uncertainty esg ratings useful assessing environmental e social go...
- novel hybrid model credit risk assessme

In [25]:
# Visualize Topics
topic_model.visualize_barchart()


In [26]:
topic_model.visualize_topics()

In [27]:
# Save Model (Optional)
topic_model.save("bertopic_model")

# Save Topic Assignments to CSV
df["Topic"] = topics
df.to_csv("scopus_topics_cleaned.csv", index=False)

