### Packages

In [2]:
# Essentials
import pandas as pd

# ML Modules
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn import set_config; set_config(display='diagram')

import pickle

# For preprosessing
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

# Graphs
import matplotlib.pyplot as plt

### Data Import

In [3]:
file = '/home/nawar82/code/nawar82/ResearchTopicsRanker/raw_data/abstracts_thyrotropin-OR-TSH_2024-03-21_10-18-44.csv'

data = pd.read_csv(file)
data.head()

Unnamed: 0,PMID,Abstract
0,38506164,The effectiveness of levothyroxine (LT4) in re...
1,38464371,Diabetes and thyroiditis are closely related. ...
2,38449627,The thyroid represents the most prevalent form...
3,38296053,Evidence suggests that hypothyroidism may be a...
4,38287682,The aim of this study was to determine the imp...


In [4]:
data.shape

(640, 2)

### Preprocessing

#### preparing Acedemic and Medical Stopwords (RedundantWords.csv)

In [5]:
rw = '/home/nawar82/code/nawar82/ResearchTopicsRanker/raw_data/RedundantWords.csv'

sw = pd.read_csv(rw, header=0, names=['academic_stopwords'])
sw.head()

Unnamed: 0,academic_stopwords
0,Academic
1,Achievement
2,Advisor
3,Analysis
4,Argument


In [6]:
academic_Stopwords = sw['academic_stopwords'].tolist()
len(academic_Stopwords)

92

In [7]:
def preprocessing(sentence):

    # remove whitespace
    sentence = sentence.strip()

    # lowercase characters
    sentence = sentence.lower()

    # remove numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # remove punctuation
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')

    # remove stop_words
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(sentence)
    filtered_words = [word for word in words if word.lower() not in stop_words]

    # remove academic and medical stop_words
    filtered_words = [word for word in filtered_words if word.lower() not in academic_Stopwords]
    sentence = ' '.join(filtered_words)

    # tokenize and lemmatize
    words = word_tokenize(sentence)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in words]  # Lemmatize verbs
    lemmatized_words = [lemmatizer.lemmatize(word, pos='n') for word in lemmatized_words]  # Lemmatize nouns

    return ' '.join(lemmatized_words)

In [8]:
data['clean_text'] = data['Abstract'].apply(preprocessing)
data

Unnamed: 0,PMID,Abstract,clean_text
0,38506164,The effectiveness of levothyroxine (LT4) in re...,effectiveness levothyroxine lt restore thyroid...
1,38464371,Diabetes and thyroiditis are closely related. ...,diabetes thyroiditis closely relate occur comb...
2,38449627,The thyroid represents the most prevalent form...,thyroid represent prevalent form head neck end...
3,38296053,Evidence suggests that hypothyroidism may be a...,evidence suggest hypothyroidism may associate ...
4,38287682,The aim of this study was to determine the imp...,aim study determine impact covid pandemic test...
...,...,...,...
635,6127000,"A population sample of women in Göteborg, Swed...",population sample woman göteborg sweden study ...
636,7288274,This paper presents the results of various met...,paper present result various metabolic investi...
637,6794011,Low concentrations of thyrotropin-releasing ho...,low concentration thyrotropinreleasing hormone...
638,469388,The aetiology of Menière's disease is unknown ...,aetiology menières disease unknown recent year...


### Functions

In [9]:
def lda_model_fit(n_components, max_iter, vectorized_documents):
    lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = max_iter)

    # Fit the LDA on the vectorized documents
    lda_model.fit(vectorized_documents)

    return lda_model

In [10]:
def topics_list(model, vectorizer, top_words):
    topics = []
    for idx, topic in enumerate(model.components_):
        # print("Topic %d:" % (idx))
        topic = [(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-top_words - 1:-1]]
        # print(topic)
        topics.append(topic)
    return topics

In [11]:
def print_topics(model, vectorizer):
    topics = []
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        topic = [(vectorizer.get_feature_names_out()[i], topic[i]) for i in topic.argsort()[:-10 - 1:-1]]
        print(topic)
        topics.append(topic)
    return topics

In [12]:
def graph_topics(topics):
    # Calculating the number of rows and columns
    num_sets = len(topics)
    num_cols = 2  # Number of columns you want
    num_rows = -(-num_sets // num_cols)  # Ceiling division to ensure enough rows

    # Plotting each data set
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows))
    for i, data in enumerate(topics):
        row = i // num_cols
        col = i % num_cols
        labels = [item[0] for item in data]
        values = [item[1] for item in data]
        axs[row, col].barh(labels, values, color='skyblue')
        axs[row, col].set_ylabel('Keywords')
        axs[row, col].set_title('Topic {}'.format(i))
        axs[row, col].invert_yaxis()

    plt.xlabel('Score')
    plt.tight_layout()
    plt.show()

### Run LDA to see the topics

In [13]:
vectorizer = TfidfVectorizer(max_df = 0.98, ngram_range=(2,5))

vectorized_documents = vectorizer.fit_transform(data['clean_text'])

In [14]:
vectorized_documents.shape

(640, 339671)

In [15]:
# Instantiate the LDA
n_components = 10
lda_model = LatentDirichletAllocation(n_components=n_components, max_iter = 500)

# Fit the LDA on the vectorized documents
lda_model.fit(vectorized_documents)

In [None]:
document_topic_mixture = pd.DataFrame(lda_model.transform(vectorized_documents),
                                    columns = [f"topic_{i}" for i in range(1, n_components+1)],
                                    index = data['PMID'])
document_topic_mixture

In [None]:
topic_word_mixture = pd.DataFrame(
    lda_model.components_,
    columns = vectorizer.get_feature_names_out(),
    index = document_topic_mixture.columns
)
topic_word_mixture

In [None]:
topics = topics_list(lda_model, vectorizer, 5)

In [None]:
type(topics)

In [None]:
type(topics[0])

In [None]:
type(topics[0][0])

In [None]:
topics[1][0]

In [None]:
len(topics)

In [None]:
topics

In [None]:
lda_model.components_.shape

In [None]:
vectorizer.get_feature_names_out().shape

In [None]:
lda_model.components_[1].sum()

### Visualization

In [None]:
data = topics[0]

labels = [item[0] for item in data]
values = [item[1] for item in data]

# Creating bar chart
plt.figure(figsize=(10, 6))
plt.barh(labels, values, color='skyblue')
plt.xlabel('Score')
plt.ylabel('Keywords')
plt.title('Keyword Scores')
plt.gca().invert_yaxis()  # Invert y-axis to have the highest score at the top
plt.show()

In [None]:
graph_topics(topics)

## distributing the documents into their topics

In [None]:
document_topic_mixture['Dominant_topic'] = document_topic_mixture.idxmax(axis=1)

In [None]:
document_topic_mixture

## Plotting the dominant topics distribution

In [None]:
# Count how many documents have each topic as the most dominant
topic_counts = document_topic_mixture['Dominant_topic'].value_counts()
topic_counts

In [None]:

# Plot the pie chart
plt.figure(figsize=(10, 8))
plt.pie(topic_counts, labels=topic_counts.index, autopct='%1.1f%%', startangle=140)
plt.title('Distribution of Most Dominant Topics Across Documents')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.

plt.show()