## Libraries

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import random

# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# stop-word list, WordNet and the POS tagger model).
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource)

## Dataset

In [None]:
# Load both workbooks and replace the long source column headers with the
# short working names used throughout the rest of the notebook.
df = pd.read_excel('sheet.xlsx').rename(
    columns={'Post_Op_Diagnosis | DESCRIPTION_OF_Procedure | Post_Op_Impression': 'description'}
)
df1 = pd.read_excel('diag.xlsx').rename(
    columns={'Diagnostic Procedure(s) Ordered': 'procedure ordered'}
)

## Preprocessing

In [None]:
# Shared preprocessing resources: one lemmatizer instance and the English
# stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalise free text into a space-joined string of lemmatised tokens.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic are dropped; the remaining tokens are lemmatised using the
    part of speech reported by ``get_wordnet_pos``; finally stop words and
    lemmas of two characters or fewer are removed.

    Args:
        text: The raw text. Non-string values (for example the ``NaN`` that
            pandas uses for an empty spreadsheet cell, or ``None``) are
            treated as empty text instead of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives the filters or the input is not a string.
    """
    # Empty spreadsheet cells arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())
    # Lemmatise only alphabetic tokens; punctuation and numbers are discarded.
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens if token.isalpha()]
    # Stop words are filtered after lemmatisation, so the lemma is what is checked.
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    return ' '.join(tokens)

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    N -> noun, V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    _, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    try:
        return pos_by_initial[initial]
    except KeyError:
        # Unrecognised tag family: treat the word as a noun.
        return wordnet.NOUN

# Add a cleaned-text column next to each raw text column.
df['description_clean'] = df['description'].map(preprocess)
df1['procedure_clean'] = df1['procedure ordered'].map(preprocess)

## Model

In [None]:
# Build LDA model: learn a bag-of-words vocabulary from the cleaned procedure
# text (min_df=5, max_df=0.8), then fit a 5-topic LDA model on the resulting
# count matrix. random_state is fixed so the fit is repeatable.
vectorizer = CountVectorizer(min_df=5, max_df=0.8)
X = vectorizer.fit_transform(df1['procedure_clean'])
lda_model = LatentDirichletAllocation(random_state=42, n_components=5).fit(X)

def get_lda_similarity(text1, text2):
    """Return the cosine similarity between the LDA topic mixes of two texts.

    Each text is run through ``preprocess``, vectorised with the fitted
    module-level ``vectorizer`` and projected onto the topics of the fitted
    module-level ``lda_model``. The result is one minus NLTK's cosine
    distance between the two topic distributions.
    """
    def topic_distribution(raw_text):
        # transform() expects a collection of documents, so wrap the single
        # text in a list and take the only row of the result.
        counts = vectorizer.transform([preprocess(raw_text)])
        return lda_model.transform(counts)[0]

    first_topics = topic_distribution(text1)
    second_topics = topic_distribution(text2)
    distance = nltk.cluster.util.cosine_distance(first_topics, second_topics)
    return 1 - distance

# The expected risk level is taken directly from the 'Gina selection' column.
df['expected_risk level'] = df['Gina selection']

# Score each cleaned description against the cleaned procedure at the same
# position. The two columns are walked in lockstep (not as a cross product),
# so scoring stops at the end of the shorter column.
test_data = df['description_clean']
comp_data = df1['procedure_clean']
_pairwise_scores = list(map(get_lda_similarity, test_data, comp_data))
df['lda_score'] = pd.Series(_pairwise_scores)

## Results

In [None]:
# Dense-rank the rows that share a description by LDA score, with the highest
# score receiving rank 1.
df['rank'] = (
    df.groupby('description')['lda_score']
    .rank(method='dense', ascending=False)
)

# Map expected risk level and GINA selection to numeric values
def map_to_numeric(category):
    """Convert a risk-category label into its numeric level.

    The label is matched by substring, checked in this order:
    'Minimal' or 'Straightforward' -> 4, 'Low' -> 3, 'Moderate' -> 2,
    'High' -> 1. The first match wins; matching is case-sensitive.

    Args:
        category: The label text. Values that do not support the ``in``
            test (for example ``None`` or the float ``NaN`` pandas uses for
            empty cells) are treated as unmatched.

    Returns:
        The numeric level, or 0 when no keyword matches.
    """
    try:
        if 'Minimal' in category or 'Straightforward' in category:
            return 4
        elif 'Low' in category:
            return 3
        elif 'Moderate' in category:
            return 2
        elif 'High' in category:
            return 1
    except TypeError:
        # Missing cells are not containers; fall through to the default
        # instead of aborting the whole column mapping.
        pass
    return 0

# Numeric versions of the two risk-level columns.
df['expected_risk level val'] = df['expected_risk level'].apply(map_to_numeric)
df['gina selection val'] = df['Gina selection'].apply(map_to_numeric)

# For every row, pick one of the comma-separated options in 'Gina selection'
# at random (unseeded, so the choice varies between runs).
df['ai selection'] = df.apply(lambda row: random.choice(row['Gina selection'].split(',')), axis=1)


# Rank ai selection relative to Gina selection
df['ai selection rank'] = df.groupby('Gina selection')['ai selection'].rank(ascending=False, method='dense')

# Filter for top 10 matches for each test data
# Take an explicit copy: the boolean filter returns a slice of df, and adding
# columns to that slice later triggers pandas' SettingWithCopyWarning.
final_df_top10 = df[df['rank'] <= 10].copy()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_top10['GINA Selection'] = final_df_top10.groupby('description')['Gina selection'].transform(lambda x: ', '.join(set(x)))


In [None]:
# Handle combined GINA selection values
# Work on an explicit copy so the assignment below does not trigger
# SettingWithCopyWarning when final_df_top10 is a filtered slice of df.
final_df_top10 = final_df_top10.copy()
# Collapse the distinct 'Gina selection' values of each description into one
# comma-separated string. Sorting makes the order reproducible across runs;
# plain set iteration order is not.
final_df_top10['GINA Selection'] = final_df_top10.groupby('description')['Gina selection'].transform(lambda x: ', '.join(sorted(set(x))))

# Export final dataframe
final_df_top10.to_excel('final_output.xlsx', index=False)