In [None]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset
# stored under MyDrive can be read with an ordinary file path below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Load the drug review dataset from the mounted Google Drive.
# Later cells read the 'drugName', 'review' and 'rating' columns of this frame.
df = pd.read_csv('/content/drive/MyDrive/drugtrain.csv')

# Initialize the pretrained GPT-2 tokenizer and base model (no LM head);
# the model's last hidden states are used as review embeddings.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so that batched
# tokenization with padding=True (see get_embedding_batch) has a pad token to use.
tokenizer.pad_token = tokenizer.eos_token  # Assign a padding token
model = GPT2Model.from_pretrained('gpt2')
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE: as the last expression of the cell this also displays the model repr.
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [None]:
# Function to get embeddings from GPT-2 in batches
def get_embedding_batch(reviews):
    """Embed a batch of review texts with GPT-2 using masked mean pooling.

    Args:
        reviews: list of review strings to embed together as one batch.

    Returns:
        A NumPy array with one row per review, holding the average of the
        model's last hidden states over that review's real (non-padding)
        tokens.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    """
    # Reviews are truncated to 1024 tokens and padded to the longest one in the batch.
    inputs = tokenizer(reviews, return_tensors='pt', truncation=True, padding=True, max_length=1024)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Average only over real tokens. A plain mean over dim=1 would also include
    # the hidden states at padded positions, making a review's embedding depend
    # on how long the other reviews in its batch happen to be.
    token_mask = inputs['attention_mask'].unsqueeze(-1).to(last_hidden_states.dtype)
    summed_states = (last_hidden_states * token_mask).sum(dim=1)
    # clamp guards against division by zero for a review that produced no tokens.
    token_counts = token_mask.sum(dim=1).clamp(min=1)
    review_embeddings = (summed_states / token_counts).cpu().numpy()
    return review_embeddings

# Function to score reviews using softmax-enhanced cosine similarity
def score_reviews_softmax_cosine(review_embedding, reference_embeddings):
    """Score one review by its softmax-weighted cosine similarity to the references.

    The cosine similarity between the review and every reference embedding is
    computed, the similarities are turned into weights with a softmax, and the
    weighted average of the similarities is returned (a smooth maximum).

    The previous implementation returned ``max(softmax(similarities))``. With a
    single reference embedding that value is always exactly 1.0, so every
    review received the same score. With the weighted average, a single
    reference yields the plain cosine similarity, so scores actually
    discriminate between reviews.

    Args:
        review_embedding: 1-D embedding vector of the review.
        reference_embeddings: non-empty sequence of 1-D reference vectors of
            the same length as ``review_embedding``.

    Returns:
        The score as a Python float in [-1, 1].

    Raises:
        ValueError: if ``reference_embeddings`` is empty.
    """
    if len(reference_embeddings) == 0:
        raise ValueError("reference_embeddings must contain at least one embedding")

    review_vec = np.asarray(review_embedding, dtype=float).ravel()
    reference_matrix = np.atleast_2d(np.asarray(reference_embeddings, dtype=float))

    dot_products = reference_matrix @ review_vec
    norm_products = np.linalg.norm(reference_matrix, axis=1) * np.linalg.norm(review_vec)
    # A zero-length vector has no direction; treat its similarity as 0 instead of NaN.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )

    weights = softmax(similarities)
    return float(np.dot(weights, similarities))

In [None]:
# Precision@N: share of the N best-scored reviews that count as relevant
def precision_at_top_n(df, top_n, relevant_criteria):
    """Compute Precision at Top-N for a frame of scored reviews.

    A review is relevant when its 'rating' is at least ``relevant_criteria``.
    The flag is written into ``df`` as the 'is_relevant' column (the frame is
    modified in place). Reviews without a 'review_score' are ignored, the rest
    are ranked by score and then rating, both descending.

    Args:
        df: DataFrame with 'rating' and 'review_score' columns.
        top_n: number of top-ranked reviews to inspect.
        relevant_criteria: minimum rating for a review to be relevant.

    Returns:
        Relevant reviews among the top ``top_n`` divided by ``top_n``,
        or 0 when ``top_n`` is not positive.
    """
    def _meets_threshold(rating_value):
        return rating_value >= relevant_criteria

    # Flag every review, not only the scored ones, directly on the caller's frame.
    df.loc[:, 'is_relevant'] = df['rating'].apply(_meets_threshold)

    scored_reviews = df.dropna(subset=['review_score'])
    ranked_reviews = scored_reviews.sort_values(
        by=['review_score', 'rating'],
        ascending=[False, False],
    )
    hits = ranked_reviews.head(top_n)['is_relevant'].sum()

    if top_n > 0:
        return hits / top_n
    return 0

In [None]:
from scipy.special import softmax

In [None]:
def ndcg_at_top_n(df, top_n):
    """Compute NDCG at Top-N with binary relevance.

    Reviews are ranked by 'review_score' and then 'rating', both descending,
    after dropping reviews without a score (the same ranking that
    ``precision_at_top_n`` uses). The gain of a review is ``2**rel - 1`` with
    ``rel`` taken from the 'is_relevant' column, discounted by
    ``log2(position + 1)`` for 1-based positions.

    The ideal DCG is built from the relevant reviews that actually exist: it
    fills at most ``min(number of relevant reviews, top_n)`` leading positions.
    Previously it assumed all ``top_n`` positions could be relevant, so a
    perfect ranking scored below 1 whenever fewer relevant reviews existed.

    Args:
        df: DataFrame with 'review_score', 'rating' and 'is_relevant' columns.
            'is_relevant' is the column written by ``precision_at_top_n``, so
            that function must have been called on ``df`` first.
        top_n: number of top-ranked reviews to evaluate.

    Returns:
        DCG divided by ideal DCG, or 0 when the ideal DCG is zero (no relevant
        reviews, or nothing to rank).
    """
    ranked = (
        df.dropna(subset=['review_score'])
        .sort_values(by=['review_score', 'rating'], ascending=[False, False])
    )
    relevance = ranked['is_relevant'].to_numpy(dtype=float)
    top_relevance = relevance[:top_n]

    # Position i (0-based) is discounted by log2(i + 2).
    discounts = 1.0 / np.log2(np.arange(len(top_relevance)) + 2)
    dcg = float(np.sum((2.0 ** top_relevance - 1.0) * discounts))

    # Best achievable ordering: every available relevant review ranked first.
    ideal_hits = min(int(np.count_nonzero(relevance)), len(top_relevance))
    idcg = float(np.sum(discounts[:ideal_hits]))

    ndcg = dcg / idcg if idcg > 0 else 0
    return ndcg


In [None]:
# Main function to get top reviews for a specific drug
def get_top_reviews_for_drug(drug_name, top_n=5, relevant_criteria=4):
    """Rank the reviews of one drug by embedding similarity and print the best ones.

    Reviews whose 'drugName' contains ``drug_name`` (case-insensitive, literal
    substring) are embedded with GPT-2, scored against the mean embedding of
    all matching reviews, and ranked by score and then rating. The top reviews
    are printed together with Precision and NDCG at Top-N.

    Args:
        drug_name: drug name, or part of one, to search for in the module-level ``df``.
        top_n: number of top-ranked reviews to print and evaluate.
        relevant_criteria: minimum rating for a review to count as relevant.

    Returns:
        None. Results are printed; a message is printed instead when no review matches.
    """
    # regex=False: the name is matched literally, so characters such as '(' or
    # '+' in a drug name are not interpreted as a regular expression.
    name_matches = df['drugName'].str.contains(drug_name, case=False, na=False, regex=False)
    # Reviews without text cannot be tokenized, so they are dropped.
    # .copy() gives an independent frame: the columns added below would
    # otherwise be written into a slice of ``df`` (SettingWithCopyWarning).
    filtered_df = df[name_matches].dropna(subset=['review']).copy()

    if filtered_df.empty:
        print(f"No reviews found for the drug: {drug_name}")
        return

    # Generate embeddings in batches
    batch_size = 16
    review_texts = filtered_df['review'].astype(str).tolist()
    all_embeddings = []

    for start in tqdm(range(0, len(review_texts), batch_size)):
        batch_embeddings = get_embedding_batch(review_texts[start:start + batch_size])
        all_embeddings.extend(batch_embeddings)

    # Reference embeddings (optional: can use a reference review or a set of keywords)
    reference_embeddings = [np.mean(all_embeddings, axis=0)]  # Using mean embedding as reference

    # Score reviews using softmax-enhanced cosine similarity.
    # all_embeddings is in the same row order as filtered_df.
    filtered_df['review_score'] = [
        score_reviews_softmax_cosine(embedding, reference_embeddings)
        for embedding in all_embeddings
    ]

    # Ranking metrics. precision_at_top_n must run first: it writes the
    # 'is_relevant' column that ndcg_at_top_n reads.
    precision = precision_at_top_n(filtered_df, top_n, relevant_criteria)
    ndcg = ndcg_at_top_n(filtered_df, top_n)

    # Sort by review score and rating
    sorted_df = filtered_df.dropna(subset=['review_score']).sort_values(by=['review_score', 'rating'], ascending=[False, False])

    # Display the top reviews with full review text
    top_reviews = sorted_df.head(top_n)  # Adjust the number as needed
    pd.set_option('display.max_colwidth', None)  # Ensure full review text is displayed
    print(top_reviews[['review', 'rating']])

    # Print the ranking metrics
    print(f"Precision at Top-{top_n}: {precision:.2f}")
    print(f"NDCG at Top-{top_n}: {ndcg:.2f}")


# Example usage: print the 10 best-ranked reviews for one drug, treating
# reviews with a rating of at least 4 as relevant for the metrics.
drug_name = "Aspirin"  # Replace with the desired drug name
get_top_reviews_for_drug(drug_name, top_n=10, relevant_criteria=4)

100%|██████████| 15/15 [00:03<00:00,  4.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'review_embedding'] = all_embeddings


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                review  \
310                                                                                                                                                                                                                                                                           

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.loc[:, 'review_score'] = filtered_df['review_embedding'].apply(lambda emb: score_reviews_softmax_cosine(emb, reference_embeddings))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'is_relevant'] = df['rating'].apply(lambda x: x >= relevant_criteria)
