In [None]:
import re
import nltk
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.decomposition import NMF, LatentDirichletAllocation as LDA
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from collections import Counter
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the CSV files read in the following
# cell live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the NLTK stop-word corpus, then build a set of the English entries.
nltk.download('stopwords')
# NOTE(review): `stop_words` is not referenced by any later visible cell; the
# vectorizers are configured with stop_words='english' instead.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:

# Load the train/test CSV exports from the mounted Drive and stack them into a
# single frame (the combined frame is split again further down).
train_data = pd.read_csv('/content/drive/My Drive/drug_cleanedtrain.csv')
test_data = pd.read_csv('/content/drive/My Drive/drug_cleanedtest.csv')
# ignore_index=True renumbers the rows 0..n-1. Without it each half keeps its
# own row labels, so labels repeat in `data` and any label-based lookup
# (.loc, index alignment) on the combined frame becomes ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)



In [None]:
# Preprocess text function
def preprocess_text(text):
    """Normalise a raw review string.

    Lower-cases the text, strips leading/trailing whitespace, then deletes
    every ASCII punctuation character and every run of digits.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            NaN coming from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Handle non-string values
    text = text.lower().strip()
    # re.escape is required here: pasted raw into a character class,
    # string.punctuation contains the sequence "\]", which the regex engine
    # reads as an escaped "]" -- so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\d+', '', text)
    return text

In [None]:
# Replace the 'review' column with its cleaned version, then substitute the
# empty string for every value that is still missing anywhere in the frame.
data = data.assign(review=data['review'].apply(preprocess_text)).fillna('')

In [None]:
# Encode the labels
# One LabelEncoder per target column. Both are fitted on the combined
# (train + test) frame, so every label that can appear after the split below
# is known to the encoder.
le_drug = LabelEncoder().fit(data['drugName'])
le_condition = LabelEncoder().fit(data['condition'])


In [None]:
# Features are the review texts; targets are the drug-name and condition columns.
X, y = data['review'], data[['drugName', 'condition']]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Create the two (not yet fitted) bag-of-words vectorizers with identical
# settings; they are fitted on the training reviews in a later step.
tfidf_vectorizer, count_vectorizer = (
    vectorizer_class(max_df=1.0, min_df=1, stop_words='english')
    for vectorizer_class in (TfidfVectorizer, CountVectorizer)
)

In [None]:
# Fit the vectorizers
# Each vectorizer is fitted on the training reviews only (fit_transform); the
# held-out reviews are then encoded with that already-fitted vectorizer
# (transform), so the order of the two calls per vectorizer matters.
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
cv_train = count_vectorizer.fit_transform(X_train)
cv_test = count_vectorizer.transform(X_test)

In [None]:

# Combine the features
# hstack puts the TF-IDF columns first and the raw-count columns second.
# NOTE(review): X_train_combined / X_test_combined are not consumed by any
# later visible cell -- the CNN is trained on the padded token sequences.
X_train_combined = hstack([tfidf_train, cv_train])
X_test_combined = hstack([tfidf_test, cv_test])

In [None]:
# Turn every review into a fixed-length integer sequence for the CNN model.
max_length = 500

# The vocabulary is learned from the training reviews only.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

# Map both splits to token-id sequences, then pad them to max_length.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split) for split in (X_train, X_test)
)
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length)
    for sequences in (X_train_seq, X_test_seq)
)

In [None]:
# Import necessary modules
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, Dense

# --- Targets ---------------------------------------------------------------
# The previous version fed the two label columns straight into one 2-unit
# softmax trained with categorical cross-entropy. That treats
# (drugName, condition) as a probability distribution over two classes, which
# is why the recorded run shows a loss around 1e17 that becomes NaN while the
# accuracy never moves. Drug and condition are two separate multi-class
# targets, so each column is converted to integer class ids with the encoders
# fitted earlier and gets its own softmax head.
y_train_drug = le_drug.transform(y_train['drugName'])
y_test_drug = le_drug.transform(y_test['drugName'])
y_train_condition = le_condition.transform(y_train['condition'])
y_test_condition = le_condition.transform(y_test['condition'])

# --- Model -----------------------------------------------------------------
# Same convolutional trunk as before, written with the functional API because
# a Sequential model cannot branch into two outputs.
review_tokens = Input(shape=(max_length,), name='review_tokens')
features = Embedding(input_dim=10000, output_dim=128)(review_tokens)  # 10000 == Tokenizer(num_words=10000)
features = Conv1D(filters=64, kernel_size=3, activation='relu')(features)
features = MaxPooling1D(pool_size=2)(features)
features = Dropout(0.5)(features)
features = Conv1D(filters=64, kernel_size=3, activation='relu')(features)
features = MaxPooling1D(pool_size=2)(features)
features = Dropout(0.5)(features)
features = Flatten()(features)
features = Dense(128, activation='relu')(features)
features = Dropout(0.5)(features)

# One softmax head per target, sized by the number of classes each encoder saw.
drug_output = Dense(len(le_drug.classes_), activation='softmax', name='drug')(features)
condition_output = Dense(len(le_condition.classes_), activation='softmax', name='condition')(features)
model = Model(inputs=review_tokens, outputs=[drug_output, condition_output])

# Sparse categorical cross-entropy takes the integer class ids directly, so no
# one-hot matrices are needed. Losses/metrics are listed in output order.
model.compile(
    optimizer='adam',
    loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'],
    metrics=[['accuracy'], ['accuracy']],
)
model.fit(
    X_train_padded,
    [y_train_drug, y_train_condition],
    epochs=5,
    batch_size=32,
    validation_data=(X_test_padded, [y_test_drug, y_test_condition]),
)



Epoch 1/5
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m850s[0m 157ms/step - accuracy: 0.9241 - loss: 97632715536334848.0000 - val_accuracy: 0.9252 - val_loss: 1993235773079420928.0000
Epoch 2/5
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m813s[0m 148ms/step - accuracy: 0.9239 - loss: 5078085656790433792.0000 - val_accuracy: 0.9252 - val_loss: 22504880963133112320.0000
Epoch 3/5
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m774s[0m 143ms/step - accuracy: 0.9246 - loss: nan - val_accuracy: 0.9252 - val_loss: nan
Epoch 4/5
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m795s[0m 142ms/step - accuracy: 0.9249 - loss: nan - val_accuracy: 0.9252 - val_loss: nan
Epoch 5/5
[1m5377/5377[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m808s[0m 143ms/step - accuracy: 0.9246 - loss: nan - val_accuracy: 0.9252 - val_loss: nan


<keras.src.callbacks.history.History at 0x7d8eea369ff0>

In [None]:

# Persist the trained model to the mounted Drive.
# NOTE(review): the ".h5" suffix presumably selects Keras' HDF5 format; newer
# Keras releases recommend the native ".keras" format -- confirm what any
# downstream loader expects before changing it.
model.save('/content/drive/My Drive/drug_model.h5')





In [None]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer  # Import from tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Import from tensorflow.keras
from tensorflow.keras.utils import to_categorical
from google.colab import drive
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation, NMF


In [None]:
# Preprocess text function
# NOTE(review): this cell redefines a function of the same name from an
# earlier cell; consider defining it once.
def preprocess_text(text):
    """Normalise a raw review string.

    Lower-cases the text, strips leading/trailing whitespace, then deletes
    every ASCII punctuation character and every run of digits.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            NaN coming from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Handle non-string values
    text = text.lower().strip()
    # re.escape is required here: pasted raw into a character class,
    # string.punctuation contains the sequence "\]", which the regex engine
    # reads as an escaped "]" -- so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\d+', '', text)
    return text

# Store a cleaned copy of every review in a new 'cleanedReview' column.
# NOTE(review): no later visible code reads 'cleanedReview' -- the helpers
# below select the raw 'review' column; confirm whether this is still needed.
train_data['cleanedReview'] = train_data['review'].apply(preprocess_text)

# Preprocess drug names to handle multiple names
def preprocess_drug_names(drug_names):
    """Split a comma-separated drug-name field into individual names.

    Args:
        drug_names: The raw ``drugName`` cell, e.g. ``'carvedilol, coreg'``.

    Returns:
        A list of names with surrounding whitespace removed. Empty fragments
        (from ``''``, ``'a,,b'`` or a trailing comma) are dropped, and a
        non-string value such as NaN yields ``[]``.
    """
    if not isinstance(drug_names, str):
        # A missing cell arrives as float NaN, which has no .split().
        return []
    fragments = (fragment.strip() for fragment in drug_names.split(','))
    # Skip blanks so an empty string is never counted as a drug.
    return [name for name in fragments if name]

# Create a mapping of conditions to drug names with frequencies
def create_condition_to_drugs_mapping(df):
    """Count, for every condition, how often each drug name occurs.

    Args:
        df: DataFrame with a ``condition`` column and a ``drugName`` column.

    Returns:
        dict mapping the normalised condition (lower-cased, surrounding
        whitespace stripped) to a ``Counter`` of drug name -> number of rows.
    """
    condition_to_drugs = {}
    for condition, drugs in df.groupby('condition')['drugName']:
        # Normalise the key the same way find_drugs_for_condition normalises
        # its query (lower + strip); a key that was only lower-cased could
        # never be matched when the stored label carried stray whitespace.
        key = str(condition).lower().strip()
        # setdefault + update merges groups whose labels differ only in case
        # or whitespace; plain assignment let the last group overwrite the rest.
        drug_counter = condition_to_drugs.setdefault(key, Counter())
        # dropna: a missing drugName cell has nothing to split.
        for drug_list in drugs.dropna():
            drug_counter.update(preprocess_drug_names(drug_list))
    return condition_to_drugs

# Build the lookup once when the cell runs. find_drugs_for_condition reads
# this module-level dict, so it has to exist before that function is called.
condition_to_drugs = create_condition_to_drugs_mapping(train_data)

# Function to find drugs for a given condition
def find_drugs_for_condition(condition, top_n=3):
    """Return the ``top_n`` most frequent drug names for ``condition``.

    The condition is lower-cased and stripped before it is looked up in the
    module-level ``condition_to_drugs`` dict. An unknown condition gives an
    empty list.
    """
    lookup_key = condition.lower().strip()
    ranked = condition_to_drugs.get(lookup_key, Counter()).most_common(top_n)
    # most_common yields (name, count) pairs; only the names are returned.
    return [name for name, _count in ranked]

# Function to get top reviews for a particular drug
def get_top_reviews_for_drug(df, drug_name, top_n=5):
    """Return the highest-rated reviews whose drug name contains ``drug_name``.

    Args:
        df: DataFrame with ``drugName``, ``review`` and ``rating`` columns.
        drug_name: Text to look for. It is matched case-insensitively as a
            literal substring of ``drugName``.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``review`` and ``rating`` columns of the
        ``top_n`` best-rated matching rows; empty when nothing matches.
    """
    # regex=False: drug_name is plain text, not a pattern. With the default
    # regex=True a name such as "a+b" or "x (y)" is interpreted as a regular
    # expression and either matches the wrong rows or raises re.error.
    is_match = df['drugName'].str.contains(drug_name, case=False, na=False, regex=False)
    return df[is_match].nlargest(top_n, 'rating')[['review', 'rating']]

# Function to highlight words based on topics
def highlight_words(text, words):
    """Wrap every whole-word occurrence of ``words`` in ``text`` with ``**``.

    Matching is case-insensitive and limited to whole words. The original
    casing of the matched text is kept.

    Args:
        text: The text to annotate.
        words: Iterable of words to highlight; duplicates and empty strings
            are ignored.

    Returns:
        ``text`` with each match rewritten as ``**match**``; ``text`` itself
        when ``words`` contains nothing usable.
    """
    # De-duplicate case-insensitively. Callers pass lda_words[0] + nmf_words[0],
    # which usually share words, and substituting one word at a time wrapped
    # such words twice ("****great****").
    unique_words = {word.lower(): word for word in words if word}
    if not unique_words:
        return text
    # Longest first so the alternation prefers the longer of two candidates.
    alternatives = sorted(unique_words.values(), key=len, reverse=True)
    # re.escape: the words are literal text, not regex fragments.
    pattern = r'\b(' + '|'.join(re.escape(word) for word in alternatives) + r')\b'
    # A single pass means inserted markup is never re-scanned.
    return re.sub(pattern, r'**\1**', text, flags=re.IGNORECASE)

# Function to extract topics using LDA and NMF
def extract_topics(reviews, num_topics=2, num_words=5):
    """Fit LDA and NMF on TF-IDF features of ``reviews`` and list topic words.

    Args:
        reviews: Iterable of review strings.
        num_topics: Number of components used for both models.
        num_words: How many words to report per topic; values <= 0 give
            empty word lists.

    Returns:
        ``(lda_words, nmf_words)``: two lists with one entry per topic. Each
        entry holds up to ``num_words`` vocabulary terms ordered by ascending
        component weight, i.e. the strongest word comes last.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(reviews)
    # Fetch the vocabulary once; it used to be rebuilt for every reported word.
    vocabulary = vectorizer.get_feature_names_out()

    def top_words(component):
        """Return the num_words heaviest terms of one topic component."""
        if num_words <= 0:
            # Guard: a slice of [-0:] would return the whole vocabulary.
            return []
        # argsort is ascending, so the last num_words indices are the heaviest.
        return [vocabulary[i] for i in component.argsort()[-num_words:]]

    # LDA
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(tfidf)
    lda_words = [top_words(topic) for topic in lda.components_]

    # NMF
    nmf = NMF(n_components=num_topics, random_state=42)
    nmf.fit(tfidf)
    nmf_words = [top_words(topic) for topic in nmf.components_]

    return lda_words, nmf_words

# Example usage to find top drugs for a condition
# (the lookup is case-insensitive because find_drugs_for_condition lower-cases
# the query)
condition = 'adhd'
top_drugs_for_condition = find_drugs_for_condition(condition)
print("Top drug names for condition", condition, ":", top_drugs_for_condition)

# Example usage to get top 5 reviews for a drug
# (5 is the default top_n of get_top_reviews_for_drug)
drug_name = 'Adderall'
top_reviews = get_top_reviews_for_drug(train_data, drug_name)
print("Top reviews for drug", drug_name, ":\n", top_reviews)

# Extract topics
# The topic models are fitted on just the reviews selected above.
reviews_list = top_reviews['review'].tolist()
lda_words, nmf_words = extract_topics(reviews_list)

print("LDA topic words:", lda_words)
print("NMF topic words:", nmf_words)

# Highlight words in reviews
# Only the first topic of each model (index 0) is used for highlighting.
highlighted_reviews = [highlight_words(review, lda_words[0] + nmf_words[0]) for review in reviews_list]

print("Highlighted reviews:")
for review in highlighted_reviews:
    print(review)

Top drug names for condition adhd : ['lisdexamfetamine', 'vyvanse', 'methylphenidate']
Top reviews for drug Adderall :
                                                  review  rating
2452                                            amazing      10
3156  this medicine is amazing i used to always be t...      10
4016  i am a 23 year old male 1lbs on adderall 2mg x...      10
5781  started off with concerta in may 22 it was not...      10
5936  i have narcolepsy and i take 2mg three times p...      10
LDA topic words: [['2mg', 'xr', 'adderall', 'great', 'nap'], ['taking', 'started', 'like', 'adderall', 'amazing']]
NMF topic words: [['long', 'nap', 'great', 'xr', 'adderall'], ['lot', 'care', 'ranging', 'like', 'amazing']]
Highlighted reviews:
amazing
this medicine is amazing i used to always be the last one to turn in tests in high school now i take one in the morning and in about an hour feel like i am my true self with it i can process information clearly easily and very efficiently now 

In [None]:
# Preprocess text function
# NOTE(review): this cell redefines a function of the same name from an
# earlier cell; consider defining it once.
def preprocess_text(text):
    """Normalise a raw review string.

    Lower-cases the text, strips leading/trailing whitespace, then deletes
    every ASCII punctuation character and every run of digits.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            NaN coming from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Handle non-string values
    text = text.lower().strip()
    # re.escape is required here: pasted raw into a character class,
    # string.punctuation contains the sequence "\]", which the regex engine
    # reads as an escaped "]" -- so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\d+', '', text)
    return text

# Store a cleaned copy of every review in a new 'cleanedReview' column.
# NOTE(review): no later visible code reads 'cleanedReview' -- the helpers
# below select the raw 'review' column; confirm whether this is still needed.
train_data['cleanedReview'] = train_data['review'].apply(preprocess_text)

# Preprocess drug names to handle multiple names
def preprocess_drug_names(drug_names):
    """Split a comma-separated drug-name field into individual names.

    Args:
        drug_names: The raw ``drugName`` cell, e.g. ``'carvedilol, coreg'``.

    Returns:
        A list of names with surrounding whitespace removed. Empty fragments
        (from ``''``, ``'a,,b'`` or a trailing comma) are dropped, and a
        non-string value such as NaN yields ``[]``.
    """
    if not isinstance(drug_names, str):
        # A missing cell arrives as float NaN, which has no .split().
        return []
    fragments = (fragment.strip() for fragment in drug_names.split(','))
    # Skip blanks so an empty string is never counted as a drug.
    return [name for name in fragments if name]

# Create a mapping of conditions to drug names with frequencies
def create_condition_to_drugs_mapping(df):
    """Count, for every condition, how often each drug name occurs.

    Args:
        df: DataFrame with a ``condition`` column and a ``drugName`` column.

    Returns:
        dict mapping the normalised condition (lower-cased, surrounding
        whitespace stripped) to a ``Counter`` of drug name -> number of rows.
    """
    condition_to_drugs = {}
    for condition, drugs in df.groupby('condition')['drugName']:
        # Normalise the key the same way find_drugs_for_condition normalises
        # its query (lower + strip); a key that was only lower-cased could
        # never be matched when the stored label carried stray whitespace.
        key = str(condition).lower().strip()
        # setdefault + update merges groups whose labels differ only in case
        # or whitespace; plain assignment let the last group overwrite the rest.
        drug_counter = condition_to_drugs.setdefault(key, Counter())
        # dropna: a missing drugName cell has nothing to split.
        for drug_list in drugs.dropna():
            drug_counter.update(preprocess_drug_names(drug_list))
    return condition_to_drugs

# Build the lookup once when the cell runs. find_drugs_for_condition reads
# this module-level dict, so it has to exist before that function is called.
condition_to_drugs = create_condition_to_drugs_mapping(train_data)

# Function to find drugs for a given condition
def find_drugs_for_condition(condition, top_n=3):
    """Return the ``top_n`` most frequent drug names for ``condition``.

    The condition is lower-cased and stripped before it is looked up in the
    module-level ``condition_to_drugs`` dict. An unknown condition gives an
    empty list.
    """
    lookup_key = condition.lower().strip()
    ranked = condition_to_drugs.get(lookup_key, Counter()).most_common(top_n)
    # most_common yields (name, count) pairs; only the names are returned.
    return [name for name, _count in ranked]

# Function to get top reviews for a particular drug
def get_top_reviews_for_drug(df, drug_name, top_n=5):
    """Return the highest-rated reviews whose drug name contains ``drug_name``.

    Args:
        df: DataFrame with ``drugName``, ``review`` and ``rating`` columns.
        drug_name: Text to look for. It is matched case-insensitively as a
            literal substring of ``drugName``.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``review`` and ``rating`` columns of the
        ``top_n`` best-rated matching rows; empty when nothing matches.
    """
    # regex=False: drug_name is typed in by the user, so it is plain text, not
    # a pattern. With the default regex=True a name such as "a+b" or "x (y)"
    # is interpreted as a regular expression and either matches the wrong rows
    # or raises re.error.
    is_match = df['drugName'].str.contains(drug_name, case=False, na=False, regex=False)
    return df[is_match].nlargest(top_n, 'rating')[['review', 'rating']]

# Function to highlight words based on topics
def highlight_words(text, words):
    """Wrap every whole-word occurrence of ``words`` in ``<mark>`` tags.

    Matching is case-insensitive and limited to whole words. The original
    casing of the matched text is kept.

    Args:
        text: The text to annotate.
        words: Iterable of words to highlight; duplicates and empty strings
            are ignored.

    Returns:
        ``text`` with each match rewritten as ``<mark>match</mark>``;
        ``text`` itself when ``words`` contains nothing usable.
    """
    # De-duplicate case-insensitively. Callers pass lda_words[0] + nmf_words[0],
    # which usually share words, and substituting one word at a time wrapped
    # such words twice.
    unique_words = {word.lower(): word for word in words if word}
    if not unique_words:
        return text
    # Longest first so the alternation prefers the longer of two candidates.
    alternatives = sorted(unique_words.values(), key=len, reverse=True)
    # re.escape: the words are literal text, not regex fragments.
    pattern = r'\b(' + '|'.join(re.escape(word) for word in alternatives) + r')\b'
    # A single pass means the inserted tags are never re-scanned, so a topic
    # word such as "mark" cannot corrupt tags added for another word.
    return re.sub(pattern, r'<mark>\1</mark>', text, flags=re.IGNORECASE)

# Function to extract topics using LDA and NMF
def extract_topics(reviews, num_topics=2, num_words=5):
    """Fit LDA and NMF on TF-IDF features of ``reviews`` and list topic words.

    Args:
        reviews: Iterable of review strings.
        num_topics: Number of components used for both models.
        num_words: How many words to report per topic; values <= 0 give
            empty word lists.

    Returns:
        ``(lda_words, nmf_words)``: two lists with one entry per topic. Each
        entry holds up to ``num_words`` vocabulary terms ordered by ascending
        component weight, i.e. the strongest word comes last.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(reviews)
    # Fetch the vocabulary once; it used to be rebuilt for every reported word.
    vocabulary = vectorizer.get_feature_names_out()

    def top_words(component):
        """Return the num_words heaviest terms of one topic component."""
        if num_words <= 0:
            # Guard: a slice of [-0:] would return the whole vocabulary.
            return []
        # argsort is ascending, so the last num_words indices are the heaviest.
        return [vocabulary[i] for i in component.argsort()[-num_words:]]

    # LDA
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(tfidf)
    lda_words = [top_words(topic) for topic in lda.components_]

    # NMF
    nmf = NMF(n_components=num_topics, random_state=42)
    nmf.fit(tfidf)
    nmf_words = [top_words(topic) for topic in nmf.components_]

    return lda_words, nmf_words

# Main execution
def main():
    """Prompt for a condition and a drug name, then print a short report.

    The report shows the most frequent drugs for the condition, the top-rated
    reviews for the drug, the LDA/NMF topic words found in those reviews and
    the reviews with the first topic's words highlighted. When the drug has
    no reviews a message is printed and the function returns early.
    """
    # Both answers are collected before any lookup is done.
    condition = input("Enter the condition: ").strip()
    drug_name = input("Enter the drug name: ").strip()

    # Most frequent drugs for the requested condition.
    print("Top drug names for condition", condition, ":", find_drugs_for_condition(condition))

    best_reviews = get_top_reviews_for_drug(train_data, drug_name)
    if best_reviews.empty:
        print(f"No reviews found for the drug {drug_name}.")
        return
    print("Top reviews for drug", drug_name, ":\n", best_reviews)

    # Topic words are derived from the selected reviews only.
    review_texts = best_reviews['review'].tolist()
    lda_words, nmf_words = extract_topics(review_texts)
    print("LDA topic words:", lda_words)
    print("NMF topic words:", nmf_words)

    # Only the first topic of each model drives the highlighting.
    focus_words = lda_words[0] + nmf_words[0]
    marked_reviews = []
    for review_text in review_texts:
        marked_reviews.append(highlight_words(review_text, focus_words))

    print("Highlighted reviews:")
    for marked in marked_reviews:
        print(marked)


if __name__ == "__main__":
    main()

Enter the condition: left ventricular dysfunction
Enter the drug name: valsartan
Top drug names for condition left ventricular dysfunction : ['carvedilol', 'coreg', 'enalapril']
Top reviews for drug valsartan :
                                                   review  rating
10055  i take diovan daily with norvasc 5mg  i have h...      10
11788  i have been on this only since december 27 but...      10
12432  i used several medications before my cardiolog...      10
13118            0  effective for my high blood pressure      10
17259  diovan was prescribed by my doctor nearly 10 y...      10
LDA topic words: [['pressure', 'blood', 'high', 'effective', 'issues'], ['pressure', 'prescribed', 'used', 'diovan', 'years']]
NMF topic words: [['years', 'effective', 'high', 'pressure', 'blood'], ['medicine', 'daily', 'relief', 'edema', 'issues']]
Highlighted reviews:
i take diovan daily with norvasc 5mg  i have had no <mark>issues</mark> with either medicine  i have had no <mark>issues</mark>

In [None]:
import re
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer  # Import from tensorflow.keras
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Import from tensorflow.keras
from tensorflow.keras.utils import to_categorical
from google.colab import drive
from sklearn.preprocessing import MultiLabelBinarizer
from collections import Counter
from sklearn.decomposition import LatentDirichletAllocation, NMF

# Mount Google Drive
drive.mount('/content/drive')

# Load the dataset
# NOTE(review): these are the same two files an earlier cell already reads;
# within this cell only `train_data` is used afterwards, `test_data` is not.
train_data = pd.read_csv('/content/drive/My Drive/drug_cleanedtrain.csv')
test_data = pd.read_csv('/content/drive/My Drive/drug_cleanedtest.csv')

# Preprocess text function
# NOTE(review): this cell redefines a function of the same name from an
# earlier cell; consider defining it once.
def preprocess_text(text):
    """Normalise a raw review string.

    Lower-cases the text, strips leading/trailing whitespace, then deletes
    every ASCII punctuation character and every run of digits.

    Args:
        text: The raw review. Any value that is not a ``str`` (for example a
            NaN coming from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''  # Handle non-string values
    text = text.lower().strip()
    # re.escape is required here: pasted raw into a character class,
    # string.punctuation contains the sequence "\]", which the regex engine
    # reads as an escaped "]" -- so a literal backslash was never removed.
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence.
    text = re.sub(r'\d+', '', text)
    return text

# Store a cleaned copy of every review in a new 'cleanedReview' column.
# NOTE(review): no later visible code reads 'cleanedReview' -- the helpers
# below select the raw 'review' column; confirm whether this is still needed.
train_data['cleanedReview'] = train_data['review'].apply(preprocess_text)

# Preprocess drug names to handle multiple names
def preprocess_drug_names(drug_names):
    """Split a comma-separated drug-name field into individual names.

    Args:
        drug_names: The raw ``drugName`` cell, e.g. ``'carvedilol, coreg'``.

    Returns:
        A list of names with surrounding whitespace removed. Empty fragments
        (from ``''``, ``'a,,b'`` or a trailing comma) are dropped, and a
        non-string value such as NaN yields ``[]``.
    """
    if not isinstance(drug_names, str):
        # A missing cell arrives as float NaN, which has no .split().
        return []
    fragments = (fragment.strip() for fragment in drug_names.split(','))
    # Skip blanks so an empty string is never counted as a drug (and never
    # handed to the per-drug review lookup in main()).
    return [name for name in fragments if name]

# Create a mapping of conditions to drug names with frequencies
def create_condition_to_drugs_mapping(df):
    """Count, for every condition, how often each drug name occurs.

    Args:
        df: DataFrame with a ``condition`` column and a ``drugName`` column.

    Returns:
        dict mapping the normalised condition (lower-cased, surrounding
        whitespace stripped) to a ``Counter`` of drug name -> number of rows.
    """
    condition_to_drugs = {}
    for condition, drugs in df.groupby('condition')['drugName']:
        # Normalise the key the same way find_drugs_for_condition normalises
        # its query (lower + strip); a key that was only lower-cased could
        # never be matched when the stored label carried stray whitespace.
        key = str(condition).lower().strip()
        # setdefault + update merges groups whose labels differ only in case
        # or whitespace; plain assignment let the last group overwrite the rest.
        drug_counter = condition_to_drugs.setdefault(key, Counter())
        # dropna: a missing drugName cell has nothing to split.
        for drug_list in drugs.dropna():
            drug_counter.update(preprocess_drug_names(drug_list))
    return condition_to_drugs

# Build the lookup once when the cell runs. find_drugs_for_condition reads
# this module-level dict, so it has to exist before that function is called.
condition_to_drugs = create_condition_to_drugs_mapping(train_data)

# Function to find drugs for a given condition
def find_drugs_for_condition(condition, top_n=3):
    """Return the ``top_n`` most frequent drug names for ``condition``.

    The condition is lower-cased and stripped before it is looked up in the
    module-level ``condition_to_drugs`` dict. An unknown condition gives an
    empty list.
    """
    lookup_key = condition.lower().strip()
    ranked = condition_to_drugs.get(lookup_key, Counter()).most_common(top_n)
    # most_common yields (name, count) pairs; only the names are returned.
    return [name for name, _count in ranked]

# Function to get top reviews for a particular drug
def get_top_reviews_for_drug(df, drug_name, top_n=5):
    """Return the highest-rated reviews whose drug name contains ``drug_name``.

    Args:
        df: DataFrame with ``drugName``, ``review`` and ``rating`` columns.
        drug_name: Text to look for. It is matched case-insensitively as a
            literal substring of ``drugName``.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the ``review`` and ``rating`` columns of the
        ``top_n`` best-rated matching rows; empty when nothing matches.
    """
    # regex=False: main() passes drug names taken straight from the data, so
    # they are plain text, not patterns. With the default regex=True a name
    # such as "a+b" or "x (y)" is interpreted as a regular expression and
    # either matches the wrong rows or raises re.error.
    is_match = df['drugName'].str.contains(drug_name, case=False, na=False, regex=False)
    return df[is_match].nlargest(top_n, 'rating')[['review', 'rating']]

# Function to highlight words based on topics
def highlight_words(text, words):
    """Wrap every whole-word occurrence of ``words`` in ``<mark>`` tags.

    Matching is case-insensitive and limited to whole words. The original
    casing of the matched text is kept.

    Args:
        text: The text to annotate.
        words: Iterable of words to highlight; duplicates and empty strings
            are ignored.

    Returns:
        ``text`` with each match rewritten as ``<mark>match</mark>``;
        ``text`` itself when ``words`` contains nothing usable.
    """
    # De-duplicate case-insensitively. Callers pass lda_words[0] + nmf_words[0],
    # which usually share words, and substituting one word at a time wrapped
    # such words twice.
    unique_words = {word.lower(): word for word in words if word}
    if not unique_words:
        return text
    # Longest first so the alternation prefers the longer of two candidates.
    alternatives = sorted(unique_words.values(), key=len, reverse=True)
    # re.escape: the words are literal text, not regex fragments.
    pattern = r'\b(' + '|'.join(re.escape(word) for word in alternatives) + r')\b'
    # A single pass means the inserted tags are never re-scanned, so a topic
    # word such as "mark" cannot corrupt tags added for another word.
    return re.sub(pattern, r'<mark>\1</mark>', text, flags=re.IGNORECASE)

# Function to extract topics using LDA and NMF
def extract_topics(reviews, num_topics=2, num_words=5):
    """Fit LDA and NMF on TF-IDF features of ``reviews`` and list topic words.

    Args:
        reviews: Iterable of review strings.
        num_topics: Number of components used for both models.
        num_words: How many words to report per topic; values <= 0 give
            empty word lists.

    Returns:
        ``(lda_words, nmf_words)``: two lists with one entry per topic. Each
        entry holds up to ``num_words`` vocabulary terms ordered by ascending
        component weight, i.e. the strongest word comes last.
    """
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(reviews)
    # Fetch the vocabulary once; it used to be rebuilt for every reported word.
    vocabulary = vectorizer.get_feature_names_out()

    def top_words(component):
        """Return the num_words heaviest terms of one topic component."""
        if num_words <= 0:
            # Guard: a slice of [-0:] would return the whole vocabulary.
            return []
        # argsort is ascending, so the last num_words indices are the heaviest.
        return [vocabulary[i] for i in component.argsort()[-num_words:]]

    # LDA
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(tfidf)
    lda_words = [top_words(topic) for topic in lda.components_]

    # NMF
    nmf = NMF(n_components=num_topics, random_state=42)
    nmf.fit(tfidf)
    nmf_words = [top_words(topic) for topic in nmf.components_]

    return lda_words, nmf_words

# Main execution
def main():
    """Prompt for a condition and print a review report for its top drugs.

    Looks up the most frequent drugs for the entered condition. For each of
    them it prints the top-rated reviews, the LDA/NMF topic words found in
    those reviews and the reviews with the first topic's words highlighted.
    Prints a message and returns when the condition has no drugs; a drug
    without reviews is reported and skipped.
    """

    def report_drug(drug_name):
        """Print the review/topic report for a single drug."""
        best_reviews = get_top_reviews_for_drug(train_data, drug_name)
        if best_reviews.empty:
            print(f"No reviews found for the drug {drug_name}.")
            return

        print(f"Top reviews for drug {drug_name}:\n", best_reviews)

        # Topic words are derived from the selected reviews only.
        review_texts = best_reviews['review'].tolist()
        lda_words, nmf_words = extract_topics(review_texts)
        print(f"LDA topic words for {drug_name}:", lda_words)
        print(f"NMF topic words for {drug_name}:", nmf_words)

        # Only the first topic of each model drives the highlighting.
        focus_words = lda_words[0] + nmf_words[0]
        marked_reviews = []
        for review_text in review_texts:
            marked_reviews.append(highlight_words(review_text, focus_words))

        print(f"Highlighted reviews for {drug_name}:")
        for marked in marked_reviews:
            print(marked)

    condition = input("Enter the condition: ").strip()

    top_drugs_for_condition = find_drugs_for_condition(condition)
    if len(top_drugs_for_condition) == 0:
        print(f"No drugs found for the condition {condition}.")
        return

    print(f"Top drug names for condition {condition}: {top_drugs_for_condition}")

    for drug_name in top_drugs_for_condition:
        report_drug(drug_name)


if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Enter the condition: left ventricular dysfunction
Top drug names for condition left ventricular dysfunction: ['carvedilol', 'coreg', 'enalapril']
Top reviews for drug carvedilol:
                                                   review  rating
15563  i had congestive heart failure 6 years ago in ...      10
20546  great when you start at a low dose then as doc...      10
25301  although it has only been three months my qual...      10
26551  my lvef was about 20 when i started on coreg f...      10
30998  been on this for a year smallest dose twice a ...      10
LDA topic words for carvedilol: [['years', 'thank', 'life', 'heart', 'coreg'], ['doctor', 'low', 'benefit', 'increases', 'dose']]
NMF topic words for carvedilol: [['years', 'life', 'thank', 'heart', 'coreg'], ['low', 'increases', 'start', 'great', 'dose']]
Highlighted reviews for carvedilol:
i had co