In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and model
# paths used further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [None]:
# Load the dataset
# Both CSV splits are read from the mounted Google Drive folder.
train_file_path = '/content/drive/My Drive/archive drug/drugsComTrain_raw.csv'
test_file_path = '/content/drive/My Drive/archive drug/drugsComTest_raw.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
# The two splits are stacked into one corpus for topic modelling.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both files contribute their own 0-based row labels, so labels repeat and
# label-based lookups or index alignment on `df` become ambiguous.
df = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Fetch the NLTK resources needed for tokenising and stop-word removal.
# 'punkt' is the tokenizer data used by older NLTK releases; newer releases
# (3.9+) load 'punkt_tab' instead, so both are requested to work on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Preprocessing function
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of content tokens.

    Steps: lower-case, strip digit runs, strip every character that is neither
    a word character nor whitespace, tokenise with NLTK's ``word_tokenize``,
    then drop English stop words.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example the
            ``None``/``NaN`` pandas uses for missing values) is treated as an
            empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing is left
        or the input was not a string.
    """
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    text = text.lower()  # Convert to lower case
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

    # The stop-word list is loaded once and held as a frozenset: calling
    # stopwords.words('english') inside the filter re-read the list for every
    # single token and tested membership against a list.
    stop_words = _english_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)

# Run preprocess_text over every entry of the 'review' column and store the
# cleaned text in a new 'clean_review' column (the raw column is kept as-is).
df['clean_review'] = df['review'].apply(preprocess_text)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned reviews into a TF-IDF document-term matrix.
# Terms present in more than 95% of the documents or in fewer than 2 documents
# are dropped, as are scikit-learn's built-in English stop words.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.95)
X = vectorizer.fit_transform(df['clean_review'])

# Vocabulary terms, indexed the same way as the columns of X
feature_names = vectorizer.get_feature_names_out()

# Five-topic LDA; fit_transform also returns the per-document topic weights.
# NOTE(review): scikit-learn's topic-extraction example feeds LDA raw term
# counts (CountVectorizer) rather than TF-IDF weights - confirm TF-IDF input
# is intended here.
lda_model = LatentDirichletAllocation(random_state=42, n_components=5)
lda_topics = lda_model.fit_transform(X)

# Five-component NMF fitted on the same matrix
nmf_model = NMF(random_state=42, n_components=5)
nmf_topics = nmf_model.fit_transform(X)

In [None]:
import numpy as np

# Map every topic of a fitted model to its highest-weighted vocabulary terms
def get_top_words(model, feature_names, n_top_words=10):
    """Collect the strongest terms of each topic in ``model``.

    Args:
        model: Fitted object exposing ``components_``, one row of term
            weights per topic.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to keep per topic.

    Returns:
        Dict of topic index -> list of terms ordered from highest weight to
        lowest.
    """
    return {
        idx: [feature_names[col] for col in weights.argsort()[::-1][:n_top_words]]
        for idx, weights in enumerate(model.components_)
    }

# Get top words from LDA and NMF
lda_top_words = get_top_words(lda_model, feature_names)
nmf_top_words = get_top_words(nmf_model, feature_names)

# Combine top words from both models
# dict.fromkeys removes duplicates while keeping first-seen order (LDA words
# first, then NMF words), so the lists come out the same on every run.
# list(set(...)) ordered the words by string hash, which Python randomises
# per process, so the output changed from session to session.
# NOTE(review): LDA topic i and NMF topic i are merged purely because they
# share an index; nothing here aligns the two models' topics - confirm this
# pairing is intended.
combined_top_words = {}
for topic_idx in range(len(lda_top_words)):
    combined_top_words[topic_idx] = list(
        dict.fromkeys(lda_top_words[topic_idx] + nmf_top_words[topic_idx])
    )

In [None]:
def get_helpful_words(review, vectorizer, lda_model, nmf_model, top_words):
    """Return the topic words associated with a single review.

    The review is cleaned with ``preprocess_text``, vectorised, and scored by
    both topic models; the word lists of each model's highest-scoring topic
    are then merged.

    Args:
        review: Raw review text.
        vectorizer: Fitted text vectorizer whose ``transform`` returns a
            SciPy sparse matrix (as scikit-learn's text vectorizers do).
        lda_model: Fitted model whose ``transform`` yields topic weights.
        nmf_model: Second fitted model, used the same way.
        top_words: Mapping of topic index -> list of words. The same mapping
            is indexed with both models' winning topic.

    Returns:
        De-duplicated list of words, those of the LDA-selected topic first,
        followed by any additional ones from the NMF-selected topic. Empty
        when the review contains no term known to the vectorizer.
    """
    # Preprocess the review
    cleaned_review = preprocess_text(review)
    review_vector = vectorizer.transform([cleaned_review])

    # With no known term the vector is all zeros and the topic scores carry no
    # information; argmax would then silently fall back to topic 0. Report
    # "nothing found" instead of an arbitrary topic's words.
    if review_vector.nnz == 0:
        return []

    # Get topic distributions
    lda_distribution = lda_model.transform(review_vector)
    nmf_distribution = nmf_model.transform(review_vector)

    # Identify the most relevant topic (plain ints make clean dict keys)
    lda_topic = int(np.argmax(lda_distribution))
    nmf_topic = int(np.argmax(nmf_distribution))

    # Combine the words from the most relevant topics of LDA and NMF.
    # dict.fromkeys de-duplicates in first-seen order, so the result is
    # reproducible; a set's iteration order changes between Python sessions.
    return list(dict.fromkeys(top_words[lda_topic] + top_words[nmf_topic]))

# Example usage
# Runs the helper on a hard-coded sample review, using the fitted vectorizer,
# both topic models and the combined per-topic word lists, then prints the result.
review = "The medication worked well, but it caused some side effects."
helpful_words = get_helpful_words(review, vectorizer, lda_model, nmf_model, combined_top_words)
print(f"Helpful words: {helpful_words}")

Helpful words: ['days', 'depression', 'period', 'feel', 'months', 'mg', 'day', 'ive', 'took', 'birth', 'month', 'control', 'life', 'sleep', 'works', 'years', 'taking', 'periods', 'im', 'hours', 'like', 'anxiety', 'medication', 'acne', 'time', 'bleeding', 'pain', 'pill', 'effects']


In [None]:
# Example usage
# Interactive variant: reads one review from standard input (no prompt text is
# shown) and overwrites the `review` / `helpful_words` variables set earlier.
review = input()
helpful_words = get_helpful_words(review, vectorizer, lda_model, nmf_model, combined_top_words)
print(f"Helpful words: {helpful_words}")

This drug was really very useful and it cured my condition
Helpful words: ['days', 'depression', 'period', 'feel', 'months', 'mg', 'day', 'ive', 'took', 'birth', 'month', 'control', 'life', 'sleep', 'works', 'years', 'taking', 'periods', 'im', 'hours', 'like', 'anxiety', 'medication', 'acne', 'time', 'bleeding', 'pain', 'pill', 'effects']


In [None]:
import os

import joblib

# The artifacts go to a different Drive folder than the one the data was read
# from; create it if needed so a long training run does not end in a
# FileNotFoundError on the very last cell.
os.makedirs('/content/drive/My Drive/drug', exist_ok=True)

# NOTE(review): combined_top_words is not persisted, yet get_helpful_words
# needs it - whoever reloads these files must rebuild it from the models.

# Save the LDA model
joblib.dump(lda_model, '/content/drive/My Drive/drug/lda_model.joblib')

# Save the NMF model
joblib.dump(nmf_model, '/content/drive/My Drive/drug/nmf_model.joblib')

# Save the vectorizer
joblib.dump(vectorizer, '/content/drive/My Drive/drug/vectorizer.joblib')

['/content/drive/My Drive/drug/vectorizer.joblib']