In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the dataset
# and output paths used by later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Embedding, GlobalMaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import re
import pickle

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
train_file_path = '/content/drive/My Drive/drugsComTrain_raw.csv'
test_file_path = '/content/drive/My Drive/drugsComTest_raw.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)

# Stack both files into one frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it each file keeps its own row labels, so the combined
# frame would contain duplicate index values and label-based access
# (data.loc[i]) would be ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalise a raw review string for tokenisation.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace (digits and punctuation are removed, not replaced by a space),
    then collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    text : str or any
        The raw review. Non-string values (e.g. ``None`` or the float NaN that
        pandas uses for a missing cell) are treated as an empty review.

    Returns
    -------
    str
        The cleaned text; ``''`` for non-string input.
    """
    # This runs before missing values are filled in, so guard against
    # non-string cells instead of failing on ``.lower()``.
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every review element-wise and write the result back to the same column
data['review'] = data['review'].map(preprocess_text)

In [None]:
# Replace every missing value in the frame with an empty string
data = data.fillna(value='')

# Integer-encode both target columns; the fitted encoders are kept so the
# codes can be mapped back to names later
le_drug, le_condition = LabelEncoder(), LabelEncoder()
data['drugName'] = le_drug.fit_transform(data['drugName'])
data['condition'] = le_condition.fit_transform(data['condition'])

# Features are the review text; targets are the two encoded label columns
X, y = data['review'], data[['drugName', 'condition']]

# Hold out 20% of the rows, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from scipy.sparse import hstack

# Two bag-of-words views of the reviews, each capped at 5000 vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
count_vectorizer = CountVectorizer(max_features=5000)

# Learn each vocabulary on the training reviews only, then reuse it for the test reviews
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
cv_train = count_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
cv_test = count_vectorizer.transform(X_test)

# Place the TF-IDF columns and the raw-count columns side by side (sparse)
X_train_combined = hstack((tfidf_train, cv_train))
X_test_combined = hstack((tfidf_test, cv_test))

In [None]:
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Embedding, GlobalMaxPooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

# Assuming X_train, X_test, y_train, y_test, le_drug and le_condition are already defined

# Tokenize the text data. vocab_size is shared by the tokenizer and the
# embedding layer so the two cannot drift apart.
vocab_size = 10000
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad the sequences
max_length = 500
X_train_padded = pad_sequences(X_train_seq, maxlen=max_length)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_length)

# y_train / y_test hold two *integer-encoded* label columns. They are two
# separate classification targets, not a 2-class probability vector, so each
# gets its own softmax head sized to its encoder's number of classes and is
# trained with the sparse (integer-label) cross-entropy.
y_train_targets = [y_train['drugName'].to_numpy(), y_train['condition'].to_numpy()]
y_test_targets = [y_test['drugName'].to_numpy(), y_test['condition'].to_numpy()]
num_drugs = len(le_drug.classes_)
num_conditions = len(le_condition.classes_)

# Create the CNN model: a shared convolutional trunk with two output heads
inputs = Input(shape=(max_length,))
x = Embedding(input_dim=vocab_size, output_dim=128)(inputs)
x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.5)(x)
x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Dropout(0.5)(x)
x = GlobalMaxPooling1D()(x)  # Replaces Flatten
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
drug_output = Dense(num_drugs, activation='softmax', name='drug')(x)
condition_output = Dense(num_conditions, activation='softmax', name='condition')(x)
model = Model(inputs=inputs, outputs=[drug_output, condition_output])

# Compile the model: one loss and one accuracy metric per head, listed in the
# same order as the model outputs (drug, condition)
model.compile(
    optimizer='adam',
    loss=['sparse_categorical_crossentropy', 'sparse_categorical_crossentropy'],
    metrics=[['accuracy'], ['accuracy']],
)

# Train the model
# NOTE(review): the held-out test split doubles as validation data here, so the
# reported validation scores are not an independent test estimate.
model.fit(
    X_train_padded,
    y_train_targets,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_padded, y_test_targets),
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7fa6a78b3520>

In [None]:
def get_reviews_and_conditions(drug_name, limit=5):
    """Look up sample reviews and the conditions recorded for a drug.

    Reads the module-level ``data`` frame (with label-encoded ``drugName`` and
    ``condition`` columns) and the fitted ``le_drug`` / ``le_condition``
    encoders.

    Parameters
    ----------
    drug_name : str
        Drug name exactly as it was seen by ``le_drug`` (case-sensitive).
    limit : int, optional
        Maximum number of distinct conditions and distinct reviews to return.
        Defaults to 5.

    Returns
    -------
    tuple
        ``(conditions, reviews)``: the first ``limit`` distinct condition
        names and the first ``limit`` distinct review texts for the drug, each
        in order of first appearance. The two sequences are independent and
        are not aligned element by element.
        If the drug is unknown, returns
        ``("Drug name not found in the dataset", [])``.
    """
    # Only the encoder lookup can legitimately fail with ValueError (unseen
    # label), so the try block is limited to it rather than hiding unrelated
    # errors from the filtering code below.
    try:
        # Transform the drug name to its encoded form
        drug_name_encoded = le_drug.transform([drug_name])[0]
    except ValueError:
        return "Drug name not found in the dataset", []

    # Filter the dataset for the given drug name
    filtered_data = data[data['drugName'] == drug_name_encoded]

    # Distinct conditions and distinct reviews, in order of first appearance
    unique_conditions = filtered_data['condition'].unique()[:limit]
    reviews = filtered_data['review'].unique()[:limit]

    # Inverse transform the condition codes to their original names
    conditions = le_condition.inverse_transform(unique_conditions)

    return conditions, reviews

In [None]:
# Fixed example query standing in for user input
user_input = 'Aspirin'
conditions, reviews = get_reviews_and_conditions(user_input)

# Report both results, one per line
print(
    f"Conditions associated with {user_input}: {conditions}",
    f"Reviews for {user_input}: {reviews}",
    sep="\n",
)

In [None]:
# Ask the user for a drug name. A prompt makes the otherwise blank input box
# self-explanatory, and stripping surrounding whitespace stops an accidental
# space from turning a valid name into a "not found" result.
user_input = input("Enter a drug name: ").strip()
conditions, reviews = get_reviews_and_conditions(user_input)
print(f"Conditions associated with {user_input}: {conditions}")
print(f"Reviews for {user_input}: {reviews}")

In [None]:
import os

import joblib

# Every artefact is written to one Drive folder. Create it first so the saves
# below do not fail when the folder does not exist yet.
save_dir = '/content/drive/My Drive/drug'
os.makedirs(save_dir, exist_ok=True)

# Save the model
model.save(os.path.join(save_dir, 'drug_exploration.h5'))

# Save the tokenizer and the label encoders; they are needed to turn new text
# into model inputs and to map predicted codes back to names
for artifact_name, artifact in (
    ('tokenizer_explore.pkl', tokenizer),
    ('le_drug.pkl', le_drug),
    ('le_condition.pkl', le_condition),
):
    with open(os.path.join(save_dir, artifact_name), 'wb') as file:
        joblib.dump(artifact, file)