In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# CSVs read later and the saved model artifacts all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
import joblib

In [None]:
# Load the dataset
# The published train/test CSV files are read separately and then merged into
# one frame; the notebook performs its own train/test split further down.
train_file_path = '/content/drive/My Drive/archive drug/drugsComTrain_raw.csv'
test_file_path = '/content/drive/My Drive/archive drug/drugsComTest_raw.csv'
train_data = pd.read_csv(train_file_path)
test_data = pd.read_csv(test_file_path)
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it both
# source frames contribute their own 0-based labels, so label-based lookups
# (data.loc[i]) return two rows and index-aligned assignments can fail.
data = pd.concat([train_data, test_data], ignore_index=True)

In [None]:
# Preprocessing function
def preprocess_text(text):
    """Normalize a raw review into lowercase, letters-only, single-spaced text.

    Args:
        text: The review to clean. Normally a ``str``; missing values
            (``None`` or a float NaN, which is how pandas represents an empty
            CSV cell) and other non-string values are tolerated.

    Returns:
        The cleaned string. Missing values yield ``''``; other non-string
        values are converted with ``str()`` and then cleaned the same way.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits (keep only letters and whitespace)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse runs of whitespace (including newlines/tabs) and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean every review in place so the vectorizers downstream only ever see
# normalized text.
data['review'] = data['review'].map(preprocess_text)

In [None]:
# Preprocess the data
# Encode the ratings only if they are not numerical. The model is a regressor
# trained directly on this column, so numeric ratings must keep their real
# values: running LabelEncoder over numeric ratings renumbers them from 0 and
# shifts every "actual" and "predicted" rating shown later in the notebook.
label_encoder = LabelEncoder()
if not pd.api.types.is_numeric_dtype(data['rating']):
    data['rating'] = label_encoder.fit_transform(data['rating'])

# Split the dataset into train and test sets
# 80/20 split; random_state is fixed so the partition is reproducible.
X = data['review']
y = data['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import hstack

# Two bag-of-words views of each review: raw term counts and TF-IDF weights,
# each created with max_features=5000.
count_vectorizer = CountVectorizer(max_features=5000)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training split only, then reuse the fitted vectorizers on the
# test split so no test-set vocabulary leaks into the features.
X_train_count = count_vectorizer.fit_transform(X_train)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Stack the two views side by side: TF-IDF columns first, count columns second.
X_train_combined = hstack((X_train_tfidf, X_train_count))
X_test_combined = hstack((X_test_tfidf, X_test_count))

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Convert the sparse feature matrices to dense arrays for Keras.
# - The hasattr guard makes this cell safe to re-run: after the first run the
#   variables already hold dense ndarrays, which have no .toarray().
# - Casting to float32 before densifying avoids materialising a wider dtype
#   (the stacked matrix is typically float64), roughly halving peak memory;
#   Keras trains in float32 by default anyway.
if hasattr(X_train_combined, 'toarray'):
    X_train_combined = X_train_combined.astype(np.float32).toarray()
if hasattr(X_test_combined, 'toarray'):
    X_test_combined = X_test_combined.astype(np.float32).toarray()

# Build the model: a small fully connected (feed-forward) regressor.
# Note: despite the Conv1D/Embedding imports above, no convolutional layers
# are used -- the input is a flat bag-of-words vector, not a token sequence.
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_combined.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='linear'))  # Single unbounded output for regression

# Compile the model: optimise MSE, report MAE alongside it during training
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_absolute_error'])

# Train the model, holding out 20% of the training rows for validation
model.fit(X_train_combined, y_train, epochs=5, batch_size=32, validation_split=0.2)

In [None]:
from sklearn.metrics import mean_absolute_error
# Score the trained model on the held-out test split using mean absolute error.
predictions = model.predict(X_test_combined)
mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f"Mean Absolute Error: {mae}")

In [None]:
import os

import joblib

# Make sure the output directory exists before writing anything: it differs
# from the directory the data was read from and is not created anywhere else,
# so on a fresh Drive every save below would otherwise fail.
os.makedirs('/content/drive/My Drive/drug', exist_ok=True)

# Save the model
model.save('/content/drive/My Drive/drug/rating_predict.h5')

# Save the TF-IDF Vectorizer (written with joblib, so it must be read back
# with joblib.load rather than pickle.load)
with open('/content/drive/My Drive/drug/tfidf_vectorizer_rating.pkl', 'wb') as f:
    joblib.dump(tfidf_vectorizer, f)

# Save the CountVectorizer (same joblib format as above)
with open('/content/drive/My Drive/drug/count_vectorizer_rating.pkl', 'wb') as f:
    joblib.dump(count_vectorizer, f)

In [None]:
# Make predictions on the test set
predictions = model.predict(X_test_combined)

# Show some predictions (at most five, in case the test split is tiny)
for i in range(min(5, len(predictions))):
    print(f"Review: {X_test.iloc[i]}")
    print(f"Actual Rating: {y_test.iloc[i]}")
    print(f"Predicted Rating: {predictions[i][0]}\n")

# Load the persisted vectorizers once, before the prompt loop, instead of
# re-reading them from Drive on every input. They were written with
# joblib.dump, so they are read with joblib.load (the `pickle` module is never
# imported in this notebook, and a joblib file should be read by joblib).
# NOTE: joblib/pickle files can execute arbitrary code when loaded -- only
# load artifacts this notebook itself produced.
with open('/content/drive/My Drive/drug/tfidf_vectorizer_rating.pkl', 'rb') as f:
    tfidf_vectorizer = joblib.load(f)

with open('/content/drive/My Drive/drug/count_vectorizer_rating.pkl', 'rb') as f:
    count_vectorizer = joblib.load(f)

# Get user input and show prediction
while True:
    user_input = input("Enter a review to predict its rating (or 'exit' to quit): ")
    if user_input.strip().lower() == 'exit':
        break

    # Apply the same cleaning used on the training reviews; otherwise
    # punctuation and digits tokenize differently at inference time than they
    # did when the vocabulary was fitted.
    cleaned_review = preprocess_text(user_input)
    if not cleaned_review:
        print("Please enter some review text.")
        continue

    # Feature extraction for user input (same column order as training:
    # TF-IDF first, counts second)
    review_tfidf = tfidf_vectorizer.transform([cleaned_review])
    review_count = count_vectorizer.transform([cleaned_review])
    review_combined = hstack([review_tfidf, review_count]).toarray()

    # Predict rating
    prediction = model.predict(review_combined)
    predicted_rating = prediction[0][0]

    print(f"Predicted Rating: {predicted_rating:.2f}")