### Import the Libraries

In [1]:

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import tensorflow as tf 
from tensorflow.keras import layers 
from tensorflow.keras.models import Sequential

### Reading the dataset

In [2]:
# Load the drug-review CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="drug_review_test.csv")

### Exploratory Data Analysis

In [3]:
# Peek at the first five rows to see what each column holds.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,0,163740,Mirtazapine,depression,"""i've tried a few antidepressants over the yea...",10.0,"February 28, 2012",22,68
1,1,206473,Mesalamine,"crohn's disease, maintenance","""my son has crohn's disease and has done very ...",8.0,"May 17, 2009",17,48
2,2,39293,Contrave,weight loss,"""contrave combines drugs that were used for al...",9.0,"March 5, 2017",35,143
3,3,97768,Cyclafem 1 / 35,birth control,"""i have been on this birth control for one cyc...",9.0,"October 22, 2015",4,149
4,4,208087,Zyclara,keratosis,"""4 days in on first 2 weeks. using on arms an...",4.0,"July 3, 2014",13,60


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0       0
patient_id       0
drugName         0
condition        0
review           0
rating           0
date             0
usefulCount      0
review_length    0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0.1,Unnamed: 0,patient_id,rating,usefulCount,review_length
count,46108.0,46108.0,46108.0,46108.0,46108.0
mean,4741.710289,116604.217988,6.980979,28.50527,95.415156
std,2841.941628,66906.119983,3.25699,37.222459,37.716939
min,0.0,0.0,1.0,0.0,31.0
25%,2305.0,59070.75,5.0,6.0,63.0
50%,4610.0,116932.5,8.0,16.0,95.0
75%,7118.0,174279.0,10.0,37.0,130.0
max,9999.0,232284.0,10.0,949.0,1162.0


In [6]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46108 entries, 0 to 46107
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     46108 non-null  int64  
 1   patient_id     46108 non-null  int64  
 2   drugName       46108 non-null  object 
 3   condition      46108 non-null  object 
 4   review         46108 non-null  object 
 5   rating         46108 non-null  float64
 6   date           46108 non-null  object 
 7   usefulCount    46108 non-null  int64  
 8   review_length  46108 non-null  int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 3.2+ MB


In [7]:
# Show the column labels available for feature/target selection.
df.columns

Index(['Unnamed: 0', 'patient_id', 'drugName', 'condition', 'review', 'rating',
       'date', 'usefulCount', 'review_length'],
      dtype='object')

### Extracting the `review` and `rating` columns

In [8]:
# Model input is the free-text review; the target is its numeric rating.
X, y = df["review"], df["rating"]

### Splitting the dataset

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Preprocessing

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The tokenizer expects strings, so force every review to str in both splits.
X_train, X_test = X_train.astype(str), X_test.astype(str)


In [11]:
# Vocabulary cap: the tokenizer only emits indices for the most frequent words,
# and the same value sizes the Embedding layer's input_dim.
max_words = 10000
tokenizer = Tokenizer(num_words=max_words)

# Fit the vocabulary on the training reviews ONLY. Fitting on X_test as well
# would let held-out text influence the word-frequency ranking (data leakage).
# Test-set words that are not in the training vocabulary are simply skipped by
# texts_to_sequences.
tokenizer.fit_on_texts(X_train)

In [12]:
# Turn each review into a list of integer word indices using the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [13]:
# Fixed sequence length fed to the network.
max_len = 100

# Left-pad short reviews with zeros and keep only the last `max_len` tokens of
# long ones ("pre" is pad_sequences' default for both options, spelled out here).
X_train_pad = pad_sequences(
    X_train_seq, maxlen=max_len, padding="pre", truncating="pre"
)
X_test_pad = pad_sequences(
    X_test_seq, maxlen=max_len, padding="pre", truncating="pre"
)

### GRU Model

In [14]:
# GRU regressor for the review rating.
#
# `rating` is a numeric score between 1 and 10 (see df.describe()), so the
# network ends in a single linear unit and is trained with mean squared error.
# A one-unit softmax always outputs exactly 1.0, and binary cross-entropy
# expects targets in [0, 1], so that combination cannot learn these ratings.
model = Sequential([
    # Map each token index (< max_words) to a trainable 64-dimensional vector.
    layers.Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    # Use the GRU's default tanh activation: it keeps the recurrent state
    # bounded (relu can blow up over long sequences) and keeps the layer
    # eligible for the fused cuDNN kernel on GPU.
    layers.GRU(64),
    layers.Dense(128, activation='relu'),
    # Single linear output: the predicted rating.
    layers.Dense(1),
])
# MAE is reported alongside the MSE loss because it reads directly in rating points.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [15]:
# Train for 10 epochs on the padded training sequences (no validation data is
# passed); the returned History object is shown as the cell output.
model.fit(x=X_train_pad, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c84a1bb040>