# Multi-Domain Sentiment Analysis

## importing the libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
tf.config.set_visible_devices([], 'GPU')

## importing the dataset

In [2]:
train_data = pd.read_csv('/Users/mohmmadmusaddique/Sentimental Analysis/Datasets/drugsComTrain_raw.tsv', on_bad_lines='skip', delimiter='\t')
test_data = pd.read_csv('/Users/mohmmadmusaddique/Sentimental Analysis/Datasets/drugsComTest_raw.tsv', on_bad_lines='skip', delimiter='\t')
data = pd.concat([train_data, test_data], axis=0)

## Data preprocessing

In [3]:
# Dropping the columns which are not required
data.drop(['Unnamed: 0', 'date'], axis=1, inplace=True)

In [4]:
data.head()

Unnamed: 0,drugName,condition,review,rating,usefulCount
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,27
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,192
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,17
3,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,10
4,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,37


In [5]:
# Sorting the data according to the drug name
def sort_data(data):
    cols = list(data.columns.values)
    cols.pop(cols.index('drugName'))
    data = data[cols+['drugName']]
    data = data.sort_values(by=['drugName'])
    return data

data = sort_data(train_data)

In [6]:
# Cleaning the data
def clean_data(data, col):
    data[col] = data[col].str.replace(' ', '_')
    data[col] = data[col].str.replace('-', '_')
    data[col] = data[col].str.replace('/', '_')
    data[col] = data[col].str.replace(',', '_')
    data[col] = data[col].str.replace('.', '_')
    data[col] = data[col].str.replace('(', '_')
    data[col] = data[col].str.replace(')', '_')
    data[col] = data[col].str.replace('[', '_')
    data[col] = data[col].str.replace(']', '_')
    data[col] = data[col].str.replace('<', '_')
    data[col] = data[col].str.replace('>', '_')
    data[col] = data[col].str.replace('=', '_')
    data[col] = data[col].str.lower()
    data[col] = data[col].str.replace('_____', '_')
    data[col] = data[col].str.replace('____', '_')
    data[col] = data[col].str.replace('___', '_')
    data[col] = data[col].str.replace('__', '_')
    return data

In [7]:
data = clean_data(data, 'drugName')
data = clean_data(data, 'condition')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,condition,review,rating,date,usefulCount,drugName
9892,163935,bacterial_skin_infection,"""I have severe cracked skin on my hands. I&#0...",10.0,"December 7, 2009",6,a_+_d_cracked_skin_relief
18402,131173,otitis_media,"""It numbs the pain. It makes my ear feel heavi...",10.0,"September 23, 2009",20,a_b_otic
67773,70690,hiv_infection,"""I was diagnosed in January 2011. My own immun...",9.0,"May 1, 2017",6,abacavir_dolutegravir_lamivudine
68300,70730,hiv_infection,"""I recently found out about my positive status...",7.0,"January 3, 2015",26,abacavir_dolutegravir_lamivudine
137301,70740,hiv_infection,"""I was diagnosed in 2007 and had since never h...",10.0,"December 24, 2015",9,abacavir_dolutegravir_lamivudine


In [9]:
# Dropping the missing values
data.isnull().sum()

Unnamed: 0       0
condition      899
review           0
rating           0
date             0
usefulCount      0
drugName         0
dtype: int64

In [10]:
data.dropna(inplace=True)

In [11]:
data.isnull().sum()

Unnamed: 0     0
condition      0
review         0
rating         0
date           0
usefulCount    0
drugName       0
dtype: int64

## Preprocessing for Sentimental Analysis

In [12]:
corpus = data['review']

In [13]:
# Tokenizing the 'review' column into sequences of words using the Keras tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [14]:
tokenizer = Tokenizer(num_words=10000, oov_token='')
tokenizer.fit_on_texts(corpus)
X_train = tokenizer.texts_to_sequences(corpus)
X_train = pad_sequences(X_train, maxlen=120, truncating='post')

In [15]:
# New review sentimental analysis function
def predict_new_review(review):
    review = tokenizer.texts_to_sequences(review)
    review = pad_sequences(review, maxlen=120)
    answer = model.predict(review)
    if answer[0][0] > 0.5:
        print('Positive Review')
    else:
        print('Negative Review')

In [16]:
# Open the saved model and use it to predict the sentiment of a new review
import joblib
model = joblib.load('Sentimental_Analysis_Model.sav')

Keras model archive loading:
File Name                                             Modified             Size
config.json                                    2023-03-23 21:18:48         2921
metadata.json                                  2023-03-23 21:18:48           64
variables.h5                                   2023-03-23 21:18:48      3560824
Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
......

In [17]:
# Chceking the model
predict_new_review(['positive'])

2023-03-24 11:20:42.415042: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Positive Review


In [18]:
data['review'] = model.predict(X_train)



In [19]:
#exporting the data['review']
data['review'].to_csv('Sentimental_Analysis.csv')

In [20]:
data['review'] = pd.cut(data['review'], bins=[0, 0.5, 1], labels=['negative', 'positive'])

In [21]:
data.head()

Unnamed: 0.1,Unnamed: 0,condition,review,rating,date,usefulCount,drugName
9892,163935,bacterial_skin_infection,positive,10.0,"December 7, 2009",6,a_+_d_cracked_skin_relief
18402,131173,otitis_media,positive,10.0,"September 23, 2009",20,a_b_otic
67773,70690,hiv_infection,positive,9.0,"May 1, 2017",6,abacavir_dolutegravir_lamivudine
68300,70730,hiv_infection,negative,7.0,"January 3, 2015",26,abacavir_dolutegravir_lamivudine
137301,70740,hiv_infection,positive,10.0,"December 24, 2015",9,abacavir_dolutegravir_lamivudine


In [22]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [23]:
# Preprocess the categorical variables
le = LabelEncoder()
ohe = OneHotEncoder()

In [24]:
X_wide = ohe.fit_transform(data[['condition', 'review']]).toarray()

In [25]:
X_deep = np.hstack((ohe.fit_transform(data[['condition', 'review']]).toarray(), data[['rating', 'usefulCount']].values.reshape(-1, 2)))

In [26]:
# Preprocess the target variable
y = le.fit_transform(data['drugName'])

## Building W&DNN model

In [27]:
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model

In [28]:
# Define the wide and deep parts of the model
input_wide = Input(shape=(X_wide.shape[1],))
input_deep = Input(shape=(X_deep.shape[1],))
hidden1 = Dense(64, activation='relu')(input_deep)
dropout1 = Dropout(0.5)(hidden1)
hidden2 = Dense(32, activation='relu')(dropout1)
dropout2 = Dropout(0.5)(hidden2)
concat = Concatenate()([input_wide, dropout2])
output = Dense(len(le.classes_), activation='softmax')(concat)

In [29]:
# Compile the model
dnn = Model(inputs=[input_wide, input_deep], outputs=output)

In [30]:
dnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [31]:
# Train the model
dnn.fit([X_wide, X_deep], y, validation_split=0.2, epochs=25, batch_size=32)

Epoch 1/25


2023-03-24 11:25:32.343318: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x16bddbdc0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2023-03-24 11:25:32.343340: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Host, Default Version
2023-03-24 11:25:32.372282: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.


   5/4010 [..............................] - ETA: 2:17 - loss: 8.1504 - accuracy: 0.0000e+00 

2023-03-24 11:25:32.592861: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x298492080>

In [33]:
# Define the new input
condition = "birth_control"
review = "positive"
rating = 8
usefulCount = 5

# Preprocess the new input
new_wide = ohe.transform([[condition, review]]).toarray()
new_deep = np.hstack((ohe.transform([[condition, review]]).toarray(), np.array([rating, usefulCount]).reshape(-1, 2)))

# Predict the new input
prediction = dnn.predict([new_wide, new_deep])
predicted_drug_index = np.argmax(prediction)
predicted_drug = le.inverse_transform([predicted_drug_index])[0]
print('The predicted drug is: {}'.format(predicted_drug))

The predicted drug is: etonogestrel


## Export model using pikle

In [34]:
import pickle
filename = 'Drug_Prediction.sav'
pickle.dump(model, open(filename, 'wb'))

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers
......dense
.........vars
............0
............1
......dense_1
.........vars
............0
............1
......dropout
.........vars
......dropout_1
.........vars
......embedding
.........vars
............0
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
...optimizer
......vars
.........0
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-03-24 11:54:37         2921
metadata.json                                  2023-03-24 11:54:37           64
variables.h5                                   2023-03-24 11:54:37      1197448


In [35]:
#open the saved model and use it to predict the sentiment of a new review
# import joblib
# model = joblib.load('Sentimental_Analysis_Model.sav')
# predict_new_review(['positive'])