In [None]:
import html
import os
import re
from functools import lru_cache

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
import nltk
# NOTE(review): 'all' downloads every NLTK collection. The cells visible here only use
# word_tokenize, stopwords and WordNetLemmatizer, so downloading just the resources
# those need would be much lighter — TODO confirm nothing later relies on other packages.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Pac

True

In [None]:
# Needed for Linguistic Analysis
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# We will visualize the messages with a word cloud
from wordcloud import WordCloud


# Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Import Tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Import the Label Encoder
from sklearn.preprocessing import LabelEncoder

# Import the train test split
from sklearn.model_selection import train_test_split

# To evaluate our model
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive at /content/gdrive/ so the dataset CSVs can be read from it
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [None]:
# Load the training CSV from the mounted Drive and preview its first rows
df = pd.read_csv('/content/gdrive/My Drive/selected_dataset/drugsComTrain_raw.csv')
df.head()

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37


In [None]:
# Create Confidence Interval Function
def confidence_interval(data, ci_percent):
  """Return the sample mean with a two-sided Student-t confidence interval.

  Parameters
  ----------
  data : array-like
      Numeric observations; converted to a numpy array internally.
  ci_percent : float
      Confidence level as a fraction (e.g. 0.95 for a 95% interval).

  Returns
  -------
  tuple
      (mean, lower bound, upper bound), where the bounds are the mean minus/plus
      the standard error scaled by the t quantile with len(data) - 1 degrees of
      freedom.
  """
  samples = np.array(data)
  center = np.mean(samples)
  degrees_of_freedom = len(samples) - 1
  # Two-sided interval: put (1 - ci_percent) / 2 of the probability in each tail
  t_quantile = stats.t.ppf((1 + ci_percent) / 2., degrees_of_freedom)
  half_width = stats.sem(samples) * t_quantile
  return (center, center - half_width, center + half_width)


def condition_compare(df, condition_id, ci_percent, sample_size_cutoff):
  """Summarise the ratings of every drug reviewed for one condition.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame providing `condition`, `drugName` and `rating` columns.
  condition_id : str
      Value of `condition` to keep.
  ci_percent : float
      Confidence level forwarded to `confidence_interval`.
  sample_size_cutoff : int
      Only drugs with strictly more ratings than this are reported.

  Returns
  -------
  pandas.DataFrame
      One row per reported drug with columns "Drug Name", "Sample Mean",
      "Lower Bound", "Upper Bound" and "Sample Size", in order of first
      appearance of the drug.
  """
  columns = ["Drug Name", "Sample Mean", "Lower Bound", "Upper Bound", "Sample Size"]
  condition_rows = df[df.condition == condition_id]
  summary_rows = []
  for drug_name in condition_rows.drugName.unique():
    ratings = condition_rows.loc[condition_rows.drugName == drug_name, "rating"]
    n_ratings = ratings.size
    # Skip drugs without enough ratings
    if not n_ratings > sample_size_cutoff:
      continue
    sample_mean, lower, upper = confidence_interval(ratings, ci_percent)
    summary_rows.append([drug_name, sample_mean, lower, upper, n_ratings])
  return pd.DataFrame(summary_rows, columns=columns)


# Compare drugs reviewed for "Depression": keep drugs with more than 10 ratings,
# compute a 95% interval for each, and rank them by sample mean (highest first)
df2 = condition_compare(df, "Depression", 0.95, 10).sort_values(by="Sample Mean", ascending=False)
# Show the three top-ranked drugs
df2.head(3)

Unnamed: 0,Drug Name,Sample Mean,Lower Bound,Upper Bound,Sample Size
62,Niacin,9.857143,9.647474,10.066812,14
47,Tramadol,9.288462,8.934,9.642923,52
68,Clomipramine,9.181818,8.10616,10.257476,11


In [None]:
# Load the provided train and test files
df_train = pd.read_csv('/content/gdrive/My Drive/selected_dataset/drugsComTrain_raw.csv')
df_test = pd.read_csv('/content/gdrive/My Drive/selected_dataset/drugsComTest_raw.csv')

# Stack both files into one frame. ignore_index=True gives the result a unique
# 0..n-1 index; without it each file keeps its own 0-based row labels, so labels
# repeat and label-based lookups such as df_main.loc[0] would return several rows.
df_main = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_main.head(3)

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17


In [None]:
pd.set_option('display.width', 1000)
# The raw reviews contain HTML character references (e.g. "&#039;" for an apostrophe).
# Decode them before tokenizing; otherwise the tokenizer splits each reference into
# pieces like "&", "#", "039", ";", which breaks contractions apart, and the
# alphabetic part of any named reference would survive later filtering as a bogus word.
# Then make the letters lower case and tokenize the words.
tokenized_messages = (
    df_main['review']
    .map(html.unescape)
    .str.lower()
    .apply(word_tokenize)
)

# Print the tokens to see how it looks like
print(tokenized_messages)

0        [``, it, has, no, side, effect, ,, i, take, it...
1        [``, my, son, is, halfway, through, his, fourt...
2        [``, i, used, to, take, another, oral, contrac...
3        [``, this, is, my, first, time, using, any, fo...
4        [``, suboxone, has, completely, turned, my, li...
                               ...                        
53761    [``, i, have, taken, tamoxifen, for, 5, years,...
53762    [``, i, &, #, 039, ;, ve, been, taking, lexapr...
53763    [``, i, &, #, 039, ;, m, married, ,, 34, years...
53764    [``, i, was, prescribed, nucynta, for, severe,...
53765                         [``, it, works, !, !, !, '']
Name: review, Length: 215063, dtype: object


In [None]:
# Define a function that keeps only alphabetic tokens
def alpha(tokens):
    """Filter a token list down to alphabetic words.

    Tokens made up solely of letters are kept unchanged. Two contraction
    tokens are rewritten instead of dropped: "n't" becomes "not" and
    "won't" becomes "wont". Every other token (punctuation, numbers,
    mixed strings) is discarded.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single message.

    Returns
    -------
    list of str
        The retained tokens, in their original order.
    """
    contraction_map = {'n\'t': 'not', 'won\'t': 'wont'}
    kept = []
    for token in tokens:
        if str.isalpha(token):
            kept.append(token)
        elif token in contraction_map:
            kept.append(contraction_map[token])
    return kept

# Apply the filter to every message's token list.
# NOTE: this rebinds tokenized_messages, so re-running the cell filters the already-filtered tokens.
tokenized_messages = tokenized_messages.apply(alpha)

print(tokenized_messages)

0        [it, has, no, side, effect, i, take, it, in, c...
1        [my, son, is, halfway, through, his, fourth, w...
2        [i, used, to, take, another, oral, contracepti...
3        [this, is, my, first, time, using, any, form, ...
4        [suboxone, has, completely, turned, my, life, ...
                               ...                        
53761    [i, have, taken, tamoxifen, for, years, side, ...
53762    [i, ve, been, taking, lexapro, escitaploprgram...
53763    [i, m, married, years, old, and, i, have, no, ...
53764    [i, was, prescribed, nucynta, for, severe, pai...
53765                                          [it, works]
Name: review, Length: 215063, dtype: object


In [None]:
# Define a function to remove stop words
@lru_cache(maxsize=None)
def _english_stop_words():
    """Load the NLTK English stop-word list once and return it as a frozenset."""
    return frozenset(stopwords.words('english'))


def remove_stop_words(tokens):
    """Drop every token that is an NLTK English stop word.

    The stop-word list is loaded a single time and held as a frozenset, so each
    membership test is a hash lookup rather than a fresh call to
    stopwords.words('english') followed by a list scan for every token.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single message.

    Returns
    -------
    list of str
        The tokens that are not stop words, in their original order.
    """
    stop_words = _english_stop_words()
    return [token for token in tokens if token not in stop_words]

# Apply the stop-word filter to every message's token list.
# NOTE: this rebinds tokenized_messages, so re-running the cell filters the already-filtered tokens.
tokenized_messages = tokenized_messages.apply(remove_stop_words)

print(tokenized_messages)

0        [side, effect, take, combination, bystolic, mg...
1        [son, halfway, fourth, week, intuniv, became, ...
2        [used, take, another, oral, contraceptive, pil...
3        [first, time, using, form, birth, control, gla...
4        [suboxone, completely, turned, life, around, f...
                               ...                        
53761    [taken, tamoxifen, years, side, effects, sever...
53762    [taking, lexapro, escitaploprgram, since, febr...
53763    [married, years, old, kids, taking, pill, hass...
53764    [prescribed, nucynta, severe, pain, taking, pi...
53765                                              [works]
Name: review, Length: 215063, dtype: object


In [None]:
# Define a function to lemmatize the tokens
def lemmatize(tokens):
    """Lemmatize each token and join the results into one space-separated string.

    Parameters
    ----------
    tokens : iterable of str, or str
        Tokens of a single message. A plain string is accepted as well and is
        split on whitespace first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    # This function returns a string, so re-running the calling cell passes strings
    # back in. Iterating a string yields single characters, which would turn
    # "side effect" into "s i d e ..."; split it back into words instead.
    if isinstance(tokens, str):
        tokens = tokens.split()
    # Initialize the WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return " ".join(lemmatizer.lemmatize(token) for token in tokens)

# Apply the lemmatizer to every message.
# After this step each entry is a single space-joined string, no longer a token list.
tokenized_messages = tokenized_messages.apply(lemmatize)

print(tokenized_messages)

0        side effect take combination bystolic mg fish oil
1        son halfway fourth week intuniv became concern...
2        used take another oral contraceptive pill cycl...
3        first time using form birth control glad went ...
4        suboxone completely turned life around feel he...
                               ...                        
53761    taken tamoxifen year side effect severe sweati...
53762    taking lexapro escitaploprgram since february ...
53763    married year old kid taking pill hassle decide...
53764    prescribed nucynta severe pain taking pill rus...
53765                                                 work
Name: review, Length: 215063, dtype: object


In [None]:
# Overwrite the 'review' column with the processed messages
df_main['review'] = tokenized_messages

# Display the first ten rows
display(df_main.head(10))

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,side effect take combination bystolic mg fish oil,9,20-May-12,27
1,95260,Guanfacine,ADHD,son halfway fourth week intuniv became concern...,8,27-Apr-10,192
2,92703,Lybrel,Birth Control,used take another oral contraceptive pill cycl...,5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,first time using form birth control glad went ...,8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,suboxone completely turned life around feel he...,9,27-Nov-16,37
5,155963,Cialis,Benign Prostatic Hyperplasia,day started work rock hard erection however ex...,2,28-Nov-15,43
6,165907,Levonorgestrel,Emergency Contraception,pulled cummed bit took plan b hour later took ...,1,7-Mar-17,5
7,102654,Aripiprazole,Bipolar Disorde,abilify changed life hope zoloft clonidine fir...,10,14-Mar-15,32
8,74811,Keppra,Epilepsy,nothing problem keppera constant shaking arm a...,1,9-Aug-16,11
9,48928,Ethinyl estradiol / levonorgestrel,Birth Control,pill many year doctor changed rx chateal effec...,8,8-Dec-16,1


In [None]:
# Select the features (processed review text) and the target (rating)
X = df_main['review']
y = df_main['rating']

In [None]:
# Hold out 20% of the rows for testing, stratified on the rating, with a fixed
# random_state so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=34, stratify=y)

In [None]:

# Create the tf-idf vectorizer
vectorizer = TfidfVectorizer(strip_accents='ascii')
# Fit the vectorizer on the training set only and transform it in the same step
tfidf_train = vectorizer.fit_transform(X_train)

# Transform the test data with the already-fitted vectorizer (it is not re-fitted)
tfidf_test = vectorizer.transform(X_test)

In [None]:
# Initialize the Multinomial Naive Bayes classifier
nb = MultinomialNB()

# Fit the model on the tf-idf training features
nb.fit(tfidf_train, y_train)

# Print the accuracy score on the held-out test set
print("Accuracy:",nb.score(tfidf_test, y_test))

Accuracy: 0.3640527282449492


In [None]:
# Predict the labels
y_pred = nb.predict(tfidf_test)

# Print the Confusion Matrix (rows: true rating, columns: predicted rating)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix\n")
print(cm)

# Print the Classification Report.
# Some ratings are never predicted, which leaves their precision undefined (0/0).
# zero_division=0 reports those as 0 explicitly instead of emitting an
# undefined-metric warning for each averaged score.
cr = classification_report(y_test, y_pred, zero_division=0)
print("\n\nClassification Report\n")
print(cr)




Confusion Matrix

[[ 1857     0     0     0     0     0     0     2    44  3881]
 [  320     0     0     0     0     0     0     7    26  1500]
 [  241     0     0     0     0     0     0     5    33  1465]
 [  134     0     0     0     0     0     0     2    27  1171]
 [  157     0     0     0     1     0     0     2    47  1938]
 [   77     0     0     0     0     0     0     4    48  1563]
 [   61     0     0     0     0     0     1     6   104  2337]
 [   80     0     0     0     0     0     0    31   163  4735]
 [   59     0     0     0     0     0     0     2   307  6974]
 [   72     0     0     0     0     0     0     1    66 13462]]


Classification Report

              precision    recall  f1-score   support

           1       0.61      0.32      0.42      5784
           2       0.00      0.00      0.00      1853
           3       0.00      0.00      0.00      1744
           4       0.00      0.00      0.00      1334
           5       1.00      0.00      0.00      2145
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
