In [18]:
import pandas as pd
import numpy as np
import re
import string
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score


#load the datasets
# Directory holding the Drugs.com review files. Change this single constant
# to point the notebook at another copy of the data.
DATA_DIR = '/home/mirko25/Scaricati/drugsCom_raw'

# read_table uses a tab delimiter by default, matching the .tsv extension.
train = pd.read_table(DATA_DIR + '/drugsComTrain_raw.tsv')
test = pd.read_table(DATA_DIR + '/drugsComTest_raw.tsv')


# Text and rating columns of the training set.
reviews = train['review']
rating = train['rating']

# Text and rating columns of the test set.
x_test = test['review']
# NOTE: despite its name this holds the test *ratings*, not the reviews.
# The name is kept so that any other cell referring to it keeps working.
test_reviews = test['rating']


def rating_to_label(value):
    """Map a numeric rating to a sentiment label string.

    Returns '-1' (negative) when the rating is <= 4, '1' (positive) when it
    is >= 7 and '0' (neutral) for anything else.
    """
    if value <= 4:
        return '-1'
    if value >= 7:
        return '1'
    return '0'


#train rating transformation
label = [rating_to_label(value) for value in rating]
negative = label.count('-1')
positive = label.count('1')
neutral = label.count('0')
train['label'] = label

y_train = train['label']


# Named `total` rather than `all` so the built-in all() is not shadowed for
# the rest of the kernel session.
total = positive + negative + neutral
print ( 'Positive rating: ', round((positive/total)*100, 2), '%')
print ( 'Negative rating: ', round((negative/total)*100, 2), '%')
print ( 'Neutral rating: ', round((neutral/total)*100, 2), '%')


#test rating transformation
# Same mapping as for the training ratings, via the shared helper.
test_label = [rating_to_label(value) for value in test_reviews]
test['label'] = test_label

y_test = test['label']


#pre-processing text

#remove punctuation
# Translation table that deletes every character listed in string.punctuation.
# Built once so it is not recomputed for each review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with all ASCII punctuation characters removed.

    Only the characters in ``string.punctuation`` are dropped; letters,
    digits, whitespace and non-ASCII symbols are left untouched.

    Args:
        text: the string to clean.

    Returns:
        A new string without punctuation.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Raw-string patterns: "\d" and "\W" inside a normal string literal are
# invalid escape sequences and raise a warning on recent Python versions.
_DIGITS_RE = re.compile(r"\d+")
_NON_WORD_RE = re.compile(r"[\W_]")


def clean_review(text):
    """Normalise one review string for vectorisation.

    Steps, in order: strip punctuation, lowercase, replace each run of
    digits with a single space, then replace every remaining non-word
    character (or underscore) with a space.
    """
    text = remove_punctuation(text)
    #lowercase
    text = text.lower()
    #remove numbers
    text = _DIGITS_RE.sub(" ", text)
    #remove special characters
    return _NON_WORD_RE.sub(" ", text)


# Same cleaning for the training and the test reviews.
reviews = reviews.apply(clean_review)
x_test = x_test.apply(clean_review)

#transformation
# Word-level counts over 1- to 3-word n-grams; max_df = 0.9 is passed to
# the vectorizer as its document-frequency cap.
vect = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_df=0.9,
)
# The vectorizer is fitted on the training reviews only; the test reviews
# are transformed with that same fitted vectorizer.
mul_reviews = vect.fit_transform(reviews)
mul_x_test = vect.transform(x_test)

#create and fit the model
model = MultinomialNB().fit(mul_reviews, y_train)

prediction_test = model.predict(mul_x_test)

#calculate accuracy and Cohen's Kappa
acc_test = accuracy_score(y_test, prediction_test)
k_test = cohen_kappa_score(y_test, prediction_test)

# Two-decimal values used for both the printout and the summary table.
acc = round(acc_test, 2)
k = round(k_test, 2)

print("\n")
print("Multinomial Naive Bayes")
print("Accuracy: {}".format(acc))
print("Cohen's Kappa: {}".format(k))

#In-domain sentiment analysis
# One-row summary table; the blank index label hides the row number.
df = DataFrame(
    {
        'Aspect': ["Overall Rating"],
        'Source': ["Drugs.com"],
        'Accuracy': [acc],
        "Cohen's Kappa": [k],
    },
    index=[""],
)
df

Positive rating:  66.25 %
Negative rating:  24.85 %
Neutral rating:  8.9 %


Multinomial Naive Bayes
Accuracy: 0.86
Cohen's Kappa: 0.67


Unnamed: 0,Aspect,Source,Accuracy,Cohen's Kappa
,Overall Rating,Drugs.com,0.86,0.67


In [7]:
from sklearn import metrics

# Confusion matrix for the MultinomialNB predictions
# (rows: true labels, columns: predicted labels).
confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=prediction_test,
)
print('\n {}'.format(confusion_matrix))


 [[ 9990     0  3507]
 [  734  1218  2877]
 [  296     4 35140]]


In [17]:
from sklearn.naive_bayes import BernoulliNB

# Second experiment: Bernoulli Naive Bayes on binary n-gram features.

# Same vectorizer settings as the multinomial run, but with binary = True.
vectB = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    max_df=0.9,
    binary=True,
)
ber_reviews = vectB.fit_transform(reviews)
ber_x_test = vectB.transform(x_test)

modelB = BernoulliNB().fit(ber_reviews, y_train)

p_testB = modelB.predict(ber_x_test)

# Raw scores first, then the two-decimal versions that get reported.
accuracy_B = accuracy_score(y_test, p_testB)
kappa_B = cohen_kappa_score(y_test, p_testB)

accB = round(accuracy_B, 2)
kB = round(kappa_B, 2)

print("Bernoulli Naive Bayes")
print("Accuracy: {}".format(accB))
print("Cohen's Kappa: {}".format(kB))

#In-domain sentiment analysis
# One-row summary table; the blank index label hides the row number.
df = DataFrame(
    {
        'Aspect': ["Overall Rating"],
        'Source': ["Drugs.com"],
        'Accuracy': [accB],
        "Cohen's Kappa": [kB],
    },
    index=[""],
)
df

Bernoulli Naive Bayes
Accuracy: 0.77
Cohen's Kappa: 0.42


Unnamed: 0,Aspect,Source,Accuracy,Cohen's Kappa
,Overall Rating,Drugs.com,0.77,0.42


In [19]:

# Confusion matrix for the BernoulliNB predictions
# (rows: true labels, columns: predicted labels).
confusion_matrix = metrics.confusion_matrix(
    y_true=y_test,
    y_pred=p_testB,
)
print('\n {}'.format(confusion_matrix))


 [[ 5967     0  7530]
 [  666   103  4060]
 [   58     4 35378]]
