# Sentiment classifiers

In [1]:
import re
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### Sentiment classifier for each category

__Paths to change (dataset location)__

In [2]:
# input variables: root folder of the dataset (kept with its trailing slash,
# because every path below is built by plain string concatenation)
path = './datos/Multi Domain Sentiment/'

# notebook variables: one sub-folder per product category
books_path = f'{path}books/'
dvd_path = f'{path}dvd/'
electronics_path = f'{path}electronics/'
kitchen_path = f'{path}kitchen/'

__Methods for reading the documents__

In [3]:
def get_testing(path: str) -> pd.DataFrame:
    """Load the evaluation reviews stored in a category folder.

    Reads ``unlabeled.review`` located under ``path``. The file name is
    appended to ``path`` by plain string concatenation, so ``path`` must
    already end with a path separator.

    Each line is split at the ``#label#:`` marker into a ``data`` column
    (text before the marker) and a ``label`` column (text after it). The
    label is then encoded as ``0`` for ``negative`` and ``1`` for
    ``positive``; any other label value becomes NaN.

    Returns:
        DataFrame with the columns ``data`` and ``label``.
    """
    reviews = pd.read_csv(
        path + 'unlabeled.review',
        delimiter='#label#:',
        engine='python',  # multi-character separators need the python engine
        header=None,
        names=['data', 'label'],
    )
    label_codes = {'negative': 0, 'positive': 1}
    reviews['label'] = reviews['label'].map(label_codes)
    return reviews

In [4]:
def get_traing(path: str) -> pd.DataFrame:
    """Load the labelled training reviews stored in a category folder.

    Reads ``negative.review`` and then ``positive.review`` from ``path``.
    The file names are appended by plain string concatenation, so ``path``
    must already end with a path separator. Every line is split at the
    ``#label#:`` marker into a ``data`` and a ``label`` column.

    The two frames are stacked (negative rows first) with a fresh 0..n-1
    index, and the label is encoded as ``0`` for ``negative`` and ``1`` for
    ``positive``; any other label value becomes NaN.

    Returns:
        DataFrame with the columns ``data`` and ``label``.
    """
    frames = [
        pd.read_csv(
            path + file_name,
            delimiter='#label#:',
            engine='python',  # multi-character separators need the python engine
            header=None,
            names=['data', 'label'],
        )
        for file_name in ('negative.review', 'positive.review')
    ]
    labelled = pd.concat(frames, axis=0, ignore_index=True)
    labelled['label'] = labelled['label'].map({'negative': 0, 'positive': 1})
    return labelled

In [8]:
def remove_characters(word: str) -> str:
    """Normalise a single item into a token made of letters, ``num`` and ``_``.

    Steps, in order:
      1. drop HTML-entity-like spans (``&...;``),
      2. turn ``-`` into ``_``,
      3. drop every character that is not an ASCII letter, digit or ``_``,
      4. replace each run of digits with ``_num_``,
      5. collapse runs of ``_`` into one and trim ``_`` from both ends.

    Args:
        word: raw item text (without the ``:count`` suffix).

    Returns:
        The cleaned token; may be the empty string when nothing survives.
    """
    # Each entity is matched on its own ("&" up to the NEXT ";"). A greedy
    # "&.*;" would also swallow the real text between two entities, e.g.
    # '&quot;good&quot;' would collapse to an empty string.
    word = re.sub('&[^;]*;', '', word)
    # remove problematic characters
    word = re.sub('-', '_', word)
    word = re.sub('[^a-zA-Z0-9_]', '', word)
    word = re.sub('[0-9]+', '_num_', word)
    word = re.sub('_+', '_', word)
    # underscores are already collapsed, so trimming both ends removes at
    # most one leading and one trailing "_"
    return word.strip('_')

In [9]:
porter_stemmer = PorterStemmer()
def preprocessing(document: str) -> str:
    """Expand an ``item:count`` document into plain repeated-token text.

    ``document`` is a whitespace-separated list of ``item:count`` entries.
    Each item is cleaned with ``remove_characters``, each of its
    ``_``-separated parts is stemmed with the Porter stemmer, and the
    resulting token is emitted ``count`` times.

    Args:
        document: one review in ``item:count item:count ...`` form.

    Returns:
        The tokens joined into one string, each token preceded by a single
        space (so a non-empty result starts with a space). An empty document
        yields ``''``.

    Raises:
        ValueError: if an entry has no ``:`` or its count is not an integer.
    """
    tokens = []
    for entry in document.split():
        # Split on the LAST colon only: the count is always the suffix, and
        # the item itself may contain ':' characters.
        item, count = entry.rsplit(':', 1)
        item = remove_characters(item)
        item = '_'.join(porter_stemmer.stem(part) for part in item.split('_'))
        # A zero or negative count contributes nothing.
        tokens.extend([item] * int(count))
    # Build the string once instead of concatenating inside the loop.
    return ''.join(' ' + token for token in tokens)

In [10]:
# Load the labelled books reviews and shuffle them with a fixed seed.
traning = shuffle(get_traing(books_path), random_state=0)
# Expand each "item:count" review into the token text consumed by the vectorizer.
traning['clean'] = traning.data.apply(preprocessing)
# Seed the preview as well, so the same rows are shown on every re-run.
traning.sample(10, random_state=0)

Unnamed: 0,data,label,clean
1290,coherence_this:1 recommend_it:1 topic:1 additi...,1,coher_thi recommend_it topic addit_to gener c...
1404,sort:1 fortunately_in:1 the_rise:2 thought_tha...,1,sort fortun_in the_rise the_rise thought_that...
1426,writer_mayle:1 again_to:1 mayle's_breakthrough...,1,writer_mayl again_to mayl_breakthrough home d...
1810,on_every:1 uncommon:1 only:2 mr:1 over_a:1 fol...,1,on_everi uncommon onli onli mr over_a follow ...
307,essays:1 feminist_essays:1 of_feminist:1 is_wo...,0,essay feminist_essay of_feminist is_wors coll...
1703,really:1 everything:1 books_says:1 really_work...,1,realli everyth book_say realli_work when_you ...
951,description:1 your:1 anyone:1 book_anymore:1 c...,0,descript your anyon book_anymor color_photogr...
318,disturbing:1 worrying_about:1 mean:1 thought_n...,0,disturb worri_about mean thought_next neighbo...
153,incident:1 revolve:1 cop_stories:1 the_chapter...,0,incid revolv cop_stori the_chapter tie end_is...
1653,hands_and:1 <num>_months:1 whenever_he:1 on_hi...,1,hand_and num_month whenev_he on_hi mama_wave ...


In [11]:
# Load the books evaluation reviews and shuffle them with a fixed seed.
testing = shuffle(get_testing(books_path), random_state=0)
# Apply the same text expansion that was applied to the training reviews.
testing['clean'] = testing.data.apply(preprocessing)
# Seed the preview as well, so the same rows are shown on every re-run.
testing.sample(10, random_state=0)

Unnamed: 0,data,label,clean
2241,i:4 dog:1 the_dog:1 the_mind:1 both:1 how-to.:...,1,i i i i dog the_dog the_mind both how_to beyo...
1002,reading_this:1 mean:1 down:1 i_kept:1 he:5 par...,0,read_thi mean down i_kept he he he he he part...
1621,he:5 of_mcclellan's:1 fascinating_the:1 consta...,1,he he he he he of_mcclellan fascin_the consta...
2908,which:1 find_another:1 around_it:1 diary_i:1 w...,0,which find_anoth around_it diari_i what_happe...
661,"be_""wowwed"":1 after_reading:1 about:1 read:1 t...",1,be_wow after_read about read the_beach page d...
373,comments:1 other_in:1 found:1 problem_is:1 he:...,0,comment other_in found problem_is he wayovera...
3343,made_it:1 to_lookup:1 tried_to:1 i_like:1 book...,0,made_it to_lookup tri_to i_like book_becaus t...
739,right_there:1 cried_multiple:1 with_all:1 doll...,1,right_there cri_multipl with_all dollar dolla...
2626,union_the:1 the_company:1 was:2 boss:2 simply:...,0,union_the the_compani wa wa boss boss simpli ...
1559,your:1 people_have:1 about_<dash-num>:1 pages_...,0,your peopl_have about_dash_num page_im i_seem...


In [12]:
# Bag-of-words counts: the vocabulary is learned from the training text only,
# and the evaluation text is projected onto that same vocabulary.
vectorizer = CountVectorizer()
traning_data = vectorizer.fit_transform(traning['clean'].values)
testing_data = vectorizer.transform(testing['clean'].values)

In [23]:
# number of features (vocabulary terms) produced by the fitted vectorizer
vectorizer.get_feature_names_out().size

169641

In [14]:
# Multinomial Naive Bayes over the token counts, trained on the 0/1 labels.
clf = MultinomialNB()
clf.fit(traning_data, traning['label'].values)

In [15]:
# Predict the evaluation set and report macro-averaged precision/recall/F1
# followed by plain accuracy.
predictions = clf.predict(testing_data)
expected_labels = testing.label.values
macro_metrics = [
    ('Precision score: ', precision_score),
    ('Recall score: ', recall_score),
    ('F1 score: ', f1_score),
]
for caption, metric in macro_metrics:
    print(caption, format(metric(expected_labels, predictions, average='macro')))
print('Accuracy score: ', format(accuracy_score(expected_labels, predictions)))

Precision score:  0.8368396694255156
Recall score:  0.8328657829801103
F1 score:  0.8316437462520466
Accuracy score:  0.832026875699888
