# Sentiment classifiers

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

### sentiment classifier per each category

__paths to change__

In [2]:
# input variables
path = './datos/Multi Domain Sentiment/'

# notebook variables
books_path = path + 'books/'
dvd_path = path + 'dvd/'
electronics_path = path + 'electronics/'
kitchen_path = path + 'kitchen/'

__Read documents methods__

In [3]:
def get_testing(path: str) -> pd.DataFrame:
    testing = pd.read_csv(path + 'unlabeled.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    testing['label'] = testing.label.map({'negative':0, 'positive':1})
    return testing

In [4]:
def get_traing(path: str) -> pd.DataFrame:
    negative = pd.read_csv(path + 'negative.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    positive = pd.read_csv(path + 'positive.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    traning = pd.concat([negative, positive], axis=0, ignore_index=True)
    traning['label'] = traning.label.map({'negative':0, 'positive':1})
    return traning

In [5]:
def get_corpus(paths: list) -> pd.DataFrame:
    corpus = pd.DataFrame()
    for path in paths:
        negative = pd.read_csv(path + 'negative.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
        positive = pd.read_csv(path + 'positive.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
        unlabeled = pd.read_csv(path + 'unlabeled.review', delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
        corpus = pd.concat([corpus, negative, positive, unlabeled], axis=0, ignore_index=True)
    corpus['label'] = corpus.label.map({'negative':0, 'positive':1})
    return corpus

In [6]:
def remove_characters(word: str) -> str:
    # url tag replace
    url_list = ['http', 'www']
    for url in url_list:
        if url in word: 
            word = '<url>'

    # replace &something; characters
    mt = re.match(r'&.*;($|.*)', word)
    if mt: 
        word = word.replace(mt[0], '')

    # remove problematic characters
    remove_list = ['&quot;', '\x1a', '[', ']', '(', ')', '!', '¡', '"', '\'', '*', ',', '.', '/', '\\', '#', '&', '+', ';', '?', '@', '%', '$', '`', '=', '~', '�']
    for remove in remove_list:
        word = word.replace(remove, '')

    # replace dummy characters
    word = word.replace('-_', '_').replace('_-', '_')
    while '--' in word:
        word = word.replace('--', '-')
    while '__' in word:
        word = word.replace('__', '_')

    # replace years
    mt = re.match(r'.*([1-3][0-9]{3})', word)
    if mt: 
        word = word.replace(mt[0], '<year>')

    # remove characters that starts with - or _
    mt = re.match(r'-.*', word)
    if mt: 
        word = word.replace(mt[0], '', 1)
    mt = re.match(r'_.*', word)
    if mt: 
        word = word.replace(mt[0], '', 1)

    # remove characters that ends with - or _
    mt = re.match(r'.*-$', word)
    if mt: 
        word = re.sub(r'.$', '', word)
    mt = re.match(r'.*_$', word)
    if mt: 
        word = re.sub(r'.$', '', word)

    # replace numbers with <num> tag
    fa = re.findall('[0-9]+', word)
    if fa:
        fa = sorted(fa, reverse=True)
        for num in fa:
            word = word.replace(num, '<num>')
    fa = re.findall('[0-9]+', word)
    if fa:
        for num in fa:
            word = word.replace(num, '')
    word = word.replace('<num><num>', '<num>')
    return word

In [7]:
porter_stemmer = PorterStemmer()
def preprocessing(document: str) -> str:
    words = document.split()
    result = ''
    for word in words:
        item, cant = word.split(':')
        item = remove_characters(item)
        item = '_'.join([porter_stemmer.stem(word) for word in item.split('_')])
        i, c = 0, int(cant)
        while i < c:
            result = result + ' ' + item
            i += 1
    return result

In [8]:
traning = shuffle(get_traing(books_path), random_state=0)
print(traning.shape)
traning['clean'] = traning.data.apply(preprocessing)
traning.head()

(2000, 2)


Unnamed: 0,data,label,clean
405,he:7 someone_like:1 guy:2 this_big:1 doubt_put...,0,he he he he he he he someon_like guy guy thi_...
1190,you'll:1 reading:1 good:1 bound:1 james_allen:...,1,youll read good bound jame_allen jame like li...
1132,i:2 baby-boomers)_are:1 things_from:1 will_kno...,1,i i baby-boom_are thing_from will_know way_i ...
731,party_and:1 only:1 like:9 must_try:1 one:2 one...,0,parti_and onli like like like like like like ...
1754,more_books:1 bad_there:1 all:1 all_wish:1 mitf...,1,more_book bad_there all all_wish mitford as_u...


In [9]:
testing = shuffle(get_testing(books_path), random_state=0)
print(testing.shape)
testing['clean'] = testing.data.apply(preprocessing)
testing.head()

(4465, 2)


Unnamed: 0,data,label,clean
4263,circle_but:1 vs:1 at_all:1 and_valley:1 the_co...,1,circl_but vs at_all and_valley the_conclus mi...
2475,made_it:1 before_i:1 i'd_better:1 while:1 the_...,0,made_it befor_i id_better while the_foreign a...
1206,is_no:1 providing_valuable:1 about:1 reviewer:...,1,is_no provid_valuabl about review prove_the u...
1464,teen_but:1 get:2 reciepes!!simple_fun:1 made:1...,1,teen_but get get reciepessimpl_fun made veget...
2550,i:8 enough_parts:1 sort:2 middle:1 more_worthw...,0,i i i i i i i i enough_part sort sort middl m...


In [10]:
vectorizer = CountVectorizer()
train_x, train_y = train_test_split(traning, random_state=42)
traning_data = vectorizer.fit_transform(train_x.clean)
print(traning_data.shape)
testing_data = vectorizer.transform(testing.clean)
print(testing_data.shape)

(1500, 134977)
(4465, 134977)


In [11]:
vect = vectorizer.get_feature_names_out()
print(vect.size)

134977


In [12]:
testing_data.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [13]:
clf = CategoricalNB()
clf.fit(traning_data.toarray(), train_x.label)

In [14]:
predictions = clf.predict(testing_data.toarray()[2:3])
predictions[0]

1

In [15]:
testing_data.toarray()[2:3]

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
testing

Unnamed: 0,data,label,clean
4263,circle_but:1 vs:1 at_all:1 and_valley:1 the_co...,1,circl_but vs at_all and_valley the_conclus mi...
2475,made_it:1 before_i:1 i'd_better:1 while:1 the_...,0,made_it befor_i id_better while the_foreign a...
1206,is_no:1 providing_valuable:1 about:1 reviewer:...,1,is_no provid_valuabl about review prove_the u...
1464,teen_but:1 get:2 reciepes!!simple_fun:1 made:1...,1,teen_but get get reciepessimpl_fun made veget...
2550,i:8 enough_parts:1 sort:2 middle:1 more_worthw...,0,i i i i i i i i enough_part sort sort middl m...
...,...,...,...
1033,art:1 seduction:2 learn:2 greene:1 believe:1 g...,1,art seduct seduct learn learn green believ gr...
3264,lack_of:1 it_shows:1 journalism:1 how_they:1 i...,0,lack_of it_show journal how_they in_hi high_l...
1653,important_is:1 decision_like:1 martha_rules:1 ...,0,import_is decis_like martha_rule we_loos hone...
2607,your:1 anyone:1 well:1 did_you:1 and_useful:1 ...,1,your anyon well did_you and_use topic so love...
