# Sentiment classifiers

In [1]:
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## sentiment classifier per each category

__paths to change__

In [2]:
# input variables
path = './datos/Multi Domain Sentiment/'

# notebook variables
books_path = path + 'books/'
dvd_path = path + 'dvd/'
electronics_path = path + 'electronics/'
kitchen_path = path + 'kitchen/'

__Read documents methods__

In [3]:
def get_training_data(path: str) -> pd.DataFrame:
    nroute = path + 'negative.review'
    proute = path + 'positive.review'
    negative = pd.read_csv(nroute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    positive = pd.read_csv(proute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    training = pd.concat([negative, positive], axis=0, ignore_index=True)
    training['label'] = training.label.map({'negative':0, 'positive':1})
    training = shuffle(training, random_state=0)
    training['clean'] = training.data.apply(preprocessing)
    return training

In [4]:
def get_testing_data(path: str) -> pd.DataFrame:
    route = path + 'unlabeled.review'
    testing = pd.read_csv(route, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    testing['label'] = testing.label.map({'negative':0, 'positive':1})
    testing = shuffle(testing, random_state=0)
    testing['clean'] = testing.data.apply(preprocessing)
    return testing

__preprocessing methods__

In [5]:
def remove_characters(word: str) -> str:
    # remove control characters
    word = re.sub('&.*;', '', word)
    # remove problematic characters
    word = re.sub('-', '_', word)
    word = re.sub('[^a-zA-Z0-9_]', '', word)
    word = re.sub('[0-9]+', '_num_', word)
    word = re.sub('_+', '_', word)
    # remove characters that starts with or ends with _
    word = re.sub('^_+', '', word)
    word = re.sub('_$', '', word)
    return word

In [6]:
porter_stemmer = PorterStemmer()
def preprocessing(document: str) -> str:
    words = document.split()
    result = ''
    for word in words:
        item, cant = word.split(':')
        item = remove_characters(item)
        item = '_'.join([porter_stemmer.stem(word) for word in item.split('_')])
        i, c = 0, int(cant)
        while i < c:
            result = result + ' ' + item
            i += 1
    return result

__model processing methods__

In [7]:
def naive_bayes_prediction(training_data, testing_data, training_label):
    naive_bayes = MultinomialNB()
    naive_bayes.fit(training_data, training_label)
    return naive_bayes.predict(testing_data)

In [8]:
def logistic_regression_prediction(training_data, testing_data, training_label):
    logistic = LogisticRegression(random_state=0, multi_class='multinomial')
    logistic.fit(training_data, training_label)
    return logistic.predict(testing_data)

In [9]:
def print_metrics(testing_label, predictions):
    print('Precision score: ', format(precision_score(testing_label, predictions, average='macro')))
    print('Recall score: ', format(recall_score(testing_label, predictions, average='macro')))
    print('F1 score: ', format(f1_score(testing_label, predictions, average='macro')))
    print('Accuracy score: ', format(accuracy_score(testing_label, predictions)))

__category classifier method__

In [10]:
def category_classifier(paths: list):
    # read the document corpuses
    training, testing = pd.DataFrame(), pd.DataFrame()
    for path in paths:
        training = get_training_data(path)
        training = pd.concat([training, training], axis=0, ignore_index=True)
        testing = get_testing_data(path)
        testing = pd.concat([testing, testing], axis=0, ignore_index=True)
    # count vectorizer
    vectorizer_tf = CountVectorizer()
    training_data_tf = vectorizer_tf.fit_transform(training.clean.values)
    testing_data_tf = vectorizer_tf.transform(testing.clean.values)
    # tfidf vectorizer
    vectorizer_tfidf = TfidfVectorizer()
    training_data_tfidf = vectorizer_tfidf.fit_transform(training.clean.values)
    testing_data_tfidf = vectorizer_tfidf.transform(testing.clean.values)
    # make predictions
    nb_tf = naive_bayes_prediction(training_data_tf, testing_data_tf, training.label.values)
    lr_tf = logistic_regression_prediction(training_data_tf, testing_data_tf, training.label.values)
    nb_idf = naive_bayes_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    lr_idf = logistic_regression_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    return nb_tf, lr_tf, nb_idf, lr_idf, testing.label.values

In [11]:
def print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels):
    print('\nNaive bayes tf metrics: ')
    print_metrics(labels, nb_tf)
    print('\nLogistic regression tf metrics: ')
    print_metrics(labels, lr_tf)
    print('\nNaive bayes tfidf metrics: ')
    print_metrics(labels, nb_idf)
    print('\nLogistic regression tfidf metrics: ')
    print_metrics(labels, lr_idf)

__clasiffier by category__

In [12]:
print('Books classifier')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Books classifier

Naive bayes tf metrics: 
Precision score:  0.8293096155530433
Recall score:  0.8274765284973262
F1 score:  0.8267101654596601
Accuracy score:  0.8268756998880179

Logistic regression tf metrics: 
Precision score:  0.8347376151179817
Recall score:  0.8346119977588087
F1 score:  0.8346545235586331
Accuracy score:  0.8347144456886898

Naive bayes tfidf metrics: 
Precision score:  0.8489680669537661
Recall score:  0.8447477295093943
F1 score:  0.843530797474797
Accuracy score:  0.8438969764837626

Logistic regression tfidf metrics: 
Precision score:  0.8549659325038987
Recall score:  0.8550066786218279
F1 score:  0.85487033976147
Accuracy score:  0.8548712206047032


In [13]:
print('Dvd classifier')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([dvd_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Dvd classifier

Naive bayes tf metrics: 
Precision score:  0.8196788123148908
Recall score:  0.819645697373869
F1 score:  0.8195749929136973
Accuracy score:  0.8195761293920804

Logistic regression tf metrics: 
Precision score:  0.8358822735780478
Recall score:  0.8353305940019031
F1 score:  0.8353721373719114
Accuracy score:  0.8354712771890686

Naive bayes tfidf metrics: 
Precision score:  0.8452684021789713
Recall score:  0.8452764575212317
F1 score:  0.8452313473418289
Accuracy score:  0.8452314556609035

Logistic regression tfidf metrics: 
Precision score:  0.8508615343500203
Recall score:  0.8504074934370833
F1 score:  0.8504554113297786
Accuracy score:  0.8505298382598996


In [14]:
print('Electronics classifier')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([electronics_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Electronics classifier

Naive bayes tf metrics: 
Precision score:  0.8480899214267724
Recall score:  0.848077841215007
F1 score:  0.8480825935393668
Accuracy score:  0.8480901249779969

Logistic regression tf metrics: 
Precision score:  0.8582400978299682
Recall score:  0.8578640157220325
F1 score:  0.8578942665103324
Accuracy score:  0.8579475444464003

Naive bayes tfidf metrics: 
Precision score:  0.8669000825222617
Recall score:  0.8663471063071567
F1 score:  0.8661833154314358
Accuracy score:  0.8662207357859532

Logistic regression tfidf metrics: 
Precision score:  0.8705341732827063
Recall score:  0.8704000709950511
F1 score:  0.8704239487062773
Accuracy score:  0.8704453441295547


In [15]:
print('Kitchen classifier')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Kitchen classifier

Naive bayes tf metrics: 
Precision score:  0.8743267651888342
Recall score:  0.8741185755415648
F1 score:  0.8741504961801903
Accuracy score:  0.8741799831791421

Logistic regression tf metrics: 
Precision score:  0.8783880825057295
Recall score:  0.8784024721422221
F1 score:  0.8783843167466591
Accuracy score:  0.8783851976450799

Naive bayes tfidf metrics: 
Precision score:  0.8799720656525165
Recall score:  0.8794616754800624
F1 score:  0.8795059239575722
Accuracy score:  0.8795626576955424

Logistic regression tfidf metrics: 
Precision score:  0.8818853764599545
Recall score:  0.8816913955588273
F1 score:  0.881723453677926
Accuracy score:  0.8817493692178301


__complete classifier__

In [16]:
print('complete classifier')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path, dvd_path, electronics_path, kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

complete classifier

Naive bayes tf metrics: 
Precision score:  0.8743267651888342
Recall score:  0.8741185755415648
F1 score:  0.8741504961801903
Accuracy score:  0.8741799831791421

Logistic regression tf metrics: 
Precision score:  0.8783880825057295
Recall score:  0.8784024721422221
F1 score:  0.8783843167466591
Accuracy score:  0.8783851976450799

Naive bayes tfidf metrics: 
Precision score:  0.8799720656525165
Recall score:  0.8794616754800624
F1 score:  0.8795059239575722
Accuracy score:  0.8795626576955424

Logistic regression tfidf metrics: 
Precision score:  0.8818853764599545
Recall score:  0.8816913955588273
F1 score:  0.881723453677926
Accuracy score:  0.8817493692178301
