# Sentiment classifiers

In [1]:
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## sentiment classifier per each category

__paths to change__

In [2]:
# input variables
path = './datos/Multi Domain Sentiment/'

# notebook variables
books_path = path + 'books/'
dvd_path = path + 'dvd/'
electronics_path = path + 'electronics/'
kitchen_path = path + 'kitchen/'

__Read documents methods__

In [3]:
def get_training_data(path: str) -> pd.DataFrame:
    nroute = path + 'negative.review'
    proute = path + 'positive.review'
    negative = pd.read_csv(nroute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    positive = pd.read_csv(proute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    training = pd.concat([negative, positive], axis=0, ignore_index=True)
    training['label'] = training.label.map({'negative':0, 'positive':1})
    training = shuffle(training, random_state=0)
    training['clean'] = training.data.apply(preprocessing)
    return training

In [4]:
def get_testing_data(path: str) -> pd.DataFrame:
    route = path + 'unlabeled.review'
    testing = pd.read_csv(route, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    testing['label'] = testing.label.map({'negative':0, 'positive':1})
    testing = shuffle(testing, random_state=0)
    testing['clean'] = testing.data.apply(preprocessing)
    return testing

__preprocessing methods__

In [5]:
def remove_characters(word: str) -> str:
    # remove control characters
    word = re.sub('&.*;', '', word)
    # remove problematic characters
    word = re.sub('-', '_', word)
    word = re.sub('[^a-zA-Z0-9_]', '', word)
    word = re.sub('[0-9]+', '_num_', word)
    word = re.sub('_+', '_', word)
    # remove characters that starts with or ends with _
    word = re.sub('^_+', '', word)
    word = re.sub('_$', '', word)
    return word

In [6]:
porter_stemmer = PorterStemmer()
def preprocessing(document: str) -> str:
    words = document.split()
    result = ''
    for word in words:
        item, cant = word.split(':')
        item = remove_characters(item)
        item = '_'.join([porter_stemmer.stem(word) for word in item.split('_')])
        i, c = 0, int(cant)
        while i < c:
            result = result + ' ' + item
            i += 1
    return result

__model processing methods__

In [7]:
def naive_bayes_prediction(training_data, testing_data, training_label):
    naive_bayes = MultinomialNB()
    naive_bayes.fit(training_data, training_label)
    return naive_bayes.predict(testing_data)

In [8]:
def logistic_regression_prediction(training_data, testing_data, training_label):
    logistic = LogisticRegression(random_state=0, multi_class='multinomial')
    logistic.fit(training_data, training_label)
    return logistic.predict(testing_data)

In [9]:
def print_metrics(testing_label, predictions):
    print('Precision score: ', format(precision_score(testing_label, predictions, average='macro')))
    print('Recall score: ', format(recall_score(testing_label, predictions, average='macro')))
    print('F1 score: ', format(f1_score(testing_label, predictions, average='macro')))
    print('Accuracy score: ', format(accuracy_score(testing_label, predictions)))

__category classifier method__

In [10]:
def category_classifier(paths: list):
    # read the document corpuses
    training, testing = pd.DataFrame(), pd.DataFrame()
    for path in paths:
        new_training = get_training_data(path)
        training = pd.concat([training, new_training], axis=0, ignore_index=True)
        new_testing = get_testing_data(path)
        testing = pd.concat([testing, new_testing], axis=0, ignore_index=True)
    print('training shape: ', training.shape)
    print('testing shape: ', testing.shape)
    # count vectorizer
    vectorizer_tf = CountVectorizer()
    training_data_tf = vectorizer_tf.fit_transform(training.clean.values)
    testing_data_tf = vectorizer_tf.transform(testing.clean.values)
    # tfidf vectorizer
    vectorizer_tfidf = TfidfVectorizer()
    training_data_tfidf = vectorizer_tfidf.fit_transform(training.clean.values)
    testing_data_tfidf = vectorizer_tfidf.transform(testing.clean.values)
    # make predictions
    nb_tf = naive_bayes_prediction(training_data_tf, testing_data_tf, training.label.values)
    lr_tf = logistic_regression_prediction(training_data_tf, testing_data_tf, training.label.values)
    nb_idf = naive_bayes_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    lr_idf = logistic_regression_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    return nb_tf, lr_tf, nb_idf, lr_idf, testing.label.values

In [11]:
def print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels):
    print('\nNaive bayes tf metrics: ')
    print_metrics(labels, nb_tf)
    print('\nLogistic regression tf metrics: ')
    print_metrics(labels, lr_tf)
    print('\nNaive bayes tfidf metrics: ')
    print_metrics(labels, nb_idf)
    print('\nLogistic regression tfidf metrics: ')
    print_metrics(labels, lr_idf)

__clasiffier by category__

In [12]:
print('Books classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Books classifier

training shape:  (2000, 3)
testing shape:  (4465, 3)

Naive bayes tf metrics: 
Precision score:  0.8368396694255156
Recall score:  0.8328657829801103
F1 score:  0.8316437462520466
Accuracy score:  0.832026875699888

Logistic regression tf metrics: 
Precision score:  0.835203031671937
Recall score:  0.8350473724599965
F1 score:  0.8350968552065444
Accuracy score:  0.8351623740201568

Naive bayes tfidf metrics: 
Precision score:  0.846643036934299
Recall score:  0.8385517424620675
F1 score:  0.8366093224911268
Accuracy score:  0.8374020156774916

Logistic regression tfidf metrics: 
Precision score:  0.8533605268165319
Recall score:  0.8532904654646217
F1 score:  0.8530782618186867
Accuracy score:  0.8530795072788354


In [13]:
print('Dvd classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([dvd_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Dvd classifier

training shape:  (2000, 3)
testing shape:  (3586, 3)

Naive bayes tf metrics: 
Precision score:  0.8223677169665746
Recall score:  0.8223343234868584
F1 score:  0.8223458389067001
Accuracy score:  0.8223647518126046

Logistic regression tf metrics: 
Precision score:  0.8337599064874343
Recall score:  0.8330821398141572
F1 score:  0.8331207811875008
Accuracy score:  0.8332403792526492

Naive bayes tfidf metrics: 
Precision score:  0.8516176997804146
Recall score:  0.8514727717112858
F1 score:  0.8513592006576678
Accuracy score:  0.8513664249860569

Logistic regression tfidf metrics: 
Precision score:  0.8471114855608526
Recall score:  0.8468060160770075
F1 score:  0.8468479268372847
Accuracy score:  0.8469046291132181


In [14]:
print('Electronics classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([electronics_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Electronics classifier

training shape:  (2000, 3)
testing shape:  (5681, 3)

Naive bayes tf metrics: 
Precision score:  0.8509050946416147
Recall score:  0.8509168376265839
F1 score:  0.8509050337588774
Accuracy score:  0.8509065305403978

Logistic regression tf metrics: 
Precision score:  0.8585925303900965
Recall score:  0.8582160782968327
F1 score:  0.8582464492451272
Accuracy score:  0.8582995951417004

Naive bayes tfidf metrics: 
Precision score:  0.8615233569670502
Recall score:  0.8605738626166436
F1 score:  0.860338766379035
Accuracy score:  0.8604118993135011

Logistic regression tfidf metrics: 
Precision score:  0.8684210330544082
Recall score:  0.8682876955462504
F1 score:  0.8683112956960536
Accuracy score:  0.8683330399577539


In [15]:
print('Kitchen classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Kitchen classifier

training shape:  (2000, 3)
testing shape:  (5945, 3)

Naive bayes tf metrics: 
Precision score:  0.8756595383090214
Recall score:  0.875269908122019
F1 score:  0.8753099705235576
Accuracy score:  0.8753574432296047

Logistic regression tf metrics: 
Precision score:  0.8782211775261937
Recall score:  0.878235303971042
F1 score:  0.8782162137691412
Accuracy score:  0.8782169890664424

Naive bayes tfidf metrics: 
Precision score:  0.8772637337717557
Recall score:  0.8765842211808071
F1 score:  0.8766287044604778
Accuracy score:  0.8767031118587048

Logistic regression tfidf metrics: 
Precision score:  0.8814645466514797
Recall score:  0.8811710464274792
F1 score:  0.8812090857302257
Accuracy score:  0.8812447434819176


__complete classifier__

In [16]:
print('complete classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path, dvd_path, electronics_path, kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

complete classifier

training shape:  (8000, 3)
testing shape:  (19677, 3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Naive bayes tf metrics: 
Precision score:  0.8601990601990601
Recall score:  0.8590536477447666
F1 score:  0.8588256274223927
Accuracy score:  0.8589215835747319

Logistic regression tf metrics: 
Precision score:  0.8728142284868005
Recall score:  0.8727794560809901
F1 score:  0.8727884530633363
Accuracy score:  0.8727956497433552

Naive bayes tfidf metrics: 
Precision score:  0.8722615321968519
Recall score:  0.8689460596756893
F1 score:  0.8684633055513065
Accuracy score:  0.8687299893276414

Logistic regression tfidf metrics: 
Precision score:  0.877392698579508
Recall score:  0.8773889166281572
F1 score:  0.8773694860386476
Accuracy score:  0.8773695177110332
