# Sentiment classifiers

In [1]:
import re
import pandas as pd
import numpy as np
from time import time
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

__paths to change__

In [2]:
# input variables
path = './datos/Multi Domain Sentiment/'

# notebook variables
books_path = path + 'books/'
dvd_path = path + 'dvd/'
electronics_path = path + 'electronics/'
kitchen_path = path + 'kitchen/'

__preprocessing methods__

In [3]:
def remove_characters(word: str) -> str:
    """
    remove the problematic characters like control characters, and others than alphanumeric
    """
    # remove control characters
    word = re.sub('&.*;', '', word)
    # remove problematic characters
    word = re.sub('-', '_', word)
    word = re.sub('[^a-zA-Z0-9_]', '', word)
    word = re.sub('[0-9]+', '_num_', word)
    word = re.sub('_+', '_', word)
    # remove characters that starts with or ends with _
    word = re.sub('^_+', '', word)
    word = re.sub('_$', '', word)
    return word

In [4]:
def preprocessing(document: str) -> dict:
    """
    iterate over all words in document identifing the word and frecuency
    remove all the problematic characters over the word
    and return a dictionary with the word as the key and the frecuency as the value
    """
    words = document.split()
    results = {}
    for word in words:
        item, cant = word.split(':')
        item = remove_characters(item)
        results[item] = int(cant)
    return results

__create lexicon from data__

In [5]:
def create_lexicon(documents: pd.DataFrame) -> dict:
    """
    create the lexicon from the input data identifying the positive and negative reviews
    extract a dictionary with all the positive and negative frecuencies of a word
    normalize the lexicon to have values between 0 and 1
    """
    lexicon = {}
    for i in range(len(documents.values)):
        label = documents.label.values[i]
        for key, value in documents.clean.values[i].items():
            if lexicon.get(key):
                if label == 1:
                    lexicon[key]['pos'] = lexicon[key]['pos'] + value
                else:
                    lexicon[key]['neg'] = lexicon[key]['neg'] + value
            else:
                if label == 1:
                    lexicon[key] = {'pos': value, 'neg': 0}
                else:
                    lexicon[key] = {'pos': 0, 'neg': value}
    normalized = {}
    for key, value in lexicon.items():
        total = lexicon[key]['pos'] + lexicon[key]['neg']
        normalized[key] = {'pos' : lexicon[key]['pos']/total, 'neg': lexicon[key]['neg']/total}
    return normalized

In [6]:
def lexicon_features(document: dict, lexicon: dict) -> list:
    """
    return the lexicon features between others:
        - sum of positive scores
        - sum of negative scores
        - sum of positive scores divided over amount of words
        - sum of negative scores divided over amount of words
        - cant of words with positive scores over 0.5
        - cant of words with negative scores over 0.5
    """
    neg_scores, pos_scores, cant_words = [], [], 0
    for key, value in document.items():
        item = lexicon.get(key, {})
        pos_scores.append(item.get('pos', 0))
        neg_scores.append(item.get('neg', 0))
    cant_words = len(document)
    pos_score, neg_score = sum(pos_scores), sum(neg_scores)
    pond_pos, pond_neg = pos_score/cant_words, neg_score/cant_words
    cant_pos, cant_neg = len([item for item in pos_scores if item >= 0.5]), len([item for item in neg_scores if item >= 0.5])
    return [pond_pos, pond_neg, pos_score, neg_score, cant_pos, cant_neg]

__Read documents methods__

In [7]:
def get_training_data(path: str) -> pd.DataFrame:
    """
    read negative and positive review to extract the training data
    refactor the label colummn to get their numeric value
    shuffle the data to add random component to the training
    and apply the preproccesing method to clean the data
    """
    nroute = path + 'negative.review'
    proute = path + 'positive.review'
    negative = pd.read_csv(nroute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    positive = pd.read_csv(proute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    training = pd.concat([negative, positive], axis=0, ignore_index=True)
    training['label'] = training.label.map({'negative':0, 'positive':1})
    training = shuffle(training, random_state=0)
    training['clean'] = training.data.apply(preprocessing)
    return training

In [8]:
def get_testing_data(path: str) -> pd.DataFrame:
    """
    read unlabeled review to extract the testing data
    refactor the label colummn to get their numeric value
    shuffle the data to add random component to the training
    and apply the preproccesing method to clean the data
    """
    route = path + 'unlabeled.review'
    testing = pd.read_csv(route, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    testing['label'] = testing.label.map({'negative':0, 'positive':1})
    testing = shuffle(testing, random_state=0)
    testing['clean'] = testing.data.apply(preprocessing)
    return testing

__model processing methods__

In [9]:
def naive_bayes_prediction(training_data, testing_data, training_label):
    """
    get the naive bayes prediction from training and testing data
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(training_data, training_label)
    return naive_bayes.predict(testing_data)

In [10]:
def logistic_regression_prediction(training_data, testing_data, training_label):
    """
    get the logistic regression prediction from training and testing data
    """
    logistic = LogisticRegression(random_state=0, multi_class='multinomial')
    logistic.fit(training_data, training_label)
    return logistic.predict(testing_data)

In [11]:
def print_metrics(testing_label, predictions):
    """
    extrac the metrics from the testing labels and predictions
    """
    print('Precision score: ', format(precision_score(testing_label, predictions, average='macro')))
    print('Recall score: ', format(recall_score(testing_label, predictions, average='macro')))
    print('F1 score: ', format(f1_score(testing_label, predictions, average='macro')))
    print('Accuracy score: ', format(accuracy_score(testing_label, predictions)))

__category classifier method__

In [12]:
def category_classifier(paths: list):
    """
    process all the data from the paths from params
    create the tf and tfidf vectorizers
    get the naive bayes and logistic regression predictors
    and return those classifier predictors
    """
    # read the document corpuses
    start = time()
    training, testing = pd.DataFrame(), pd.DataFrame()
    for path in paths:
        new_training = get_training_data(path)
        training = pd.concat([training, new_training], axis=0, ignore_index=True)
        new_testing = get_testing_data(path)
        testing = pd.concat([testing, new_testing], axis=0, ignore_index=True)
    end = time()
    print('\ngetting training and testing data elapsed time: ' + str(end-start))

    print('\ntraining shape: ', training.shape)
    print('\ntesting shape: ', testing.shape)

    # tf vectorizer
    start = time()
    vectorizer_tf = DictVectorizer()
    training_data_tf = vectorizer_tf.fit_transform(training.clean.values)
    testing_data_tf = vectorizer_tf.transform(testing.clean.values)
    end = time()
    print('\nvectorizer tf data elapsed time: ' + str(end-start))

    # tfidf vectorizer
    start = time()
    vectorizer_tfidf = TfidfTransformer(smooth_idf=False)
    training_data_tfidf = vectorizer_tfidf.fit_transform(training_data_tf)
    testing_data_tfidf = vectorizer_tfidf.transform(testing_data_tf)
    end = time()
    print('\nvectorizer tfidf data elapsed time: ' + str(end-start))

    # features matrix
    start = time()
    lexicon = create_lexicon(training)
    training_data_feat = csr_matrix(list(training.clean.apply(lexicon_features, lexicon=lexicon)))
    testing_data_feat = csr_matrix(list(testing.clean.apply(lexicon_features, lexicon=lexicon)))
    end = time()
    print('\nlexicon features matrix data elapsed time: ' + str(end-start))

    # make predictions
    start = time()
    nb_tf = naive_bayes_prediction(training_data_tf, testing_data_tf, training.label.values)
    end = time()
    print('\nnaive bayes prediction ft: ' + str(end-start))

    start = time()
    lr_tf = logistic_regression_prediction(training_data_tf, testing_data_tf, training.label.values)
    end = time()
    print('\nlogistic regression prediction ft: ' + str(end-start))

    start = time()
    nb_idf = naive_bayes_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    end = time()
    print('\nnaive bayes prediction ftidf: ' + str(end-start))
    
    start = time()
    lr_idf = logistic_regression_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    end = time()
    print('\nlogistic regression prediction ftidf: ' + str(end-start))
    
    start = time()
    nb_feat = naive_bayes_prediction(training_data_feat, testing_data_feat, training.label.values)
    end = time()
    print('\nnaive bayes prediction features matrix: ' + str(end-start))
    
    start = time()
    lr_feat = logistic_regression_prediction(training_data_feat, testing_data_feat, training.label.values)
    end = time()
    print('\nlogistic regression prediction feature matrix: ' + str(end-start))
    
    print('\nNaive bayes tf metrics: ')
    print_metrics(testing.label.values, nb_tf)

    print('\nLogistic regression tf metrics: ')
    print_metrics(testing.label.values, lr_tf)

    print('\nNaive bayes tfidf metrics: ')
    print_metrics(testing.label.values, nb_idf)

    print('\nLogistic regression tfidf metrics: ')
    print_metrics(testing.label.values, lr_idf)

    print('\nNaive bayes features metrics: ')
    print_metrics(testing.label.values, nb_feat)

    print('\nLogistic regression features metrics: ')
    print_metrics(testing.label.values, lr_feat)

## sentiment classifier per each category

In [13]:
print('Books classifier\n')
category_classifier([books_path])

Books classifier


getting training and testing data elapsed time: 32.29820418357849

training shape:  (2000, 3)

testing shape:  (4465, 3)

vectorizer tf data elapsed time: 4.060862064361572

vectorizer tfidf data elapsed time: 0.14737963676452637

lexicon features matrix data elapsed time: 3.4545013904571533

naive bayes prediction ft: 0.04754948616027832

logistic regression prediction ft: 11.679904460906982

naive bayes prediction ftidf: 0.05852341651916504

logistic regression prediction ftidf: 5.159057855606079

naive bayes prediction features matrix: 0.00766444206237793

logistic regression prediction feature matrix: 0.040157318115234375

Naive bayes tf metrics: 
Precision score:  0.8350342656009614
Recall score:  0.8294270352538117
F1 score:  0.8278642502163154
Accuracy score:  0.8284434490481523

Logistic regression tf metrics: 
Precision score:  0.831096062202157
Recall score:  0.8311416429730785
F1 score:  0.8311106790147239
Accuracy score:  0.8311310190369541

Naive bayes t

In [14]:
print('Dvd classifier\n')
category_classifier([dvd_path])

Dvd classifier


getting training and testing data elapsed time: 23.63008952140808

training shape:  (2000, 3)

testing shape:  (3586, 3)

vectorizer tf data elapsed time: 4.0336689949035645

vectorizer tfidf data elapsed time: 0.14813947677612305

lexicon features matrix data elapsed time: 3.099989891052246

naive bayes prediction ft: 0.059864044189453125

logistic regression prediction ft: 9.384239673614502

naive bayes prediction ftidf: 0.04687976837158203

logistic regression prediction ftidf: 4.167803764343262

naive bayes prediction features matrix: 0.0004978179931640625

logistic regression prediction feature matrix: 0.0634918212890625

Naive bayes tf metrics: 
Precision score:  0.8213267556139578
Recall score:  0.8213102627250903
F1 score:  0.8212489553338185
Accuracy score:  0.8212493028443949

Logistic regression tf metrics: 
Precision score:  0.8258791154852412
Recall score:  0.8252822310837282
F1 score:  0.8253172610360089
Accuracy score:  0.8254322364751813

Naive bayes tf

In [15]:
print('Electronics classifier\n')
category_classifier([electronics_path])

Electronics classifier


getting training and testing data elapsed time: 24.97767925262451

training shape:  (2000, 3)

testing shape:  (5681, 3)

vectorizer tf data elapsed time: 4.075480222702026

vectorizer tfidf data elapsed time: 0.09634923934936523

lexicon features matrix data elapsed time: 3.1928648948669434

naive bayes prediction ft: 0.04696822166442871

logistic regression prediction ft: 6.6632301807403564

naive bayes prediction ftidf: 0.029967069625854492

logistic regression prediction ftidf: 1.8210182189941406

naive bayes prediction features matrix: 0.008457422256469727

logistic regression prediction feature matrix: 0.05720663070678711

Naive bayes tf metrics: 
Precision score:  0.8545982734118966
Recall score:  0.8546083819771726
F1 score:  0.8546006795972588
Accuracy score:  0.8546030628410491

Logistic regression tf metrics: 
Precision score:  0.8601032371353003
Recall score:  0.8598116077900213
F1 score:  0.85984125941143
Accuracy score:  0.8598838232705509

Naive 

In [16]:
print('Kitchen classifier\n')
category_classifier([kitchen_path])

Kitchen classifier


getting training and testing data elapsed time: 19.120525360107422

training shape:  (2000, 3)

testing shape:  (5945, 3)

vectorizer tf data elapsed time: 3.0594944953918457

vectorizer tfidf data elapsed time: 0.06249809265136719

lexicon features matrix data elapsed time: 2.0406932830810547

naive bayes prediction ft: 0.015642404556274414

logistic regression prediction ft: 3.996230363845825

naive bayes prediction ftidf: 0.03127288818359375

logistic regression prediction ftidf: 2.586960792541504

naive bayes prediction features matrix: 0.0

logistic regression prediction feature matrix: 0.07965326309204102

Naive bayes tf metrics: 
Precision score:  0.8800992246016999
Recall score:  0.8796351251905117
F1 score:  0.8796785203566759
Accuracy score:  0.87973086627418

Logistic regression tf metrics: 
Precision score:  0.8804134439810729
Recall score:  0.8803854578857312
F1 score:  0.8803952403083994
Accuracy score:  0.8804037005887301

Naive bayes tfidf metrics: 

## sentiment classifier complete

In [17]:
print('complete classifier\n')
category_classifier([books_path, dvd_path, electronics_path, kitchen_path])

complete classifier


getting training and testing data elapsed time: 97.85711336135864

training shape:  (8000, 3)

testing shape:  (19677, 3)

vectorizer tf data elapsed time: 14.482530355453491

vectorizer tfidf data elapsed time: 0.573575496673584

lexicon features matrix data elapsed time: 12.555979013442993

naive bayes prediction ft: 0.13494586944580078


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



logistic regression prediction ft: 30.15519070625305

naive bayes prediction ftidf: 0.12407326698303223

logistic regression prediction ftidf: 17.98591637611389

naive bayes prediction features matrix: 0.0

logistic regression prediction feature matrix: 0.22345662117004395

Naive bayes tf metrics: 
Precision score:  0.8631942892515936
Recall score:  0.8617613567508546
F1 score:  0.8614964006712913
Accuracy score:  0.8616150836001423

Logistic regression tf metrics: 
Precision score:  0.8747372744108918
Recall score:  0.8747142777887805
F1 score:  0.8747212006666737
Accuracy score:  0.8747268384408192

Naive bayes tfidf metrics: 
Precision score:  0.8752871757223706
Recall score:  0.8717473228506794
F1 score:  0.8712479406019544
Accuracy score:  0.8715251308634446

Logistic regression tfidf metrics: 
Precision score:  0.8753261141773242
Recall score:  0.8752285235301829
F1 score:  0.8751795052716007
Accuracy score:  0.875184225237587

Naive bayes features metrics: 
Precision score:  0.