# Sentiment classifiers

In [1]:
import re
import pandas as pd
from nltk.stem.porter import PorterStemmer
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

## sentiment classifier per each category

__paths to change__

In [2]:
# input variables
path = './datos/Multi Domain Sentiment/'

# notebook variables
books_path = path + 'books/'
dvd_path = path + 'dvd/'
electronics_path = path + 'electronics/'
kitchen_path = path + 'kitchen/'

__preprocessing methods__

In [3]:
def remove_characters(word: str) -> str:
    """
    remove the problematic characters like control characters, and others than alphanumeric
    """
    # remove control characters
    word = re.sub('&.*;', '', word)
    # remove problematic characters
    word = re.sub('-', '_', word)
    word = re.sub('[^a-zA-Z0-9_]', '', word)
    word = re.sub('[0-9]+', '_num_', word)
    word = re.sub('_+', '_', word)
    # remove characters that starts with or ends with _
    word = re.sub('^_+', '', word)
    word = re.sub('_$', '', word)
    return word

In [4]:
porter_stemmer = PorterStemmer()
def preprocessing(document: str) -> dict:
    """
    iterate over all words in document identifing the word and frecuency
    alfter that remove all the problematic characters over the word and stemmer the word
    and return a dictionary with the word as the key and the frecuency as the value
    """
    words = document.split()
    results = {}
    for word in words:
        item, cant = word.split(':')
        item = remove_characters(item)
        item = '_'.join([porter_stemmer.stem(word) for word in item.split('_')])
        i, c = 0, int(cant)
        if results.get(item):
            results[item] = results[item] + c
        else:
            results[item] = c
    return results

__Read documents methods__

In [5]:
def get_training_data(path: str) -> pd.DataFrame:
    """
    read negative and positive review to extract the training data
    refactor the label colummn to get their numeric value
    shuffle the data to add random component to the training
    and apply the preproccesing method to clean the data
    """
    nroute = path + 'negative.review'
    proute = path + 'positive.review'
    negative = pd.read_csv(nroute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    positive = pd.read_csv(proute, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    training = pd.concat([negative, positive], axis=0, ignore_index=True)
    training['label'] = training.label.map({'negative':0, 'positive':1})
    training = shuffle(training, random_state=0)
    training['clean'] = training.data.apply(preprocessing)
    return training

In [6]:
def get_testing_data(path: str) -> pd.DataFrame:
    """
    read unlabeled review to extract the testing data
    refactor the label colummn to get their numeric value
    shuffle the data to add random component to the training
    and apply the preproccesing method to clean the data
    """
    route = path + 'unlabeled.review'
    testing = pd.read_csv(route, delimiter='#label#:', engine='python', header=None, names=['data', 'label'])
    testing['label'] = testing.label.map({'negative':0, 'positive':1})
    testing = shuffle(testing, random_state=0)
    testing['clean'] = testing.data.apply(preprocessing)
    return testing

__model processing methods__

In [7]:
def naive_bayes_prediction(training_data, testing_data, training_label):
    """
    get the naive bayes prediction from training and testing data
    """
    naive_bayes = MultinomialNB()
    naive_bayes.fit(training_data, training_label)
    return naive_bayes.predict(testing_data)

In [8]:
def logistic_regression_prediction(training_data, testing_data, training_label):
    """
    get the logistic regression prediction from training and testing data
    """
    logistic = LogisticRegression(random_state=0, multi_class='multinomial')
    logistic.fit(training_data, training_label)
    return logistic.predict(testing_data)

In [9]:
def print_metrics(testing_label, predictions):
    """
    extrac the metrics from the testing labels and predictions
    """
    print('Precision score: ', format(precision_score(testing_label, predictions, average='macro')))
    print('Recall score: ', format(recall_score(testing_label, predictions, average='macro')))
    print('F1 score: ', format(f1_score(testing_label, predictions, average='macro')))
    print('Accuracy score: ', format(accuracy_score(testing_label, predictions)))

__create lexicon from data__

In [10]:
def create_lexicon(documents: pd.DataFrame) -> dict:
    """
    create the lexicon from the input data identifying the positive and negative reviews
    extract a dictionary with all the positive and negative frecuencies of a word
    normalize the lexicon to have values between 0 and 1
    """
    lexicon = {}
    for i in range(len(documents.values)):
        label = documents.label.values[i]
        for key, value in documents.clean.values[i].items():
            if lexicon.get(key):
                if label == 1:
                    lexicon[key]['pos'] = lexicon[key]['pos'] + value
                else:
                    lexicon[key]['neg'] = lexicon[key]['neg'] + value
            else:
                if label == 1:
                    lexicon[key] = {'pos': value, 'neg': 0}
                else:
                    lexicon[key] = {'pos': 0, 'neg': value}
    normalized = {}
    for key, value in lexicon.items():
        total = lexicon[key]['pos'] + lexicon[key]['neg']
        normalized[key] = {'pos' : lexicon[key]['pos']/total, 'neg': lexicon[key]['neg']/total}
    return normalized

In [11]:
def lexicon_features(document: dict) -> list:
    """
    return the lexicon features between others:
        - sum of positive scores
        - sum of negative scores
        - sum of positive scores divided over amount of words
        - sum of negative scores divided over amount of words
        - cant of words with positive scores over 0.5
        - cant of words with positive scores bellow 0.5
        - cant of words with positive scores over 0.75
        - cant of words with positive scores bellow 0.75
        - cant of words with positive scores over 0.90
        - cant of words with positive scores bellow 0.90
    """
    neg_scores, pos_scores, cant_words = [], [], 0
    for key, value in document.items():
        item = lexicon.get(key, {})
        pos_scores.append(item.get('pos', 0))
        neg_scores.append(item.get('neg', 0))
    cant_words = len(document)
    pos_score, neg_score = sum(pos_scores), sum(neg_scores)
    len_pos_50, len_neg_50 = len([item for item in pos_scores if item >= 0.5]), len([item for item in pos_scores if item < 0.5])
    len_pos_75, len_neg_75 = len([item for item in pos_scores if item >= 0.75]), len([item for item in pos_scores if item < 0.25])
    len_pos_90, len_neg_90 = len([item for item in pos_scores if item >= 0.9]), len([item for item in pos_scores if item < 0.1])
    pond_pos, pond_neg = sum(pos_scores)/cant_words, sum(neg_scores)/cant_words
    result = [pond_pos, pond_neg, pos_score, neg_score, len_pos_50, len_neg_50, len_pos_75, len_neg_75, len_pos_90, len_neg_90]
    print(result)
    return result

In [12]:
data = get_training_data(books_path)

In [13]:
test = data.sample(10)
test

Unnamed: 0,data,label,clean
798,foods_after:1 book_but:1 foods_but:1 whiz_and:...,0,"{'food_after': 1, 'book_but': 1, 'food_but': 1..."
1259,having_read:1 mccullough's_other:1 read:3 some...,1,"{'have_read': 1, 'mccullough_other': 1, 'read'..."
1422,a_rather:1 this_series:1 college_and:1 out_the...,1,"{'a_rather': 1, 'thi_seri': 1, 'colleg_and': 1..."
610,and_writes:1 comments:1 found_this:1 smugly_su...,0,"{'and_write': 1, 'comment': 1, 'found_thi': 1,..."
1940,lively_writing:1 main_principles:1 catch_of:1 ...,1,"{'live_write': 1, 'main_principl': 1, 'catch_o..."
1486,helpful:2 reading_this:1 you_the:1 the_skills:...,1,"{'help': 2, 'read_thi': 2, 'you_the': 1, 'the_..."
261,i:7 which:1 me:2 say_about:1 anonymous_here's:...,0,"{'i': 7, 'which': 1, 'me': 2, 'say_about': 1, ..."
1465,description:1 evolution_of:1 well:1 goes_furth...,1,"{'descript': 1, 'evolut_of': 1, 'well': 1, 'go..."
721,worst_book:1 disappointment_for:1 dobbs_book:1...,0,"{'worst_book': 1, 'disappoint_for': 1, 'dobb_b..."
1119,love_in:1 universe_read:1 then_when:1 myself_s...,1,"{'love_in': 1, 'univers_read': 1, 'then_when':..."


In [14]:
lexicon = create_lexicon(test)

In [15]:
test['features'] = test.clean.apply(lexicon_features)
test

[0.12612151980573033, 0.8738784801942696, 12.486030460767303, 86.51396953923269, 19, 80, 1, 77, 0, 77]
[0.9327950310559008, 0.06720496894409937, 64.36285714285715, 4.637142857142857, 68, 1, 59, 0, 59, 0]
[0.9583055866289398, 0.04169441337106008, 160.03703296703296, 6.962967032967033, 164, 3, 153, 0, 151, 0]
[0.11040241881024003, 0.8895975811897601, 19.762032967032965, 159.23796703296705, 28, 151, 3, 141, 0, 140]
[0.9679432890203775, 0.03205671097962235, 260.37674474648156, 8.623255253518412, 266, 3, 252, 0, 247, 0]
[0.9139484951695477, 0.08605150483045222, 205.63841141314822, 19.36158858685175, 216, 9, 186, 0, 183, 0]
[0.07585790734488604, 0.924142092655114, 21.922935222672066, 267.07706477732796, 32, 257, 2, 248, 0, 248]
[0.9570517936439726, 0.042948206356027585, 171.3122710622711, 7.687728937728938, 176, 3, 164, 1, 162, 0]
[0.08720484643561567, 0.9127951535643842, 4.534652014652015, 47.46534798534798, 6, 46, 0, 43, 0, 43]
[0.9581240919745074, 0.041875908025492514, 200.24793522267206,

Unnamed: 0,data,label,clean,features
798,foods_after:1 book_but:1 foods_but:1 whiz_and:...,0,"{'food_after': 1, 'book_but': 1, 'food_but': 1...","[0.12612151980573033, 0.8738784801942696, 12.4..."
1259,having_read:1 mccullough's_other:1 read:3 some...,1,"{'have_read': 1, 'mccullough_other': 1, 'read'...","[0.9327950310559008, 0.06720496894409937, 64.3..."
1422,a_rather:1 this_series:1 college_and:1 out_the...,1,"{'a_rather': 1, 'thi_seri': 1, 'colleg_and': 1...","[0.9583055866289398, 0.04169441337106008, 160...."
610,and_writes:1 comments:1 found_this:1 smugly_su...,0,"{'and_write': 1, 'comment': 1, 'found_thi': 1,...","[0.11040241881024003, 0.8895975811897601, 19.7..."
1940,lively_writing:1 main_principles:1 catch_of:1 ...,1,"{'live_write': 1, 'main_principl': 1, 'catch_o...","[0.9679432890203775, 0.03205671097962235, 260...."
1486,helpful:2 reading_this:1 you_the:1 the_skills:...,1,"{'help': 2, 'read_thi': 2, 'you_the': 1, 'the_...","[0.9139484951695477, 0.08605150483045222, 205...."
261,i:7 which:1 me:2 say_about:1 anonymous_here's:...,0,"{'i': 7, 'which': 1, 'me': 2, 'say_about': 1, ...","[0.07585790734488604, 0.924142092655114, 21.92..."
1465,description:1 evolution_of:1 well:1 goes_furth...,1,"{'descript': 1, 'evolut_of': 1, 'well': 1, 'go...","[0.9570517936439726, 0.042948206356027585, 171..."
721,worst_book:1 disappointment_for:1 dobbs_book:1...,0,"{'worst_book': 1, 'disappoint_for': 1, 'dobb_b...","[0.08720484643561567, 0.9127951535643842, 4.53..."
1119,love_in:1 universe_read:1 then_when:1 myself_s...,1,"{'love_in': 1, 'univers_read': 1, 'then_when':...","[0.9581240919745074, 0.041875908025492514, 200..."


__category classifier method__

In [16]:
def category_classifier(paths: list):
    """
    process all the data from the paths from params
    create the tf and tfidf vectorizers
    get the naive bayes and logistic regression predictors
    and return those classifier predictors
    """
    # read the document corpuses
    training, testing = pd.DataFrame(), pd.DataFrame()
    for path in paths:
        new_training = get_training_data(path)
        training = pd.concat([training, new_training], axis=0, ignore_index=True)
        new_testing = get_testing_data(path)
        testing = pd.concat([testing, new_testing], axis=0, ignore_index=True)
    print('training shape: ', training.shape)
    print('testing shape: ', testing.shape)
    # tf vectorizer
    vectorizer_tf = DictVectorizer()
    training_data_tf = vectorizer_tf.fit_transform(training.clean.values)
    testing_data_tf = vectorizer_tf.transform(testing.clean.values)
    # tfidf vectorizer
    vectorizer_tfidf = TfidfTransformer(smooth_idf=False)
    training_data_tfidf = vectorizer_tfidf.fit_transform(training_data_tf)
    testing_data_tfidf = vectorizer_tfidf.transform(testing_data_tf)
    # make predictions
    nb_tf = naive_bayes_prediction(training_data_tf, testing_data_tf, training.label.values)
    lr_tf = logistic_regression_prediction(training_data_tf, testing_data_tf, training.label.values)
    nb_idf = naive_bayes_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    lr_idf = logistic_regression_prediction(training_data_tfidf, testing_data_tfidf, training.label.values)
    return nb_tf, lr_tf, nb_idf, lr_idf, testing.label.values

In [17]:
def print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels):
    """
    print the metrics from predictions classifiers
    """
    print('\nNaive bayes tf metrics: ')
    print_metrics(labels, nb_tf)
    print('\nLogistic regression tf metrics: ')
    print_metrics(labels, lr_tf)
    print('\nNaive bayes tfidf metrics: ')
    print_metrics(labels, nb_idf)
    print('\nLogistic regression tfidf metrics: ')
    print_metrics(labels, lr_idf)

__clasiffier by category__

In [18]:
print('Books classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Books classifier

training shape:  (2000, 3)
testing shape:  (4465, 3)

Naive bayes tf metrics: 
Precision score:  0.8358940198231208
Recall score:  0.8313514536437823
F1 score:  0.8300080595172086
Accuracy score:  0.8304591265397536

Logistic regression tf metrics: 
Precision score:  0.838528342912385
Recall score:  0.8384359502506891
F1 score:  0.8384699815477556
Accuracy score:  0.838521836506159

Naive bayes tfidf metrics: 
Precision score:  0.8441014036371306
Recall score:  0.8352453430258973
F1 score:  0.8331421200272877
Accuracy score:  0.8340425531914893

Logistic regression tfidf metrics: 
Precision score:  0.8494902622512832
Recall score:  0.8491386022736211
F1 score:  0.848808140860472
Accuracy score:  0.8488241881298992


In [19]:
print('Dvd classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([dvd_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Dvd classifier

training shape:  (2000, 3)
testing shape:  (3586, 3)

Naive bayes tf metrics: 
Precision score:  0.822643614054657
Recall score:  0.8226632858974203
F1 score:  0.8226409107876074
Accuracy score:  0.822643614054657

Logistic regression tf metrics: 
Precision score:  0.8358560784380924
Recall score:  0.8353349490598208
F1 score:  0.8353765950824774
Accuracy score:  0.8354712771890686

Naive bayes tfidf metrics: 
Precision score:  0.8502951451182075
Recall score:  0.849856111997158
F1 score:  0.8496628436558256
Accuracy score:  0.8496932515337423

Logistic regression tfidf metrics: 
Precision score:  0.8440918159881093
Recall score:  0.8437274567426095
F1 score:  0.843770616278925
Accuracy score:  0.8438371444506414


In [20]:
print('Electronics classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([electronics_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Electronics classifier

training shape:  (2000, 3)
testing shape:  (5681, 3)

Naive bayes tf metrics: 
Precision score:  0.8498558577763676
Recall score:  0.8498667851239587
F1 score:  0.8498497085095853
Accuracy score:  0.8498503784544974

Logistic regression tf metrics: 
Precision score:  0.8589449629502246
Recall score:  0.8585681408716328
F1 score:  0.8585986319799219
Accuracy score:  0.8586516458370005

Naive bayes tfidf metrics: 
Precision score:  0.8618312933694336
Recall score:  0.8607590967367065
F1 score:  0.8605044142759881
Accuracy score:  0.8605879246611512

Logistic regression tfidf metrics: 
Precision score:  0.8666370357145956
Recall score:  0.866533517894025
F1 score:  0.8665542253323844
Accuracy score:  0.8665727864812534


In [21]:
print('Kitchen classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

Kitchen classifier

training shape:  (2000, 3)
testing shape:  (5945, 3)

Naive bayes tf metrics: 
Precision score:  0.8752071426596483
Recall score:  0.8747579343763632
F1 score:  0.8747992238201788
Accuracy score:  0.8748528174936921

Logistic regression tf metrics: 
Precision score:  0.8780543578938982
Recall score:  0.8780681357998618
F1 score:  0.8780481041854347
Accuracy score:  0.8780487804878049

Naive bayes tfidf metrics: 
Precision score:  0.8776409361086013
Recall score:  0.8769164636767445
F1 score:  0.8769612003219092
Accuracy score:  0.8770395290159798

Logistic regression tfidf metrics: 
Precision score:  0.8820583486758249
Recall score:  0.8816620817089047
F1 score:  0.8817043310095292
Accuracy score:  0.8817493692178301


__complete classifier__

In [22]:
print('complete classifier\n')
nb_tf, lr_tf, nb_idf, lr_idf, labels = category_classifier([books_path, dvd_path, electronics_path, kitchen_path])
print_predictions(nb_tf, lr_tf, nb_idf, lr_idf, labels)

complete classifier

training shape:  (8000, 3)
testing shape:  (19677, 3)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Naive bayes tf metrics: 
Precision score:  0.8590759757721138
Recall score:  0.8576454692166957
F1 score:  0.8573751116094139
Accuracy score:  0.8574986024292321

Logistic regression tf metrics: 
Precision score:  0.8729639009833036
Recall score:  0.8729330448449437
F1 score:  0.8729414014551151
Accuracy score:  0.8729481120089444

Naive bayes tfidf metrics: 
Precision score:  0.8709388676971344
Recall score:  0.8673292012671421
F1 score:  0.8668074753843702
Accuracy score:  0.8671037251613559

Logistic regression tfidf metrics: 
Precision score:  0.8762298669677598
Recall score:  0.8762220387401352
F1 score:  0.876200524914296
Accuracy score:  0.8762006403415155
