# Multi-Domain Sentiment Dataset: Sentiment Analysis per domain

## 1. Preprocesamiento
Para el preprocesamiento, se creó la clase "Corpus" que recibe una serie de documentos y obtiene las etiquetas de cada uno, el vocabulario, y las representaciones tf y tfidf.

In [3]:
import os
import pandas as pd
import numpy as np
import math
from scipy.sparse import coo_array
from lex import senticnet as lexicon

In [4]:
# Integer class labels used as classification targets.
NEGATIVE_LABEL = 0
POSITIVE_LABEL = 1

# Product category -> folder that holds that category's review files.
CATEGORY_PATHS = {
    category: f"../data/{category}/"
    for category in ("books", "dvd", "electronics", "kitchen")
}


class Corpus:
    """Bag-of-words corpus built from pre-tokenised review lines.

    Every input document is one line of whitespace-separated ``term:count``
    tokens whose last token carries the label (its text after the final
    ``:`` is ``positive`` or ``negative``). From those lines the class
    derives:

    * a DataFrame with one row per document (``terms`` dict and ``label``),
    * a vocabulary mapping each term to a column id (``"#UNK#"`` collects
      every out-of-vocabulary term),
    * sparse TF and TF-IDF matrices of shape ``(n_docs, len(vocabulary))``,
    * a dense matrix of lexicon-based polarity and mood counts.
    """

    # Attributes (every one of them is replaced per instance in __init__)
    __vocabulary = {}
    __tf = None
    __tfidf = None
    __documents = None
    __feature_matrix = None

    def __init__(self, docs: list, vocabulary: dict = None) -> None:
        """
        Creates the vocabulary, the TF and the TFIDF matrices from docs.

            Params
            ------
                docs: list
                    List of documents, one raw ``term:count ...`` line each.

                vocabulary: dict | None (default: None)
                    A dictionary where the keys are the terms in the vocabulary
                    and the values are each term's unique id. If set to None,
                    this class will create the vocabulary based on docs.
                    WARNING: if specified, vocabulary must contain a "#UNK#" key
                    for unknown terms.
        """
        self.__documents = self.__get_docs_df(docs)
        if vocabulary is None:
            self.__vocabulary = self.__load_vocabulary()
        else:
            self.__vocabulary = vocabulary

        # Order matters: the TF-IDF step reads the "#UNK#" counts from the
        # TF matrix.
        self.__tf = self.__load_tf()
        self.__tfidf = self.__load_tfidf()
        self.__feature_matrix = self.__load_lex_features()

    def __get_docs_df(self, docs: list) -> pd.DataFrame:
        """Parse every raw line into a ``terms`` dict and an integer ``label``."""
        rows = [
            [self.__get_term_counts(doc), self.__get_label(doc)]
            for doc in docs
        ]
        return pd.DataFrame(rows, columns=["terms", "label"])

    def __get_term_counts(self, line: str) -> dict:
        """Return ``{term: count}`` for one raw line, ignoring its label token."""
        tokens = line.split()[:-1]  # The last token is the label, not a term
        terms = {}
        for token in tokens:
            pieces = token.split(":")
            terms[pieces[0]] = int(pieces[1])

        return terms

    def __get_label(self, doc: str) -> int:
        """Return the integer label encoded in the last token of ``doc``.

        Returns None when the label text is neither "positive" nor
        "negative".
        """
        label = doc.split()[-1].split(":")[-1].lower()
        if label == "negative":
            return NEGATIVE_LABEL
        if label == "positive":
            return POSITIVE_LABEL
        return None

    def __load_vocabulary(self) -> dict:
        """Build the term -> id mapping from the documents of this corpus.

        Terms whose total count over the whole corpus is exactly 1 are left
        out, so they end up in the "#UNK#" bucket (id 0). The remaining terms
        get ids 1, 2, ... in order of first appearance.
        """
        # Term counts in the whole corpus
        totals = {}
        for terms in self.__documents["terms"]:
            for term, count in terms.items():
                totals[term] = totals.get(term, 0) + count

        # Terms that appear only once are dropped from the vocabulary
        kept_terms = [term for term, total in totals.items() if total != 1]

        # Assign unique ids to terms in vocabulary
        bow = {"#UNK#": 0}
        for term_id, term in enumerate(kept_terms, start=1):
            bow[term] = term_id

        return bow

    def __load_tf(self):
        """Build the sparse (CSR) term-frequency matrix.

        Counts of out-of-vocabulary terms are accumulated in the "#UNK#"
        column: duplicate coordinates are summed when the COO matrix is
        converted to CSR.
        """
        voc = self.__vocabulary
        values, rows, cols = [], [], []
        for row, terms in enumerate(self.__documents["terms"]):
            for term, count in terms.items():
                values.append(count)
                rows.append(row)
                cols.append(voc[term] if term in voc else voc["#UNK#"])

        shape = (len(self.__documents), len(voc))
        return coo_array((values, (rows, cols)), shape=shape, dtype=np.uint64).tocsr()

    def __load_tfidf(self):
        """Build the sparse (CSR) TF-IDF matrix.

        Each weight is ``log10(1 + tf) * log10(n_docs / df)``, where ``df`` is
        the number of documents of THIS corpus that contain the term. All
        out-of-vocabulary terms of a document are treated as one "#UNK#"
        term.
        """
        voc = self.__vocabulary
        doc_terms = self.__documents["terms"]
        total_docs = len(doc_terms)

        # Calculate document frequencies ("#UNK#" counts once per document)
        doc_freq = np.zeros(len(voc))
        for terms in doc_terms:
            counted_unk = False
            for term in terms:
                if term in voc:
                    doc_freq[voc[term]] += 1
                elif not counted_unk:
                    doc_freq[voc["#UNK#"]] += 1
                    counted_unk = True

        values, rows, cols = [], [], []
        for row, terms in enumerate(doc_terms):
            for term, tf in terms.items():
                if term not in voc:
                    continue
                term_id = voc[term]
                df = int(doc_freq[term_id])
                values.append(math.log10(1 + tf) * math.log10(total_docs / df))
                rows.append(row)
                cols.append(term_id)

            # Calculate UNK tfidf. A document without unknown terms has a
            # weight of exactly 0, so it is skipped; this also avoids dividing
            # by a zero document frequency when no document has unknown terms.
            unk_id = voc["#UNK#"]
            unk_tf = int(self.__tf[row, unk_id])
            unk_df = int(doc_freq[unk_id])
            if unk_tf == 0 or unk_df == 0:
                continue
            values.append(math.log10(1 + unk_tf) * math.log10(total_docs / unk_df))
            rows.append(row)
            cols.append(unk_id)

        shape = (total_docs, len(voc))
        return coo_array((values, (rows, cols)), shape=shape, dtype=np.float64).tocsr()

    def __load_lex_features(self):
        """Build the dense lexicon feature matrix.

        Column 0 accumulates the counts of terms whose lexicon entry has
        ``"positive"`` at index 6, column 1 the counts of every other lexicon
        term, and the remaining columns hold one count per distinct value
        found at index 4 of the lexicon entries (called "mood" here).
        A term that is not in the lexicon is split on "_" and each part that
        is in the lexicon is counted with the frequency of the whole term.
        """
        docs = self.__documents

        # Sorted so the mood columns are the same on every run; iterating a
        # set of strings directly changes order between interpreter sessions.
        # NOTE(review): key=str assumes mood values have distinct string forms.
        moods = sorted({lexicon[term][4] for term in lexicon}, key=str)
        mood_column = {mood: index + 2 for index, mood in enumerate(moods)}

        feature_matrix = np.zeros((len(docs), len(moods) + 2))

        for row, terms in enumerate(docs["terms"]):
            for term, term_freq in terms.items():
                if term in lexicon:
                    matches = [term]
                else:
                    matches = [part for part in term.split("_") if part in lexicon]

                for entry in matches:
                    polarity_column = 0 if lexicon[entry][6] == "positive" else 1
                    feature_matrix[row, polarity_column] += term_freq
                    feature_matrix[row, mood_column[lexicon[entry][4]]] += term_freq

        return feature_matrix

    def getVocabulary(self) -> dict:
        """Return the term -> id mapping (includes the "#UNK#" entry)."""
        return self.__vocabulary

    def getDocuments(self) -> pd.DataFrame:
        """Return the per-document DataFrame with columns "terms" and "label"."""
        return self.__documents

    def getTfs(self):
        """Return the sparse term-frequency matrix."""
        return self.__tf

    def getTfidfs(self):
        """Return the sparse TF-IDF matrix."""
        return self.__tfidf

    def getFeatureMatrix(self):
        """Return the dense lexicon feature matrix."""
        return self.__feature_matrix

## 2. Construcción del modelo y métricas por categoría

In [5]:
def calculate_metrics(labels: list, predicted_labels: list) -> dict:
    """Compute binary classification metrics for the positive class (1).

    Params
    ------
        labels:
            Actual labels (0 or 1), in document order.
        predicted_labels:
            Predicted labels (0 or 1), in the same order as ``labels``.

    Returns
    -------
        dict with the keys "presition", "recall", "f1" and "accuracy".
        A metric whose denominator is zero (for example precision when
        nothing was predicted positive) is reported as 0.0.
    """
    tp = fp = fn = tn = 0

    # zip pairs the values by position, so a pandas Series with a
    # non-default index is still aligned with the predictions.
    for act, pred in zip(labels, predicted_labels):
        if act == 1 and pred == 1:
            tp += 1
        elif pred == 1 and act == 0:
            fp += 1
        elif pred == 0 and act == 1:
            fn += 1
        elif pred == 0 and act == 0:
            tn += 1

    def _ratio(numerator, denominator):
        # An undefined ratio is reported as 0.0 instead of raising
        return numerator / denominator if denominator else 0.0

    presition = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * presition * recall, presition + recall)
    accuracy = _ratio(tp + tn, tp + fp + fn + tn)

    return {"presition": presition, "recall": recall, "f1": f1, "accuracy": accuracy}
    


In [6]:
from sklearn.naive_bayes import MultinomialNB

def test_nv_classifier(X_train: np.ndarray, y_train, X_test: np.ndarray, y_test) -> dict:
    """Train a multinomial Naive Bayes model and score it on the test split.

    Fits ``MultinomialNB`` with its default settings on the training data,
    predicts the test data and returns the ``calculate_metrics`` dict for
    those predictions.
    """
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return calculate_metrics(y_test, predictions)

In [7]:
import os

def _read_review_lines(file_path: str) -> list:
    """Return every line of ``file_path``, line terminators included."""
    # The context manager closes the file even if reading fails
    with open(file_path) as review_file:
        return list(review_file)


def get_docs_from_folder_paths(folder_paths: list, test_only = False):
    """Load the raw review lines stored in one or more category folders.

    Each folder must contain "unlabeled.review" (used as test documents)
    and, unless ``test_only`` is set, "positive.review" and
    "negative.review" (used as training documents, in that order).

    Params
    ------
        folder_paths: list
            Folders to read, processed in the given order.
        test_only: bool (default: False)
            When True only the test documents are read and returned.

    Returns
    -------
        ``(train_docs, test_docs)``, or just ``test_docs`` when ``test_only``
        is True. With ``test_only`` the test documents of EVERY folder are
        returned, not only those of the first one.
    """
    train_docs = []
    test_docs = []
    for path in folder_paths:
        test_docs.extend(_read_review_lines(os.path.join(path, "unlabeled.review")))

        if test_only:
            continue

        train_docs.extend(_read_review_lines(os.path.join(path, "positive.review")))
        train_docs.extend(_read_review_lines(os.path.join(path, "negative.review")))

    if test_only:
        return test_docs

    return train_docs, test_docs

In [8]:
from sklearn.linear_model import LogisticRegression
def test_lr_classifier(
    X_train: np.ndarray,
    y_train,
    X_test: np.ndarray,
    y_test,
    penalty=None,
    solver="lbfgs",
    max_iter=100,
) -> dict:
    """Train a logistic regression model and score it on the test split.

    ``penalty``, ``solver`` and ``max_iter`` are forwarded unchanged to
    ``LogisticRegression``. The fitted model predicts the test data and the
    ``calculate_metrics`` dict for those predictions is returned.
    """
    classifier = LogisticRegression(penalty=penalty, solver=solver, max_iter=max_iter)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return calculate_metrics(y_test, predictions)

In [21]:
def print_metrics(metrics: dict, end=None):
    """Print precision, recall, F1 and accuracy on one ";"-separated line.

    ``end`` is forwarded to ``print``; None keeps print's default newline.
    """
    ordered_keys = ("presition", "recall", "f1", "accuracy")
    values = [metrics[key] for key in ordered_keys]
    print(*values, sep=";", end=end)

In [22]:
def test_by_cat():
    """Train and evaluate both classifiers separately for every category.

    For each entry of ``CATEGORY_PATHS`` a training corpus and a test corpus
    (sharing the training vocabulary) are built from that category's folder.
    Multinomial Naive Bayes and then logistic regression are each run on the
    TF, TF-IDF and lexicon feature matrices, and every metrics dict is
    printed. ``print_metrics`` can be used instead of ``print`` to get the
    values as one ";"-separated line.
    """
    # (printed title, Corpus getter that yields the matrix)
    nv_runs = (
        ("TFs: ", Corpus.getTfs),
        ("TFIDFs: ", Corpus.getTfidfs),
        ("Feature matrix: ", Corpus.getFeatureMatrix),
    )
    # (printed title, Corpus getter, densify the training matrix?, extra LR options)
    lr_runs = (
        ("TFs: ", Corpus.getTfs, True, {"max_iter": 4000}),
        ("TFIDFs: ", Corpus.getTfidfs, True, {}),
        ("Feature matrix: ", Corpus.getFeatureMatrix, False, {"max_iter": 4000}),
    )

    for cat, folder_path in CATEGORY_PATHS.items():
        # === MULTINOMIAL NV ===
        print("=== Test for:", cat, "===")
        train_docs, test_docs = get_docs_from_folder_paths([folder_path])
        train_corpus = Corpus(train_docs)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_train = train_corpus.getDocuments()["label"]
        y_test = test_corpus.getDocuments()["label"]

        for title, matrix_of in nv_runs:
            print(title, end="")
            X_train = matrix_of(train_corpus)
            X_test = matrix_of(test_corpus)
            print(test_nv_classifier(X_train, y_train, X_test, y_test))

        # === LR ===
        print("=== LR for:", cat, "===")
        for title, matrix_of, densify_train, lr_options in lr_runs:
            print(title, end="")
            X_train = matrix_of(train_corpus)
            if densify_train:
                # Only the sparse training matrices are converted to dense
                X_train = X_train.toarray()
            X_test = matrix_of(test_corpus)
            print(test_lr_classifier(X_train, y_train, X_test, y_test, **lr_options))

        print("")

In [23]:
# Run the per-category experiments; the metrics are printed to stdout.
test_by_cat()

=== Test for: books ===
0.8158333333333333;0.8648409893992933;0.8396226415094339;0.832474804031355;0.827893175074184;0.8626325088339223;0.8449059052563271;0.839417693169093;0.6186172616357886;0.6046819787985865;0.6115702479338844;0.6105263157894737;0.7816091954022989;0.8710247349823321;0.823898057238354;0.8111982082866741;0.8695848375451264;0.851148409893993;0.8602678571428571;0.8597984322508399;0.6093620546810273;0.6497349823321554;0.628901239846088;0.6111982082866741;
=== Test for: dvd ===
0.7673860911270983;0.8854454897620365;0.8221993833504625;0.8070273284997211;0.797514241325738;0.8522412838959601;0.8239700374531835;0.8165086447295036;0.6283505154639175;0.6745987825124515;0.6506538564184681;0.6349693251533742;0.7798582995951417;0.8527946873270614;0.8146973301612477;0.8045175683212493;0.8128772635814889;0.8942999446596569;0.8516469038208168;0.8430005577244841;0.6404230317273796;0.6032097399003874;0.621259618124822;0.6293920803123257;
=== Test for: electronics ===
0.8196090996475489

## 3. Construcción del modelo y métricas con un compilado de todas las categorías

In [24]:
def test_complete():
    """Train on all categories together and evaluate on each one separately.

    One training corpus is built from the training documents of every folder
    in ``CATEGORY_PATHS``. Then, per category, a test corpus that shares the
    training vocabulary is built from that category's test documents, and
    multinomial Naive Bayes followed by logistic regression are run on the
    TF, TF-IDF and lexicon feature matrices. Every metrics dict is printed;
    ``print_metrics`` can be used instead of ``print`` to get the values as
    one ";"-separated line.
    """
    all_paths = list(CATEGORY_PATHS.values())

    # Only the training half is used here; test documents are reloaded
    # per category below.
    train_docs, _ = get_docs_from_folder_paths(all_paths)
    train_corpus = Corpus(train_docs)
    y_train = train_corpus.getDocuments()["label"]

    # (printed title, Corpus getter that yields the matrix, extra LR options)
    runs = (
        ("TFs: ", Corpus.getTfs, {"max_iter": 4000}),
        ("TFIDFs: ", Corpus.getTfidfs, {}),
        ("Feature matrix:", Corpus.getFeatureMatrix, {"max_iter": 4000}),
    )

    for cat, path in CATEGORY_PATHS.items():
        print("Testing for", cat)
        test_docs = get_docs_from_folder_paths([path], test_only=True)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_test = test_corpus.getDocuments()["label"]

        # === NV ===
        print("NV")
        for title, matrix_of, _unused_options in runs:
            print(title, end="")
            X_train = matrix_of(train_corpus)
            X_test = matrix_of(test_corpus)
            print(test_nv_classifier(X_train, y_train, X_test, y_test))

        # === LR ===
        print("LR")
        for title, matrix_of, lr_options in runs:
            print(title, end="")
            X_train = matrix_of(train_corpus)
            X_test = matrix_of(test_corpus)
            print(test_lr_classifier(X_train, y_train, X_test, y_test, **lr_options))

        print()


In [25]:
# Run the all-categories experiment; the metrics are printed to stdout.
test_complete()

Testing for books
0.7981438515081206;0.911660777385159;0.8511340206185567;0.8382978723404255;0.8163841807909604;0.8935512367491166;0.8532264867144665;0.844120940649496;0.6261930010604454;0.5216431095406361;0.5691566265060242;0.5995520716685331;0.8279248505550811;0.8564487632508834;0.8419452887537994;0.8369540873460246;0.8689105403011514;0.8666077738515902;0.8677576293675365;0.8660694288913774;0.6384345794392523;0.482773851590106;0.5497987927565393;0.5991041433370661;
Testing for dvd
0.7984344422700587;0.9031543995572773;0.8475720592054012;0.8363078639152258;0.8283464566929134;0.8732706142778085;0.8502155172413793;0.8449525934188511;0.6217792902284881;0.707802988378528;0.662008281573499;0.6358059118795315;0.8247796785899429;0.8804648588821251;0.851713062098501;0.8455103179029559;0.8705035971223022;0.8705035971223022;0.8705035971223022;0.8694924707194646;0.6158623984710941;0.7133370226895407;0.661025641025641;0.6313441160066927;
Testing for electronics
0.8879699248120301;0.82674133706685