# Multi-Domain Sentiment Dataset: Sentiment Analysis per domain

## 1. Preprocesamiento
Para el preprocesamiento, se creó la clase "Corpus" que recibe una serie de documentos y obtiene las etiquetas de cada uno, el vocabulario, y las representaciones tf y tfidf.

In [64]:
import os
import pandas as pd
import numpy as np
import math
from scipy.sparse import coo_array
from lex import senticnet as lexicon

In [65]:
# Integer class labels that Corpus assigns to documents whose label token is
# "positive" / "negative".
POSITIVE_LABEL = 1
NEGATIVE_LABEL = 0

# Maps each product category name to the folder that holds its review files
# ("positive.review", "negative.review" and "unlabeled.review").
CATEGORY_PATHS = {
    "books": "../data/books/",
    "dvd": "../data/dvd/",
    "electronics": "../data/electronics/",
    "kitchen": "../data/kitchen/",
}


class Corpus:
    """
    Bag-of-words corpus built from pre-tokenized review lines.

    Each input document is a whitespace-separated line of ``term:count``
    tokens whose last token carries the label (e.g. ``#label#:positive``).
    From those lines the class derives the labels, the vocabulary, the TF and
    TF-IDF sparse matrices and a lexicon-based feature matrix.
    """

    # Attributes (all of them are populated by __init__)
    __vocabulary = None
    __tf = None
    __tfidf = None
    __documents = None
    __feature_matrix = None
    __voc_lst = None
    __mood_lst = None

    def __init__(self, docs: list, vocabulary: dict = None) -> None:
        """
        Creates the vocabulary, the TF and the TFIDF matrices from docs.

            Params
            ------
                docs: list
                    List of documents, one string per document.

                vocabulary: dict | None (default: None)
                    A dictionary where the keys are the terms in the vocabulary
                    and the values are each term's unique id. If set to None,
                    this class will create the vocabulary based on docs.
                    WARNING: if specified, vocabulary must contain a "#UNK#" key
                    for unknown terms.
        """
        self.__documents = self.__get_docs_df(docs)
        if vocabulary is None:
            self.__vocabulary, self.__voc_lst = self.__load_vocabulary()
        else:
            self.__vocabulary = vocabulary
            # Invert the term -> id mapping into an id -> term list.
            voc_lst = [""] * len(vocabulary)
            for term, term_id in vocabulary.items():
                voc_lst[term_id] = term
            self.__voc_lst = voc_lst

        self.__tf = self.__load_tf()
        self.__tfidf = self.__load_tfidf()
        self.__feature_matrix, self.__mood_lst = self.__load_lex_features()

    def __get_docs_df(self, docs: list):
        """
        Turns a list of documents into a pd.DataFrame with "terms" and "label"
        columns. Blank lines (e.g. a trailing empty line in a file) are
        skipped because they carry neither terms nor a label.

            Params
            ------
                docs: list
                    A list with documents.
        """
        documents = []
        for doc in docs:
            if not doc.strip():
                continue
            documents.append([self.__get_term_counts(doc), self.__get_label(doc)])

        return pd.DataFrame(documents, columns=["terms", "label"])

    def __get_term_counts(self, line: str) -> dict:
        """
        Turns a document into a dictionary where keys are terms and values are
        the frequencies of those terms.

        Raises ValueError when a token is not of the form ``term:count``.
        """
        tokens = line.split()[:-1]  # The last token is the label, not a term
        terms = {}
        for token in tokens:
            # Split on the LAST colon so terms that contain ":" themselves
            # (urls, times, ...) are kept whole instead of breaking int().
            pieces = token.rsplit(":", 1)
            if len(pieces) != 2:
                raise ValueError(f"Malformed token (expected term:count): {token!r}")
            terms[pieces[0]] = int(pieces[1])

        return terms

    def __get_label(self, doc: str) -> "int | None":
        """
        Returns the label of a document: NEGATIVE_LABEL or POSITIVE_LABEL
        depending on the text after the last ":" of the last token, or None
        when that text is neither "negative" nor "positive".
        """
        label = doc.split()[-1].split(":")[-1].lower()
        if label == "negative":
            return NEGATIVE_LABEL
        if label == "positive":
            return POSITIVE_LABEL
        return None

    def __load_vocabulary(self):
        """
        Extracts the vocabulary from the documents and returns a tuple.
        The first entry of the tuple contains a dictionary where keys are
        terms of the vocabulary and values are a unique id for each term.
        The second entry of the tuple contains a list where position k holds
        the term whose id is k. Id 0 is reserved for "#UNK#".
        """
        # Term counts in the whole corpus
        totals = {}
        for terms in self.__documents["terms"]:
            for term, count in terms.items():
                totals[term] = totals.get(term, 0) + count

        # Terms that appear only once are dropped; they fall under "#UNK#"
        kept_terms = [term for term, total in totals.items() if total != 1]

        # Assign unique ids to terms in vocabulary
        voc_lst = ["#UNK#"] + kept_terms
        bow = {term: term_id for term_id, term in enumerate(voc_lst)}

        return bow, voc_lst

    def __load_tf(self):
        """
        Creates and returns the tf representation of the documents as a
        scipy CSR sparse array of shape (documents, vocabulary size).
        Counts of terms outside the vocabulary are accumulated in the
        "#UNK#" column (duplicate coordinates are summed on conversion).
        """
        voc = self.__vocabulary
        docs = self.__documents
        unk_id = voc["#UNK#"]
        data = []
        rows = []
        cols = []
        for i, terms in enumerate(docs["terms"]):
            for term, count in terms.items():
                data.append(count)
                rows.append(i)
                cols.append(voc.get(term, unk_id))

        sparse_matrix = coo_array(
            (data, (rows, cols)), shape=(len(docs), len(voc)), dtype=np.uint64
        ).tocsr()
        return sparse_matrix

    def __load_tfidf(self):
        """
        Creates and returns the tf-idf representation of the documents as a
        scipy CSR sparse array of shape (documents, vocabulary size). Each
        weight is log10(1 + tf) * log10(total_docs / df).
        """
        voc = self.__vocabulary
        docs = self.__documents
        tfs = self.__tf
        unk_id = voc["#UNK#"]
        total_docs = len(docs)

        # Calculate document frequencies ("#UNK#" counts once per document)
        doc_frec = np.zeros(len(voc))
        for terms in docs["terms"]:
            added_unk = False
            for term in terms:
                if term in voc:
                    doc_frec[voc[term]] += 1
                elif not added_unk:
                    doc_frec[unk_id] += 1
                    added_unk = True

        data = []
        rows = []
        cols = []
        for i, terms in enumerate(docs["terms"]):
            for term, tf in terms.items():
                if term not in voc:
                    continue
                term_id = voc[term]
                df = int(doc_frec[term_id])
                data.append(math.log10(1 + tf) * math.log10(total_docs / df))
                rows.append(i)
                cols.append(term_id)

            # Calculate UNK tfidf. A document without unknown terms has tf 0,
            # whose weight is 0 anyway; skipping it also avoids dividing by a
            # zero document frequency when no document has unknown terms.
            unk_tf = int(tfs[i, unk_id])
            unk_df = int(doc_frec[unk_id])
            if unk_tf > 0 and unk_df > 0:
                data.append(math.log10(1 + unk_tf) * math.log10(total_docs / unk_df))
                rows.append(i)
                cols.append(unk_id)

        sparse_matrix = coo_array(
            (data, (rows, cols)), shape=(len(docs), len(voc)), dtype=np.float64
        ).tocsr()
        return sparse_matrix

    def __load_lex_features(self):
        """
        Creates and returns a vector representation of the documents using the
        following features from a lexicon:
            - Count of positive words ("PWC", column 0)
            - Count of negative words ("NWC", column 1)
            - Count of words with each mood (for example, count of words
                whose mood is "joy"), one column per mood.
        Terms missing from the lexicon are split on "_" and each piece is
        looked up on its own. Returns a tuple (feature matrix, feature names).
        """
        docs = self.__documents

        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # column layout is the same on every run (set order is not).
        moods = list(dict.fromkeys(lexicon[term][4] for term in lexicon))
        mood_column = {mood: index + 2 for index, mood in enumerate(moods)}
        moods_lst = ["PWC", "NWC"] + moods

        feature_matrix = np.zeros((len(docs), len(moods_lst)))

        for i, terms in enumerate(docs["terms"]):
            row = feature_matrix[i]
            for term, term_frec in terms.items():
                words = [term] if term in lexicon else term.split("_")
                for word in words:
                    if word not in lexicon:
                        continue
                    entry = lexicon[word]
                    # Anything that is not tagged "positive" counts as negative
                    polarity_column = 0 if entry[6] == "positive" else 1
                    row[polarity_column] += term_frec
                    row[mood_column[entry[4]]] += term_frec

        return feature_matrix, moods_lst

    def getVocabulary(self) -> dict:
        """
        Returns a dictionary where keys are the terms in the vocabulary
        and values are the unique id of those terms.
        """
        return self.__vocabulary

    def getDocuments(self) -> pd.DataFrame:
        """
        Returns a pandas dataframe where each row represents
        a document of the corpus. The dataframe has two columns: terms and
        label which represent the terms of the document and its label respectively.
        """
        return self.__documents

    def getTfs(self):
        """
        Returns the TF matrix of the corpus.
        """
        return self.__tf

    def getTfidfs(self):
        """
        Returns the TFIDF matrix of the corpus.
        """
        return self.__tfidf

    def getFeatureMatrix(self):
        """
        Returns the feature matrix of the corpus.
        """
        return self.__feature_matrix

    def getVocLst(self):
        """
        Returns a list with the vocabulary of the corpus.
        """
        return self.__voc_lst

    def getMoodsLst(self):
        """
        Returns a list where the k-th entry contains the name
        of the k-th feature in the feature matrix
        """
        return self.__mood_lst

## 2. Construcción del modelo y métricas por categoría

In [66]:
def calculate_metrics(labels: list, predicted_labels: list) -> dict:
    """
    Given the true labels and the predicted labels, calculates and returns
    the following metrics inside a dictionary:
        - presition
        - recall
        - f1 score
        - accuracy

    Labels are expected to be 1 (positive) or 0 (negative); pairs holding any
    other value are ignored. A metric whose denominator is zero (for example
    precision when nothing was predicted positive, or any metric on empty
    input) is reported as 0.0 instead of raising ZeroDivisionError.

    Raises IndexError when predicted_labels is shorter than labels.
    """
    if len(predicted_labels) < len(labels):
        raise IndexError("predicted_labels is shorter than labels")

    tp = fp = fn = tn = 0
    # zip walks the values in order, so a pandas Series with a non-default
    # index is handled the same way as a plain list.
    for act, pred in zip(labels, predicted_labels):
        if act == 1 and pred == 1:
            tp += 1
        elif pred == 1 and act == 0:
            fp += 1
        elif pred == 0 and act == 1:
            fn += 1
        elif pred == 0 and act == 0:
            tn += 1

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0
        return numerator / denominator if denominator else 0.0

    presition = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * presition * recall, presition + recall)
    accuracy = _ratio(tp + tn, tp + fp + fn + tn)

    return {"presition": presition, "recall": recall, "f1": f1, "accuracy": accuracy}
    


In [67]:
from sklearn.naive_bayes import MultinomialNB

def test_nv_classifier(X_train: np.ndarray, y_train, X_test: np.ndarray, y_test) -> dict:
    """
    Fits a Multinomial Naive Bayes model on the training data, predicts the
    labels of the test data and returns the metrics computed by
    calculate_metrics for those predictions.
    """
    classifier = MultinomialNB().fit(X_train, y_train)
    return calculate_metrics(y_test, classifier.predict(X_test))

In [68]:
import os

def get_docs_from_folder_paths(folder_paths: list, test_only = False):
    """
    Given a list of folder_paths, returns the train and test documents
    read from those folders, one document per line (line endings are kept).

    For every folder, test documents come from "unlabeled.review" and train
    documents from "positive.review" followed by "negative.review".

        Params
        ------
            folder_paths: list
                Folders to read from.

            test_only: bool (default: False)
                If True, only the "unlabeled.review" files are read and a
                single list with the test documents of ALL folders is
                returned. Otherwise a (train_docs, test_docs) tuple is
                returned.
    """
    train_docs = []
    test_docs = []
    for path in folder_paths:
        # Context managers close the file even if reading fails midway.
        with open(os.path.join(path, "unlabeled.review")) as f:
            test_docs.extend(f)

        if test_only:
            continue

        for file_name in ("positive.review", "negative.review"):
            with open(os.path.join(path, file_name)) as f:
                train_docs.extend(f)

    # Decide the return shape after the loop so every folder is visited and
    # an empty folder list still yields a list when test_only is set.
    if test_only:
        return test_docs

    return train_docs, test_docs

In [69]:
from sklearn.linear_model import LogisticRegression
def test_lr_classifier(X_train: np.ndarray, y_train, X_test: np.ndarray, y_test,
                       penalty = None, solver = "lbfgs", max_iter = 100) -> dict:
    """
    Fits a Logistic Regression model on the training data and evaluates it
    on the test data. Returns a tuple with the metrics computed by
    calculate_metrics and the fitted model's coefficients (coef_). See the
    sklearn LogisticRegression documentation for the meaning of "penalty",
    "solver" and "max_iter".
    """
    classifier = LogisticRegression(penalty=penalty, solver=solver, max_iter=max_iter)
    classifier.fit(X_train, y_train)
    metrics = calculate_metrics(y_test, classifier.predict(X_test))
    return metrics, classifier.coef_

In [70]:
def print_metrics(metrics: dict, end=None):
    """
    Prints precision, recall, f1 and accuracy (in that order) on one line,
    separated by ";". "end" is forwarded to print.
    """
    ordered_keys = ("presition", "recall", "f1", "accuracy")
    values = [metrics[key] for key in ordered_keys]
    print(*values, sep=";", end=end)

In [71]:
def print_weights(weights, map):
    """
    Prints the five entries of the first row of "weights" with the largest
    absolute value, as "label: |weight|;" pairs after a "WEIGHTS:" prefix.
    "map" translates a column index into its label.
    """
    ranked = sorted(
        enumerate(weights[0, :]),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )
    print("WEIGHTS:", end="")
    for index, value in ranked[:5]:
        print(map[index], ": ", abs(value), sep="", end=";")

    print()



In [72]:
def test_by_cat():
    """
    For each category in CATEGORY_PATHS, builds a train and a test corpus
    (the test corpus reuses the train vocabulary) and evaluates a Multinomial
    Naive Bayes model and a Logistic Regression model on three document
    representations:
        - Tf
        - Tfidf
        - Feature matrix
    Metrics are printed for every model; for Logistic Regression the
    largest weights are printed as well.
    """
    # One row per representation:
    # (heading, matrix getter, densify the LR training matrix?,
    #  extra LR arguments, getter of the column names used for the weights)
    representations = (
        ("TFs: ", Corpus.getTfs, True, {"max_iter": 4000}, Corpus.getVocLst),
        ("TFIDFs: ", Corpus.getTfidfs, True, {}, Corpus.getVocLst),
        ("Feature matrix: ", Corpus.getFeatureMatrix, False, {"max_iter": 4000}, Corpus.getMoodsLst),
    )

    for cat, folder_path in CATEGORY_PATHS.items():
        print("=== Test for:", cat, "===")
        train_docs, test_docs = get_docs_from_folder_paths([folder_path])
        train_corpus = Corpus(train_docs)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_train = train_corpus.getDocuments()["label"]
        y_test = test_corpus.getDocuments()["label"]

        # Multinomial Naive Bayes
        for heading, get_matrix, _, _, _ in representations:
            print(heading, end="")
            X_train = get_matrix(train_corpus)
            X_test = get_matrix(test_corpus)
            print(test_nv_classifier(X_train, y_train, X_test, y_test))

        # Logistic Regression
        print("=== LR for:", cat, "===")
        for heading, get_matrix, densify, lr_kwargs, get_names in representations:
            print(heading, end="")
            X_train = get_matrix(train_corpus)
            if densify:
                # Sparse training matrices are converted to dense arrays
                X_train = X_train.toarray()
            X_test = get_matrix(test_corpus)
            metrics, weights = test_lr_classifier(X_train, y_train, X_test, y_test, **lr_kwargs)
            print(metrics)
            print_weights(weights, get_names(train_corpus))

        print("")

In [73]:
test_by_cat()

=== Test for: books ===
TFs: {'presition': 0.8158333333333333, 'recall': 0.8648409893992933, 'f1': 0.8396226415094339, 'accuracy': 0.832474804031355}
TFIDFs: {'presition': 0.827893175074184, 'recall': 0.8626325088339223, 'f1': 0.8449059052563271, 'accuracy': 0.839417693169093}
Feature matrix: {'presition': 0.6186172616357886, 'recall': 0.6046819787985865, 'f1': 0.6115702479338844, 'accuracy': 0.6105263157894737}
=== LR for: books ===
TFs: {'presition': 0.7816091954022989, 'recall': 0.8710247349823321, 'f1': 0.823898057238354, 'accuracy': 0.8111982082866741}
WEIGHTS:excellent: 37.24507250183929;bad: 31.587808695050633;boring: 26.352893753232596;disappointing: 22.9574894592133;easy: 22.57896255079269;
TFIDFs: {'presition': 0.8695848375451264, 'recall': 0.851148409893993, 'f1': 0.8602678571428571, 'accuracy': 0.8597984322508399}
WEIGHTS:excellent: 8.236708564381647;bad: 7.636252900271601;great: 7.102789921222948;waste: 6.711613188045029;boring: 6.650839822384189;
Feature matrix: {'presiti

## 3. Construcción del modelo y métricas con un compilado de todas las categorías

In [76]:
def test_complete():
    """
    Builds a single training corpus with the documents of every category in
    CATEGORY_PATHS. Then, for each category, builds a test corpus from that
    category's test documents (reusing the training vocabulary) and
    evaluates a Multinomial Naive Bayes model and a Logistic Regression
    model on three document representations:
        - Tf
        - Tfidf
        - Feature matrix
    Metrics are printed for every model. Logistic Regression weights are
    printed only for the first three Logistic Regression runs.
    """
    # One row per representation:
    # (heading, matrix getter, extra LR arguments,
    #  getter of the column names used for the weights)
    representations = (
        ("TFs: ", Corpus.getTfs, {"max_iter": 4000}, Corpus.getVocLst),
        ("TFIDFs: ", Corpus.getTfidfs, {}, Corpus.getVocLst),
        ("Feature matrix:", Corpus.getFeatureMatrix, {"max_iter": 4000}, Corpus.getMoodsLst),
    )

    folder_paths = list(CATEGORY_PATHS.values())
    train_docs, _ = get_docs_from_folder_paths(folder_paths)
    train_corpus = Corpus(train_docs)
    y_train = train_corpus.getDocuments()["label"]

    printed_weights = 0
    for cat, path in CATEGORY_PATHS.items():
        print("Testing for", cat)
        test_docs = get_docs_from_folder_paths([path], test_only=True)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_test = test_corpus.getDocuments()["label"]

        # Multinomial Naive Bayes
        print("NV")
        for heading, get_matrix, _, _ in representations:
            print(heading, end="")
            X_train = get_matrix(train_corpus)
            X_test = get_matrix(test_corpus)
            print(test_nv_classifier(X_train, y_train, X_test, y_test))

        # Logistic Regression
        print("LR")
        for heading, get_matrix, lr_kwargs, get_names in representations:
            print(heading, end="")
            X_train = get_matrix(train_corpus)
            X_test = get_matrix(test_corpus)
            metrics, weights = test_lr_classifier(X_train, y_train, X_test, y_test, **lr_kwargs)
            print(metrics)
            if printed_weights < 3:
                print_weights(weights, get_names(train_corpus))
                printed_weights += 1

        print()


In [77]:
test_complete()

Testing for books
NV
TFs: {'presition': 0.7981438515081206, 'recall': 0.911660777385159, 'f1': 0.8511340206185567, 'accuracy': 0.8382978723404255}
TFIDFs: {'presition': 0.8163841807909604, 'recall': 0.8935512367491166, 'f1': 0.8532264867144665, 'accuracy': 0.844120940649496}
Feature matrix:{'presition': 0.6261930010604454, 'recall': 0.5216431095406361, 'f1': 0.5691566265060242, 'accuracy': 0.5995520716685331}
LR
TFs: {'presition': 0.8279248505550811, 'recall': 0.8564487632508834, 'f1': 0.8419452887537994, 'accuracy': 0.8369540873460246}
WEIGHTS:excellent: 76.02172434422741;disappointing: 64.37684487765047;waste: 61.127173477835825;boring: 59.566383395697635;terrible: 55.16151092296514;
TFIDFs: {'presition': 0.8689105403011514, 'recall': 0.8666077738515902, 'f1': 0.8677576293675365, 'accuracy': 0.8660694288913774}
WEIGHTS:great: 20.3351464280531;excellent: 17.55605045493851;not: 16.279194895570335;bad: 14.31918989044432;waste: 13.788237297348847;
Feature matrix:{'presition': 0.638856476