# Multy-Domain Sentiment Dataset: Sentiment Analisys per domain

## 1. Preprocesamiento
Para el preprocesamiento, se creó la clase "Corpus" que recibe una serie de documentos y obtiene las etiquetas de cada uno, el vocabulario, y las representaciones tf y tfidf.

In [57]:
import os
import pandas as pd
import numpy as np
import math
from scipy.sparse import coo_array

POSITIVE_LABEL = 1
NEGATIVE_LABEL = 0

CATEGORY_PATHS = {
    "books": "../data/books/",
    "dvd": "../data/dvd/",
    "electronics": "../data/electronics/",
    "kitchen": "../data/kitchen/",
}


class Corpus:
    # Atributes
    __vocabulary = {}
    __tf = None
    __tfidf = None
    __documents = None
    __doc_frequencies = None


    def __init__(self, docs: list, vocabulary: dict = None) -> None:
        """
        Creates the vocabulary, the TF and the TFIDF matricees from docs.
        
            Params
            ------
                docs: pd.DataFrame
                    List of documents.
                
                vocabulary: dict | None (default: None)
                    A dictionary where the keys are the terms in the vocabulary
                    and the values are each term's unique id. If set to None,
                    this class will create the vocabulary based on docs.
                    WARNING: if specified, vocabulary must contain a "#UNK#" key
                    for unknown terms.
        """        
        self.__documents = self.__get_docs_df(docs)
        if vocabulary is None:
            self.__vocabulary = self.__load_vocabulary()
        else:
            self.__vocabulary = vocabulary
            
        self.__tf = self.__load_tf()
        self.__tfidf = self.__load_tfidf()

    def __get_docs_df(self, docs: list):
        documents = []
        for doc in docs:
            terms = self.__get_term_counts(doc)
            label = self.__get_label(doc)
            documents.append([terms, label])
        
        return pd.DataFrame(documents, columns=["terms", "label"])

    def __get_term_counts(self, line: str) -> list:
        line_arr = line.split()
        line_arr = line_arr[:-1] # Remove #label#: from the end of array
        terms = {}
        for term in line_arr:
            term_arr = term.split(":")
            terms[term_arr[0]] = int(term_arr[1])
        
        return terms
    
    def __get_label(self, doc: str) -> int:
        doc_arr = doc.split()
        label_str = doc_arr[-1]
        label = label_str.split(":")[-1]
        if label.lower() == "negative":
            return NEGATIVE_LABEL
        elif label.lower() == "positive":
            return POSITIVE_LABEL
    
    def __load_vocabulary(self):
        # Term counts in the whole corpus
        voc = {}
        docs = self.__documents
        for i in range(len(docs)):
            terms_dict = docs.loc[i, "terms"]
            for term in terms_dict:
                if term in voc:
                    voc[term] += terms_dict[term]
                else:
                    voc[term] = terms_dict[term]
        
        # Replace terms with one appearance with UNK
        terms_to_del = []
        for term in voc:
            if voc[term] == 1:
                terms_to_del.append(term)

        # Remove terms that appeare only once
        for term in terms_to_del:
            voc.pop(term)
        
        # Assign unique ids to terms in vocabulary
        bow = {"#UNK#": 0}
        i = 1
        for term in voc:
            bow[term] = i
            i += 1
        
        return bow

    def __load_tf(self) -> np.ndarray:
        voc = self.__vocabulary
        docs = self.__documents
        data = []
        x = []
        y = []
        for i in range(len(docs)):
            doc = docs.iloc[i]
            for term in doc["terms"]:
                if term in voc:
                    data.append(doc["terms"][term])
                    x.append(i)
                    y.append(voc[term])
                else:
                    data.append(doc["terms"][term])
                    x.append(i)
                    y.append(voc["#UNK#"])
        
        sparce_matrix = coo_array((data, (x,y)), shape=(len(docs), len(voc)), dtype=np.uint64).tocsr()
        return sparce_matrix

    def __load_tfidf(self) -> np.ndarray:
        voc = self.__vocabulary
        docs = self.__documents
        tfs = self.__tf
        doc_frec = np.zeros(len(voc))
        # Calculate document frequencies
        for i in range(len(docs)):
            doc = docs.loc[i, "terms"]
            added_unk = False
            for term in doc:
                if term in voc:
                    term_id = voc[term]
                    doc_frec[term_id] += 1
                elif not added_unk:
                    term_id = voc["#UNK#"]
                    doc_frec[term_id] += 1
                    added_unk = True

        total_docs = len(docs)
        data = []
        x = []
        y = []
        for i in range(len(docs)):
            doc = docs.iloc[i]
            unk_tf = 0
            for term in doc["terms"]:
                if term in voc:
                    term_id = voc[term]
                    tf = doc["terms"][term]
                    df = int(doc_frec[term_id])
                    data.append(math.log10(1+tf) * math.log10(total_docs / df))
                    x.append(i)
                    y.append(term_id)
                else:
                    unk_tf += doc["terms"][term]
            
            # Calculate UNK tfidf
            term_id = voc["#UNK#"]
            tf = int(tfs[i, term_id])
            df = int(doc_frec[term_id])
            temp = math.log10(1+tf) * math.log10(total_docs / df)
            if temp < 0:
                raise Exception("HP")
            data.append(math.log10(1+tf) * math.log10(total_docs / df))
            x.append(i)
            y.append(term_id)

        sparce_matrix = coo_array((data, (x, y)), shape=(len(docs), len(voc)), dtype=np.float64).tocsr()        
        return sparce_matrix

    def getVocabulary(self) -> dict:
        return self.__vocabulary
    
    def getDocuments(self) -> pd.DataFrame:
        return self.__documents

    def getTfs(self):
        return self.__tf
    
    def getTfidfs(self):
        return self.__tfidf

# 2. Construcción del modelo y métricas por categoría

In [48]:
def calculate_metrics(labels: list, predicted_labels: list) -> dict:
    tp = 0
    fp = 0
    fn = 0
    tn = 0

    for i in range(len(labels)):
        act     = labels[i]
        pred    = predicted_labels[i]
        if(act == 1 and pred == 1):
            tp += 1
        elif pred == 1 and act == 0:
            fp += 1
        elif pred == 0 and act == 1:
            fn += 1
        elif pred == 0 and act == 0:
            tn += 1
    
    presition = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 =  2 * presition * recall / (presition + recall)

    return {"presition": presition, "recall": recall, "f1": f1}
    


In [49]:
from sklearn.naive_bayes import MultinomialNB

def test_nv_classifier(X_train: np.ndarray, y_train, X_test: np.ndarray, y_test) -> dict:
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    return calculate_metrics(y_test, y_predicted)

In [50]:
import os

def get_docs_from_folder_paths(folder_paths: list):
    train_docs = []
    test_docs = []
    for path in folder_paths:
        positive_path = os.path.join(path, "positive.review")
        negative_path = os.path.join(path, "negative.review")
        test_path = os.path.join(path, "unlabeled.review")

        f = open(positive_path)
        line = f.readline()
        while line != "":
            train_docs.append(line)
            line = f.readline()
        f.close()

        f = open(negative_path)
        line = f.readline()
        while line != "":
            train_docs.append(line)
            line = f.readline()
        f.close()

        f = open(test_path)
        line = f.readline()
        while line != "":
            test_docs.append(line)
            line = f.readline()
        f.close()

    return train_docs, test_docs

In [51]:
def test_multinomial_nv_by_cat():
    for cat in CATEGORY_PATHS:
        print("=== Multinomial NV for:", cat, "===")
        folder_path = CATEGORY_PATHS[cat]
        train_docs, test_docs = get_docs_from_folder_paths([folder_path])
        train_corpus = Corpus(train_docs)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_train = train_corpus.getDocuments()["label"]
        y_test = test_corpus.getDocuments()["label"]

        # Test with TFs
        print("TFs: ", end="")
        X_train = train_corpus.getTfs()
        X_test = test_corpus.getTfs()
        print(test_nv_classifier(X_train, y_train, X_test, y_test))

        # Test with TFIDFs
        print("TFIDFs: ", end="")
        X_train = train_corpus.getTfidfs()
        X_test = test_corpus.getTfidfs()
        print(test_nv_classifier(X_train, y_train, X_test, y_test))

In [59]:
test_multinomial_nv_by_cat()

=== Multinomial NV for: books ===
TFs: {'presition': 0.8158333333333333, 'recall': 0.8648409893992933, 'f1': 0.8396226415094339}
TFIDFs: {'presition': 0.827893175074184, 'recall': 0.8626325088339223, 'f1': 0.8449059052563271}
=== Multinomial NV for: dvd ===


KeyboardInterrupt: 

In [69]:
from sklearn.linear_model import LogisticRegression
def test_lr_classifier(X_train: np.ndarray, y_train, X_test: np.ndarray, y_test) -> dict:
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_predicted = model.predict(X_test)
    return calculate_metrics(y_test, y_predicted)

In [70]:
def test_lr_classifier_by_cat():
    for cat in CATEGORY_PATHS:
        print("=== LR for:", cat, "===")
        folder_path = CATEGORY_PATHS[cat]
        train_docs, test_docs = get_docs_from_folder_paths([folder_path])
        train_corpus = Corpus(train_docs)
        test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
        y_train = train_corpus.getDocuments()["label"]
        y_test = test_corpus.getDocuments()["label"]

        # Test with TFs
        print("TFs: ", end="")
        X_train = train_corpus.getTfs().toarray()
        X_test = test_corpus.getTfs()
        print(test_lr_classifier(X_train, y_train, X_test, y_test))

        # Test with TFIDFs
        print("TFIDFs: ", end="")
        X_train = train_corpus.getTfidfs().toarray()
        X_test = test_corpus.getTfidfs()
        print(test_lr_classifier(X_train, y_train, X_test, y_test))
        

In [71]:
test_lr_classifier_by_cat()

=== LR for: books ===
TFs: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'presition': 0.8192617734408146, 'recall': 0.8529151943462897, 'f1': 0.835749837697468}
TFIDFs: {'presition': 0.8560965101249461, 'recall': 0.8776501766784452, 'f1': 0.8667393675027263}
=== LR for: dvd ===


## 3. Construcción del modelo y métricas con un compilado de todas las categorías

In [None]:
def test_nv_classifier_complete():
    print("=== TEST NV FOR FULL CORPUS ===")
    folder_paths = []
    for cat in CATEGORY_PATHS:
        folder_paths.append(CATEGORY_PATHS[cat])

    train_docs, test_docs = get_docs_from_folder_paths(folder_paths)
    train_corpus = Corpus(train_docs)
    test_corpus = Corpus(test_docs, train_corpus.getVocabulary())
    y_train = train_corpus.getDocuments()["label"]
    y_test = test_corpus.getDocuments()["label"]

    # Test with TFs
    print("TFs: ", end="")
    X_train = train_corpus.getTfs()
    X_test = test_corpus.getTfs()
    print(test_lr_classifier(X_train, y_train, X_test, y_test))

    # Test with TFIDFs
    print("TFIDFs: ", end="")
    X_train = train_corpus.getTfidfs()
    X_test = test_corpus.getTfidfs()
    print(test_lr_classifier(X_train, y_train, X_test, y_test))

In [None]:
test_nv_classifier_complete()

=== TEST NV FOR FULL CORPUS ===


MemoryError: Unable to allocate 7.39 GiB for an array with shape (8000, 123996) and data type float64