# Multi-Domain Sentiment Dataset: Sentiment Analysis per domain

In [6]:
import os
import pandas as pd
import numpy as np
import math

# Class labels attached to the documents loaded from positive.review and
# negative.review respectively.
POSITIVE_LABEL = 1
NEGATIVE_LABEL = 0

class Corpus:
    """Labelled bag-of-words corpus with raw term counts and TF-IDF weights.

    A corpus is read from a folder containing ``positive.review`` and
    ``negative.review``. Each non-blank line of those files is one document,
    written as whitespace-separated ``term:count`` tokens followed by one
    final label token (e.g. ``#label#:positive``). The label token is
    discarded; the label is taken from the file the line came from.
    """

    # Attributes (class-level defaults; __init__ replaces each one on the
    # instance).
    __documents = None  # DataFrame with columns "terms" (dict) and "label"
    __vocabulary = {}   # term -> column index in the tf / tfidf matrices
    __tf = []           # (n_docs, n_terms) array of raw term counts
    __tfidf = []        # (n_docs, n_terms) array of TF-IDF weights

    def __init__(self, foler_path) -> None:
        """Load both review files from ``foler_path`` and build all matrices.

        Prints ``Done tf`` once the count matrix is ready, then prints the
        TF-IDF matrix.

        Args:
            foler_path: Folder containing ``positive.review`` and
                ``negative.review``.

        Raises:
            OSError: If either review file cannot be opened.
            ValueError: If a token is not of the form ``term:count`` with an
                integer count.
        """
        path_positives = os.path.join(foler_path, "positive.review")
        path_negatives = os.path.join(foler_path, "negative.review")

        # Positives first, then negatives, indexed 0..n-1 in a single frame.
        rows = self.__load_documents(path_positives, POSITIVE_LABEL)
        rows += self.__load_documents(path_negatives, NEGATIVE_LABEL)
        self.__documents = pd.DataFrame(rows, columns=["terms", "label"])

        self.__load_vocabulary()
        self.__load_tf()
        print("Done tf")
        self.__load_tfidf()
        print(self.__tfidf)

    def __load_documents(self, path: str, label):
        """Read one review file into a list of ``[terms, label]`` rows.

        Args:
            path: File with one document per line.
            label: Label stored with every document read from ``path``.

        Returns:
            A list of ``[terms_dict, label]`` pairs, in file order.
        """
        documents = []
        with open(path) as f:
            for line in f:
                # Whitespace-only lines (e.g. a trailing newline) are not
                # reviews; skipping them avoids phantom empty documents.
                if not line.strip():
                    continue
                documents.append([self.__get_term_counts(line), label])
        return documents

    def __get_term_counts(self, line: str) -> dict:
        """Parse one document line into a ``{term: count}`` dictionary.

        Args:
            line: Whitespace-separated ``term:count`` tokens followed by a
                final label token.

        Returns:
            Mapping from each term to its integer count. If a term is
            repeated on the line, the last occurrence wins.

        Raises:
            ValueError: If a token has no ``:`` separator or its count is not
                an integer.
        """
        tokens = line.split()[:-1]  # Drop the trailing "#label#:..." token
        terms = {}
        for token in tokens:
            # Split on the LAST colon so terms that contain ':' stay intact.
            term, count = token.rsplit(":", 1)
            terms[term] = int(count)
        return terms

    def __load_vocabulary(self):
        """Build the term -> column-index mapping from all loaded documents."""
        voc = set()
        for terms in self.__documents["terms"]:
            voc.update(terms)

        # Sorting makes the column order reproducible across runs (plain set
        # iteration order depends on string hashing).
        self.__vocabulary = {
            term: index for index, term in enumerate(sorted(voc))
        }

    def __load_tf(self):
        """Fill the dense (n_docs, n_terms) matrix of raw term counts."""
        voc = self.__vocabulary
        docs = self.__documents
        tf = np.zeros((len(docs), len(voc)))

        for row, terms in enumerate(docs["terms"]):
            for term, count in terms.items():
                tf[row, voc[term]] = count

        self.__tf = tf

    def __load_tfidf(self):
        """Compute ``log10(1 + tf) * log10(N / df)`` for every matrix cell.

        ``N`` is the number of documents and ``df`` the number of documents
        in which the term (column) has a non-zero count.
        """
        tfs = self.__tf
        total_docs = len(self.__documents)

        # One document frequency per term, i.e. per COLUMN of the tf matrix.
        doc_freq = np.count_nonzero(tfs, axis=0)
        # A column with df == 0 is all zeros, so its weights are 0 whatever
        # the idf is; clamping to 1 only avoids a division by zero.
        idf = np.log10(total_docs / np.maximum(doc_freq, 1))

        # Work in place on a single copy to keep peak memory at two matrices.
        tfidf = tfs + 1.0
        np.log10(tfidf, out=tfidf)
        tfidf *= idf  # Broadcasts the per-term idf across all documents

        self.__tfidf = tfidf


    
# Build the corpus from the ../data/books/ folder; the constructor prints
# "Done tf" and then the TF-IDF matrix.
Corpus("../data/books/")

Done tf
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
