In [1]:
# Importaciones requeridas del programa

import requests
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import RandomForestClassifier as RF

  from numpy.core.umath_tests import inner1d


In [2]:
def tokenizarFrase(frase):
    """Send a phrase to the remote FreeLing service and return its analysis.

    The phrase is written to ``temporalFile.txt`` in the current working
    directory and that file is uploaded as the multipart field ``file``.

    Parameters
    ----------
    frase : str
        Text to analyse.

    Returns
    -------
    object
        The decoded JSON body of the service response (``r.json()``).
        NOTE(review): the caller iterates it as sentences of word dicts with
        ``'lemma'`` and ``'tag'`` keys -- confirm against the service docs.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails.
    ValueError
        If the response body is not valid JSON.
    """
    nombreArchivoTemporal = 'temporalFile.txt'

    # Explicit UTF-8 on both the write and the read: the text round-trips
    # unchanged regardless of the platform's default codec, and phrases with
    # characters outside the locale encoding no longer fail on write.
    with open(nombreArchivoTemporal, 'w', encoding='utf-8') as archivoTemporalConFrase:
        archivoTemporalConFrase.write(frase)

    params = {'outf': 'tagged', 'format': 'json'}
    url = "http://www.corpus.unam.mx/servicio-freeling/analyze.php"

    # The upload handle is closed as soon as the request finishes; it used to
    # be left open on every call (one leaked descriptor per review).
    with open(nombreArchivoTemporal, 'r', encoding='utf-8') as archivoParaEnviar:
        r = requests.post(url, files={'file': archivoParaEnviar}, params=params)

    return r.json()

In [3]:
def obtenerDictionaryDeSenticol():
    """Load the sentiment lexicon from ``MLSenticon.txt``.

    The file is read as cp1252 text, one entry per line, with tab-separated
    fields. The first field becomes the key and the second field the value.
    Values are kept as strings exactly as they appear in the file.

    Blank lines (including the empty trailing element produced when the file
    ends with a newline) are skipped instead of raising ``IndexError``.

    Returns
    -------
    dict
        Mapping from first field to second field for every non-blank line.

    Raises
    ------
    FileNotFoundError
        If ``MLSenticon.txt`` is not in the current working directory.
    IndexError
        If a non-blank line has no tab-separated second field.
    """
    diccionarioDePalabrasConSentimientos = {}
    # ``with`` guarantees the handle is closed; it was previously never closed.
    with open('MLSenticon.txt', 'r', encoding='cp1252') as archivoDeBaseDeDatos:
        lineas = archivoDeBaseDeDatos.read().split('\n')

    for linea in lineas:
        if not linea.strip():
            # Nothing to parse on an empty / whitespace-only line.
            continue
        campos = linea.split('\t')
        diccionarioDePalabrasConSentimientos[campos[0]] = campos[1]
    return diccionarioDePalabrasConSentimientos

In [4]:
def leerBaseDeDatosDeReviews():
    """Read the labelled reviews from ``database.txt``.

    Each non-blank line is split on tabs; the first field is the review text
    and the second field is its label. Labels are returned as the raw strings
    found in the file.

    Blank lines (including the empty trailing element produced when the file
    ends with a newline) are skipped instead of raising ``IndexError``.

    Returns
    -------
    tuple[list, list]
        ``(reviews, tipoDeReview)``: two parallel lists of review texts and
        their labels, in file order.

    Raises
    ------
    FileNotFoundError
        If ``database.txt`` is not in the current working directory.
    IndexError
        If a non-blank line has no tab-separated second field.
    """
    # ``with`` closes the file even if reading raises.
    with open('database.txt', 'r') as archivoDeBaseDeDatos:
        reviewsData = archivoDeBaseDeDatos.read().split('\n')

    reviews, tipoDeReview = [], []
    for reviewData in reviewsData:
        if not reviewData.strip():
            # Skip empty / whitespace-only lines such as a trailing newline.
            continue
        temporalData = reviewData.split('\t')
        reviews.append(temporalData[0])
        tipoDeReview.append(temporalData[1])
    return reviews, tipoDeReview

In [5]:
def representacionVectorial(reviewTokenizado, diccionarioDeSentimientos):
    """Turn a tokenized review into a two-component sentiment vector.

    Parameters
    ----------
    reviewTokenizado : iterable of iterables of dict
        Sentences, each a sequence of word dicts exposing ``'lemma'`` and
        ``'tag'`` keys.
    diccionarioDeSentimientos : dict
        Maps a lemma to a score convertible with ``float``.

    Returns
    -------
    list
        ``[a, b]`` with both entries non-negative. While the polarity is
        positive, positive scores accumulate in ``a`` and negative scores in
        ``b``; while it is negative the two destinations are swapped. The
        absolute value of each running sum is returned.

    Notes
    -----
    Only words whose tag starts with ``'A'`` and whose lemma is in the
    dictionary contribute (presumably adjectives in the tagger's tag set --
    TODO confirm). Each occurrence of ``'pero'`` or ``'no'`` flips the
    polarity. ``'.'`` and ``','`` contribute nothing.

    NOTE(review): the polarity is never reset at a phrase or sentence
    boundary, so a negation keeps affecting every later word until the next
    negation. Confirm this is intended before changing it -- it alters the
    features.
    """
    negadores = ('pero', 'no')
    separadores = ('.', ',')
    polaridad = 1
    acumulado = [0, 0]

    for oracion in reviewTokenizado:
        for token in oracion:
            lema = token['lemma']

            # Phrase separators carry no score of their own.
            if lema in separadores:
                continue

            # A negation word toggles which slot later scores go to.
            if lema in negadores:
                polaridad = -polaridad
                continue

            if token['tag'][0] != 'A' or lema not in diccionarioDeSentimientos:
                continue

            puntaje = float(diccionarioDeSentimientos[lema])
            parte_positiva = puntaje if puntaje > 0 else 0
            parte_negativa = puntaje if puntaje < 0 else 0

            if polaridad > 0:
                acumulado[0] += parte_positiva
                acumulado[1] += parte_negativa
            else:
                acumulado[0] += parte_negativa
                acumulado[1] += parte_positiva

    # Flip the sign of any negative total so both components are magnitudes.
    return [valor * (-1 if valor < 0 else 1) for valor in acumulado]

In [6]:
def error_measures(Yestimado, Yteorico):
    """Compute sensitivity and specificity from predicted and true labels.

    Parameters
    ----------
    Yestimado : array-like
        Predicted labels.
    Yteorico : array-like
        Ground-truth labels.

    Returns
    -------
    tuple
        ``(sens, esp)`` where ``sens = TP / (TP + FN)`` and
        ``esp = TN / (TN + FP)``, read from the top-left 2x2 corner of
        ``confusion_matrix(Yteorico, Yestimado)``.

    Notes
    -----
    Index 0 is treated as the negative class and index 1 as the positive
    class. NOTE(review): this presumes a binary problem in which both classes
    occur in the inputs -- confirm against the callers.
    """
    matriz = confusion_matrix(Yteorico, Yestimado)

    # First row feeds the specificity, second row feeds the sensitivity.
    verdaderos_negativos, falsos_positivos = matriz[0][0], matriz[0][1]
    falsos_negativos, verdaderos_positivos = matriz[1][0], matriz[1][1]

    sensibilidad = verdaderos_positivos / (verdaderos_positivos + falsos_negativos)
    especificidad = verdaderos_negativos / (verdaderos_negativos + falsos_positivos)
    return sensibilidad, especificidad

In [7]:
def vecinosMasCercanos(data, target):
    """Evaluate a default ``KNN`` classifier over 100 random train/test splits.

    For every repetition the data is split with ``train_test_split`` (default
    settings), the model is refit on the training part, and the sensitivity,
    specificity (via ``error_measures``) and ``model.score`` on the test part
    are recorded. The mean and standard deviation of each metric are printed.

    Parameters
    ----------
    data : array-like
        Feature vectors, one per sample.
    target : array-like
        Labels aligned with ``data``.

    Returns
    -------
    None
        Results are only printed.
    """
    model = KNN()
    acc = []
    sens = []
    esp = []

    for _ in range(100):
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, target)
        model.fit(Xtrain, Ytrain)

        Yest = model.predict(Xtest)
        s, e = error_measures(Yest, Ytest)
        sens.append(s)
        esp.append(e)
        acc.append(model.score(Xtest, Ytest))

    # The header used to say "Regresión logística (Lineal)" (copied from the
    # logistic-regression cell), which mislabelled these results.
    print("\nResultados con K vecinos más cercanos (KNN)\n")
    print("Accuracy: ", np.mean(acc), "+/-", np.std(acc))
    print("Sensitivity: ", np.mean(sens), "+/-", np.std(sens))
    print("Especificity: ", np.mean(esp), "+/-", np.std(esp))

In [8]:
def regresionLogistica(data, target):
    """Evaluate a default ``LogisticRegression`` over 100 random splits.

    Each repetition draws a fresh ``train_test_split`` (default settings),
    refits the classifier on the training part and records, on the test part,
    the sensitivity and specificity from ``error_measures`` together with
    ``score``. The mean and standard deviation of each metric are printed.

    Parameters
    ----------
    data : array-like
        Feature vectors, one per sample.
    target : array-like
        Labels aligned with ``data``.

    Returns
    -------
    None
        Results are only printed.
    """
    clasificador = LogisticRegression()
    exactitudes, sensibilidades, especificidades = [], [], []

    for _ in range(100):
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, target)
        clasificador.fit(Xtrain, Ytrain)
        predicciones = clasificador.predict(Xtest)
        sensibilidad, especificidad = error_measures(predicciones, Ytest)
        sensibilidades.append(sensibilidad)
        especificidades.append(especificidad)
        exactitudes.append(clasificador.score(Xtest, Ytest))

    print("\nResultados con Regresión logística (Lineal)\n")
    resumen = (
        ("Accuracy: ", exactitudes),
        ("Sensitivity: ", sensibilidades),
        ("Especificity: ", especificidades),
    )
    for etiqueta, valores in resumen:
        print(etiqueta, np.mean(valores), "+/-", np.std(valores))

In [9]:
def clasificadorRandomForest(data, target):
    """Evaluate a default ``RF`` classifier over 100 random train/test splits.

    For every repetition the data is split with ``train_test_split`` (default
    settings), the model is refit on the training part, and the sensitivity,
    specificity (via ``error_measures``) and ``model.score`` on the test part
    are recorded. The mean and standard deviation of each metric are printed.

    Parameters
    ----------
    data : array-like
        Feature vectors, one per sample.
    target : array-like
        Labels aligned with ``data``.

    Returns
    -------
    None
        Results are only printed.
    """
    model = RF()
    acc = []
    sens = []
    esp = []

    for _ in range(100):
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(data, target)
        model.fit(Xtrain, Ytrain)

        Yest = model.predict(Xtest)
        s, e = error_measures(Yest, Ytest)
        sens.append(s)
        esp.append(e)
        acc.append(model.score(Xtest, Ytest))

    # The header used to say "Regresión logística (Lineal)" (copied from the
    # logistic-regression cell), which mislabelled these results.
    print("\nResultados con Random Forest\n")
    print("Accuracy: ", np.mean(acc), "+/-", np.std(acc))
    print("Sensitivity: ", np.mean(sens), "+/-", np.std(sens))
    print("Especificity: ", np.mean(esp), "+/-", np.std(esp))

In [10]:
# Load the sentiment lexicon and the labelled reviews.
diccionarioDeSentimientos = obtenerDictionaryDeSenticol()
data, target = leerBaseDeDatosDeReviews()

# Tokenize every review through the remote service and turn it into a
# sentiment vector (one network round trip per review).
dataRepresentadaEnVectores = []
for review in data:
    reviewTokenizado = tokenizarFrase(review)
    vectorDeReview = representacionVectorial(reviewTokenizado, diccionarioDeSentimientos)
    dataRepresentadaEnVectores.append(vectorDeReview)

# Run the three evaluations in the same order as before:
# logistic regression, nearest neighbours, random forest.
for evaluarClasificador in (regresionLogistica, vecinosMasCercanos, clasificadorRandomForest):
    evaluarClasificador(dataRepresentadaEnVectores, target)


Resultados con Regresión logística (Lineal)

Accuracy:  0.7525 +/- 0.0744460837637219
Sensitivity:  0.6411563177964106 +/- 0.11463012254642858
Especificity:  0.8887050678080091 +/- 0.08530670718040971

Resultados con Regresión logística (Lineal)

Accuracy:  0.6996428571428571 +/- 0.10511109283389025
Sensitivity:  0.725768832732455 +/- 0.1765907776119261
Especificity:  0.6855135836385837 +/- 0.3171868663973595

Resultados con Regresión logística (Lineal)

Accuracy:  0.6960714285714285 +/- 0.07684715150138977
Sensitivity:  0.5364834714219389 +/- 0.12449217508663374
Especificity:  0.8799413927575693 +/- 0.08205391029730864
