In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import pickle
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
import math

In [None]:
def openfile(fileName):
    """
    Load a pickled two-element (corpus, labels) pair from disk.

    :param fileName: path of the pickle file to read
    :return: tuple ``(texto, etiquetas)`` unpacked from the object stored in the file
    """
    # NOTE: unpickling can run arbitrary code; only open files you trust.
    fh = open(fileName, "rb")
    try:
        contenido = pickle.load(fh)
    finally:
        fh.close()
    # The stored object must unpack into exactly two items.
    corpus, labels = contenido
    return corpus, labels

In [None]:
def saveRep(representacion,etiquetas, outputFile):
    """
    Pickle a representation together with its labels into one file.

    :param representacion: feature representation to store
    :param etiquetas: labels that go with the representation
    :param outputFile: path of the file to write (opened in "wb" mode, so it is overwritten)
    """
    # Both pieces travel as a single (representacion, etiquetas) tuple.
    payload = (representacion, etiquetas)
    with open(outputFile, "wb") as destino:
        pickle.dump(payload, destino)

In [None]:
def representationBOW(corpus,character=False):
    """
    Fit a TF-IDF vectorizer on ``corpus`` and return the fitted matrix.

    Two configurations are available:

    * ``character`` falsy (default): n-gram range (1, 2) with NLTK's Spanish
      stop-word list passed as ``stop_words``.
    * ``character`` truthy: n-gram range (3, 5) with ``analyzer="char_wb"``.

    Both share the same document-frequency limits:

    * ``max_df=0.95`` -- a float, so it is a proportion of documents.
    * ``min_df=2`` -- an integer, so it is an absolute document count.

    :param corpus: iterable of documents handed to ``fit_transform``
    :param character: selects the character n-gram configuration when truthy
    :return: tuple ``(bow, vectorizer)`` with the transformed corpus and the
             fitted vectorizer, so the same vocabulary can be applied to other data
    """
    limites = {"max_df": 0.95, "min_df": 2}
    if not character:
        # The stop-word list is only looked up when the word configuration is used.
        especificos = {"ngram_range": (1, 2), "stop_words": stopwords.words("spanish")}
    else:
        especificos = {"ngram_range": (3, 5), "analyzer": "char_wb"}

    vectorizador = TfidfVectorizer(**especificos, **limites)
    return vectorizador.fit_transform(corpus), vectorizador

In [None]:
# Load the train and test pickles; openfile returns a (texts, labels) pair for each.
textoTrain, etiquetasTrain = openfile("preproceso_train")
textoTest, etiquetasTest = openfile("preproceso_test")
# Sanity check: report how many documents each split contains.
print("Lectura de ficheros correcta. Documentos de train: %d Documentos de test: %d " %(len(textoTrain),len(textoTest)))

In [None]:
def intervalo95(p,datos,z=1.96):
    """
    Normal-approximation confidence interval for a proportion, as text.

    Computes ``p +/- z * sqrt(p * (1 - p) / datos)``. With the default
    ``z=1.96`` this is the 95% interval. The bounds are not clipped, so they
    can fall slightly outside [0, 1] when ``p`` is close to 0 or 1.

    :param p: observed proportion of correct predictions, in [0, 1]
    :param datos: number of samples in the test set, must be positive
    :param z: critical value that scales the standard error (default 1.96)
    :return: string of the form ``"[low , high]"`` with three decimals
    :raises ValueError: if ``datos`` is not positive or ``p`` is outside [0, 1]
    """
    # Fail with an explicit message instead of a bare ZeroDivisionError or
    # "math domain error" when the inputs cannot yield a valid interval.
    if datos <= 0:
        raise ValueError("datos must be a positive number of samples, got %r" % (datos,))
    if not 0 <= p <= 1:
        raise ValueError("p must be a proportion in [0, 1], got %r" % (p,))

    e = z*math.sqrt((p*(1-p))/datos)
    return "[%.3f , %.3f]" % (p-e,p+e)

In [None]:
def cross_validation(clasificador,xtrain,xlabels,bloques,scoring='f1_macro'):
    """
    Cross-validate a classifier and return its mean score.

    Prints one summary line with the mean score and twice the standard
    deviation across folds. The line is labelled with the metric actually
    used; it was previously labelled "Accuracy" although the metric was
    macro-averaged F1.

    :param clasificador: estimator passed to ``cross_val_score``
    :param xtrain: training features
    :param xlabels: training labels
    :param bloques: value forwarded as ``cv`` (e.g. the number of folds)
    :param scoring: metric forwarded as ``scoring`` (default ``'f1_macro'``)
    :return: mean of the per-fold scores
    """
    scores = cross_val_score(clasificador, xtrain, xlabels, cv=bloques, scoring=scoring)
    media = scores.mean()
    # A scoring callable has no metric string, so fall back to its name.
    etiqueta = scoring if isinstance(scoring, str) else getattr(scoring, "__name__", "score")
    print("%s: %0.2f (+/- %0.2f)" % (etiqueta, media, scores.std() * 2))
    return media

# Evaluación de n-gramas

In [None]:
# Fit the default (non-character) TF-IDF configuration on the training texts.
bowTrain,vectorizerTrain = representationBOW(textoTrain)
print("Representacion Train OK ")
# Reuse the fitted vectorizer (transform only) to represent the test texts.
bowTest = vectorizerTrain.transform(textoTest)
print("Representacion Test OK")

In [None]:
# Manual grid over SVC settings on the word n-gram matrices: one model per
# (kernel, C) pair, trained on bowTrain and scored on bowTest.
cs = [1,10,100,1000,10000]
kernel= ["linear","rbf"]
filas = [
    "c \t \t k \t \t Accuracy \t \t Inter95 \n",
    "---------------------SVM--------------------------\n",
]
# Kernels vary slowest, so every C is tried for a kernel before moving on.
for k, c in [(nucleo, coste) for nucleo in kernel for coste in cs]:
    clf = svm.SVC(C = c,kernel=k)
    clf.fit(bowTrain,etiquetasTrain)
    # clf.score on the test split is the value shown in the Accuracy column.
    p = clf.score(bowTest,etiquetasTest)
    aux = "%d \t \t  %s \t \t %.4f \t \t %s \n" % (c,k[0:3],p, intervalo95(p,len(etiquetasTest)))
    print(aux)
    filas.append(aux)
# Full table: header, separator and one row per configuration.
res = "".join(filas)
print(res)

In [None]:
# Manual sweep over the C parameter of logistic regression on the word n-gram
# matrices: one model per C, trained on bowTrain and scored on bowTest.
cs = [1,10,100,1000,10000]
filas = [
    "c \t \t Accuracy \t \t Inter95 \n",
    "---------------------Logistic--------------------------\n",
]
for c in cs:
    clf = LogisticRegression(C=c)
    clf.fit(bowTrain,etiquetasTrain)
    # clf.score on the test split is the value shown in the Accuracy column.
    p = clf.score(bowTest,etiquetasTest)
    aux = "%d \t \t %.3f \t \t %s \n" %(c,p,intervalo95(p,len(etiquetasTest)))
    print(aux)
    filas.append(aux)
# Full table: header, separator and one row per value of C.
res = "".join(filas)
print(res)

# Evaluación Char-ngramas

In [None]:
# Fit the character n-gram TF-IDF configuration (character=True) on the training texts.
bowTrainCaracteres,vectorizerTrainCaracteres = representationBOW(textoTrain, character=True)
# Reuse the fitted vectorizer (transform only) to represent the test texts.
bowTestCaracteres = vectorizerTrainCaracteres.transform(textoTest)

In [None]:
# Manual grid over SVC settings on the character n-gram matrices: one model per
# (kernel, C) pair, trained on bowTrainCaracteres and scored on bowTestCaracteres.
cs = [1,10,100,1000,10000]
kernel= ["linear","rbf"]
filas = [
    "c \t \t k \t \t Accuracy \t \t Inter95 \n",
    "---------------------SVM--------------------------\n",
]
# Kernels vary slowest, so every C is tried for a kernel before moving on.
for k, c in [(nucleo, coste) for nucleo in kernel for coste in cs]:
    clf = svm.SVC(C = c,kernel=k)
    clf.fit(bowTrainCaracteres,etiquetasTrain)
    # clf.score on the test split is the value shown in the Accuracy column.
    p = clf.score(bowTestCaracteres,etiquetasTest)
    aux = "%d \t \t  %s \t \t %.4f \t \t %s \n" % (c,k[0:3],p, intervalo95(p,len(etiquetasTest)))
    print(aux)
    filas.append(aux)
# Full table: header, separator and one row per configuration.
res = "".join(filas)
print(res)

In [None]:
# Manual sweep over the C parameter of logistic regression on the character
# n-gram matrices: one model per C, trained on bowTrainCaracteres and scored
# on bowTestCaracteres.
cs = [1,10,100,1000,10000]
filas = [
    "c \t \t Accuracy \t \t Inter95 \n",
    "---------------------Logistic--------------------------\n",
]
for c in cs:
    clf = LogisticRegression(C=c)
    clf.fit(bowTrainCaracteres,etiquetasTrain)
    # clf.score on the test split is the value shown in the Accuracy column.
    p = clf.score(bowTestCaracteres,etiquetasTest)
    aux = "%d \t \t %.3f \t \t %s \n" %(c,p,intervalo95(p,len(etiquetasTest)))
    print(aux)
    filas.append(aux)
# Full table: header, separator and one row per value of C.
res = "".join(filas)
print(res)

In [None]:
# Persist the character n-gram matrices with their labels, one pickle file per split.
saveRep(bowTrainCaracteres,etiquetasTrain,"ncharRepTrain")
saveRep(bowTestCaracteres,etiquetasTest,"ncharRepTest")