In [None]:
from os import listdir
from os.path import isfile,join
import xml.etree.ElementTree as ET
import re
from nltk.stem import SnowballStemmer
import pickle

In [None]:
def saveObject(texto, labels, outputFile):
    """
    Pickle the preprocessed texts together with their labels.

    Both arguments are stored as a single ``(texto, labels)`` tuple, so
    reading the file back with ``pickle.load`` yields that same pair.

    Args:
        texto: Preprocessed texts to persist.
        labels: Labels to persist alongside ``texto``.
        outputFile: Path of the file to write. It is opened in binary write
            mode, so an existing file at that path is overwritten.
    """
    # Named ``payload`` rather than ``object`` so the builtin is not shadowed.
    payload = (texto, labels)
    with open(outputFile, "wb") as fh:
        pickle.dump(payload, fh)

In [None]:
def Stemming(text):
    """
    Return ``text`` with every space-separated token replaced by its stem.

    A Spanish ``SnowballStemmer`` is created on each call. The input is split
    on single spaces only, and the stems are joined back with single spaces,
    so runs of spaces in the input are preserved in the output.
    """
    spanish_stemmer = SnowballStemmer("spanish")
    stems = []
    for token in text.split(" "):
        stems.append(spanish_stemmer.stem(token))
    return " ".join(stems)

In [None]:
# Raw strings keep the regex escapes (\W, \s, \w, \d) away from Python's own
# string-escape processing, which warns about unrecognised escape sequences
# in ordinary string literals.
clean_re = re.compile(r'\W+')  # defined for optional use; preprocessing() does not apply it
url_re = re.compile(r"https?://[^\s]+")
hashtag_re = re.compile(r"#(\w+)")
mention_re = re.compile(r"@(\w+)")
number_re = re.compile(r"\d+")


def preprocessing(text):
    """
    Normalise a text by replacing variable tokens with fixed placeholders.

    Steps, applied in this order:
    1- URLs are replaced by ``<url>``
    2- hashtags are replaced by ``<hashtag>``
    3- mentions are replaced by ``<mencion>``
    4- runs of digits are replaced by ``<numero>``
    5- the whole text is lowercased

    Punctuation stripping (``clean_re``) and stemming (``Stemming``) are
    intentionally not applied here.

    Args:
        text: The raw text to normalise.

    Returns:
        The normalised, lowercased text.
    """
    # URLs go first so that '#', '@' or digits inside a URL are consumed with it.
    text_clean = url_re.sub("<url>", text)
    text_clean = hashtag_re.sub("<hashtag>", text_clean)
    text_clean = mention_re.sub("<mencion>", text_clean)
    # Numbers go last: digits that were part of a hashtag or mention are already gone.
    text_clean = number_re.sub("<numero>", text_clean)
    return text_clean.lower()

In [None]:
def readXML(filename):
    """
    Read one XML file and return its tweets as a single preprocessed string.

    The children of the root's ``<documents>`` element are taken as the
    tweets. Their texts are joined with `` <FinTweet> `` as separator and the
    joined string is passed through ``preprocessing``.

    Args:
        filename: Path of the XML file to parse.

    Returns:
        The result of ``preprocessing`` applied to the concatenated tweets.

    Raises:
        TypeError: If the root element has no ``<documents>`` child
            (``find`` returns None, which cannot be iterated).
    """
    root = ET.parse(filename).getroot()
    tweets = root.find("documents")
    # An element without content has ``text`` set to None, which str.join
    # rejects; such tweets are treated as empty strings instead.
    author = " <FinTweet> ".join(tweet.text or "" for tweet in tweets)
    return preprocessing(author)


In [None]:
# Training labels
# Each line of es.txt is split on ":::"; the first field becomes the key and
# the second field the label. The file is opened in a ``with`` block so the
# handle is closed as soon as its content has been read.
with open("/Volumes/MARCOS E/PAN2018/train/es.txt") as tr_label_file:
    labels_b = [author.split(":::") for author in tr_label_file.read().split("\n")]
# Lines that do not contain the ":::" separator (such as an empty line) are skipped.
labels = {author[0]: author[1] for author in labels_b if len(author) > 1}
print("OK")

In [None]:
# Test labels
# Each line of es.txt is split on ":::"; the first field becomes the key and
# the second field the label. The file is opened in a ``with`` block so the
# handle is closed as soon as its content has been read.
with open("/Volumes/MARCOS E/PAN2018/test/es.txt") as te_label_file:
    labels_b = [author.split(":::") for author in te_label_file.read().split("\n")]
# Lines that do not contain the ":::" separator (such as an empty line) are skipped.
labels_test = {author[0]: author[1] for author in labels_b if len(author) > 1}
print("OK")


In [None]:
# Training corpus / labels
dirTrain = "/Volumes/MARCOS E/PAN2018/train/text"
# os.listdir returns entries in arbitrary order; sorting makes the order of
# the collected corpus (and of the file saved from it) reproducible.
filesTrain = sorted(listdir(dirTrain))
globalCorpusTrain = []
globalLabelsTrain = []
for filename in filesTrain:
    # The part of the file name before the first "." is looked up in ``labels``.
    author_id = filename.split(".")[0]
    gender = labels.get(author_id)
    if gender is None:
        # Only files whose name matches an author listed in es.txt are used.
        continue
    globalCorpusTrain.append(readXML(join(dirTrain, filename)))
    # "male" is encoded as -1; every other label value is encoded as 1.
    globalLabelsTrain.append(-1 if gender == "male" else 1)
print("Lectura train OK")

In [None]:
# Write the training corpus and its labels to disk, then print a confirmation.
saveObject(
    texto=globalCorpusTrain,
    labels=globalLabelsTrain,
    outputFile="preproceso_train",
)
print("Preproceso del texto guardado correctamente")

In [None]:
# Test corpus / labels
dirTest = "/Volumes/MARCOS E/PAN2018/test/es/text"
# os.listdir returns entries in arbitrary order; sorting makes the order of
# the collected corpus (and of the file saved from it) reproducible.
# The list gets its own name instead of reusing ``filesTrain`` for test files.
filesTest = sorted(listdir(dirTest))
globalCorpusTest = []
globalLabelsTest = []
for filename in filesTest:
    # The part of the file name before the first "." is looked up in ``labels_test``.
    author_id = filename.split(".")[0]
    gender = labels_test.get(author_id)
    if gender is None:
        # Only files whose name matches an author listed in es.txt are used.
        continue
    globalCorpusTest.append(readXML(join(dirTest, filename)))
    # "male" is encoded as -1; every other label value is encoded as 1.
    globalLabelsTest.append(-1 if gender == "male" else 1)
print("Lectura test OK")

In [None]:
# Write the test corpus and its labels to disk, then print a confirmation.
saveObject(
    texto=globalCorpusTest,
    labels=globalLabelsTest,
    outputFile="preproceso_test",
)
print("Preproceso del texto guardado correctamente")