In [1]:
import json
import nltk
import operator
from nltk.corpus import stopwords as sw

# Root folder of the dataset. Every path in this notebook is built by plain
# string concatenation onto this value, so the trailing '/' is required.
# NOTE(review): hard-coded absolute path to one user's home directory; the
# notebook will not run elsewhere without editing this line.
directory='/home/patrosau@alumno.upv.es/Descargas/hispatweets/'

def getFileName(directory, country, userId):
    """Build the path of the JSON file that holds one user's tweets.

    Args:
        directory: dataset root, expected to end with '/'.
        country: name of the country sub-folder.
        userId: user identifier, used as the file's base name.

    Returns:
        The string ``<directory><country>/<userId>.json``.
    """
    # Use the parameter, not a global that merely happens to be called
    # `user` in the calling cell.
    return directory + country + '/' + userId + '.json'

In [2]:
from nltk.tokenize import TweetTokenizer

def tokenizeTweet(tweet):
    """Split a tweet's text into tokens.

    The tokenizer is configured to drop user mentions, shorten long runs of
    a repeated letter and lower-case the text.
    """
    options = dict(strip_handles=True, reduce_len=True, preserve_case=False)
    return TweetTokenizer(**options).tokenize(tweet)

In [3]:
# Training files grouped by sex label: {sex: [path of each user's tweet file]}.
sexdic = dict()
with open(directory + 'training.txt') as trainingFile:
    for line in trainingFile:
        # Remove the line terminator before splitting. Otherwise it stays
        # attached to `sex`, and a last line without a newline would be filed
        # under a different key than the rest of its class.
        line = line.strip()
        if not line:
            # A blank line has no 'user:::country:::sex' fields to unpack.
            continue
        [user, country, sex] = line.split(':::')
        file_Name = getFileName(directory, country, user)
        sexdic.setdefault(sex, []).append(file_Name)
        

In [4]:
def getFrequentWords(files):
    """Count how often every token appears in the tweets stored in ``files``.

    Nothing is filtered out: every token produced by ``tokenizeTweet`` is
    counted, because all of them are considered when telling male and
    female users apart.

    Args:
        files: iterable of paths. Each file is read line by line and every
            non-blank line is parsed as a JSON object with a 'text' entry.

    Returns:
        A list of ``(token, count)`` tuples sorted by count, highest first.
    """
    counts = dict()

    for file in files:
        with open(file) as f:
            for line in f:
                if not line.strip():
                    # json.loads would raise on an empty line.
                    continue
                for word in tokenizeTweet(json.loads(line)['text']):
                    counts[word] = counts.get(word, 0) + 1

    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)

In [5]:
# Ranked (word, count) list for each sex, computed over that sex's files.
frequentWordsPerSex = {
    sex: getFrequentWords(userFiles) for sex, userFiles in sexdic.items()
}

In [9]:
def wordUserByOtherSex(inWord, inSex):
    """Tell whether ``inWord`` is a top word of any sex other than ``inSex``.

    Only the first 500 entries of each ranking in ``frequentWordsPerSex``
    are inspected.

    Returns:
        1 if the word is found in another sex's first 500 entries, else 0.
    """
    for otherSex, ranking in frequentWordsPerSex.items():
        if otherSex == inSex:
            continue
        if any(entry[0] == inWord for entry in ranking[:500]):
            return 1
    return 0

In [13]:
# For each sex, keep those of its 500 most used words that do not show up
# among the 500 most used words of the other sexes (this takes a long time).
distinctWordsPerSex = {
    sex: [
        word[0]
        for word in ranking[:500]
        if wordUserByOtherSex(word[0], sex) == 0
    ]
    for sex, ranking in frequentWordsPerSex.items()
}

In [16]:
# Bag of words: the distinctive words of every sex, flattened into one list.
bolsaPalabras = [
    word for sex in distinctWordsPerSex for word in distinctWordsPerSex[sex]
]
len(bolsaPalabras)

91

In [17]:
def palabraIsUsed(palabra, uw):
    """Return 1 if ``palabra`` is the first item of any entry in ``uw``, else 0.

    Args:
        palabra: the word to look for.
        uw: iterable of indexable entries (e.g. ``(word, count)`` tuples).
    """
    return int(any(entry[0] == palabra for entry in uw))

In [18]:
def getFeaturesList(fileName):
    """Build the bag-of-words presence vector for a set of tweet files.

    Args:
        fileName: despite the name, this is the list of file paths that is
            handed to ``getFrequentWords``.

    Returns:
        A list with one entry per word of ``bolsaPalabras``, in the same
        order: 1 if the word occurs in the files, 0 otherwise.
    """
    # Collect the vocabulary once so each bag word is a constant-time
    # membership test instead of a scan over the whole ranking.
    usedWords = {entry[0] for entry in getFrequentWords(fileName)}
    return [1 if palabra in usedWords else 0 for palabra in bolsaPalabras]

In [19]:
def getTrainingData(user, country):
    """Return the feature vector of one user.

    The user's tweet file is located with ``getFileName`` under the global
    ``directory`` and passed, as a one-element list, to ``getFeaturesList``.
    """
    return getFeaturesList([getFileName(directory, country, user)])

In [21]:
# Write one comma-separated row per training user:
# user id, every value of the user's feature vector, then the sex label.
with open(directory + 'sexwordstrainingdata.txt', 'w') as train, \
        open(directory + 'training.txt') as trainingFile:
    for line in trainingFile:
        user, country, sex = line.split(':::')
        userVector = getTrainingData(user, country)
        # `sex` is the last field of the input line, so it still carries the
        # line terminator that ends the output row.
        train.write(user + ',' + ''.join(str(feature) + ',' for feature in userVector) + sex)

In [22]:
# Same row layout as the training export, built from the users of test.txt:
# user id, every value of the user's feature vector, then the sex label.
with open(directory + 'sexwordstestdata.txt', 'w') as train, \
        open(directory + 'test.txt') as trainingFile:
    for line in trainingFile:
        user, country, sex = line.split(':::')
        userVector = getTrainingData(user, country)
        # `sex` is the last field of the input line, so it still carries the
        # line terminator that ends the output row.
        train.write(user + ',' + ''.join(str(feature) + ',' for feature in userVector) + sex)

In [26]:
# Merge word features and counter features
# (e.g. sexwordstrainingdata.txt with sextrainingdata.txt).
def mergeData(wordsFile, countersFile):
    """Join, per user, the word features with the counter features.

    Both files are read from the global ``directory`` and are expected to
    hold comma-separated rows whose first field is the user id.

    Args:
        wordsFile: name of the word-feature file, with rows of the form
            ``user,f1,...,fn,sex``.
        countersFile: name of the counter-feature file.

    Returns:
        A dict mapping each user id to a list of strings: its word features
        followed by its counter features.

    Raises:
        KeyError: if the counters file contains a user that is missing from
            the words file.
    """
    wordsDataPerUser = dict()
    with open(directory + wordsFile) as wordsHandle:
        for line in wordsHandle:
            if not line.strip():
                continue
            values = line.split(',')
            # Everything between the user id and the trailing sex label is a
            # feature. The former `len(values) - 2` bound dropped the last one.
            wordsDataPerUser[values[0]] = values[1:-1]

    # Separate handle name so the `countersFile` parameter is not shadowed.
    with open(directory + countersFile) as countersHandle:
        for line in countersHandle:
            if not line.strip():
                continue
            values = line.split(',')
            # NOTE(review): the layout of the counters file is not visible
            # here, so the original slice is kept. If its rows are also
            # "user,c1,...,cn,sex", this drops the last counter - confirm.
            wordsDataPerUser[values[0]].extend(values[1:len(values) - 2])

    return wordsDataPerUser
                    
                    
        

In [27]:
# Per-user merged features (word features followed by counter features)
# for the training set.
trainingDataPerUser = mergeData(
    'sexwordstrainingdata.txt',
    'sextrainingdata.txt',
)

In [29]:
# Export the merged training features, one comma-separated row per user in
# the order of training.txt: user id, merged features, sex label.
with open(directory + 'sexmergetrainingdata.txt', 'w') as train, \
        open(directory + 'training.txt') as trainingFile:
    for line in trainingFile:
        user, country, sex = line.split(':::')
        userVector = trainingDataPerUser[user]
        # `sex` is the last field of the input line, so it still carries the
        # line terminator that ends the output row.
        train.write(user + ',' + ''.join(str(feature) + ',' for feature in userVector) + sex)

In [31]:
# Merge the test-set features and export them, one comma-separated row per
# user in the order of test.txt: user id, merged features, sex label.
testDataPerUser = mergeData('sexwordstestdata.txt', 'sextestdata.txt')
with open(directory + 'sexmergetestdata.txt', 'w') as train, \
        open(directory + 'test.txt') as trainingFile:
    for line in trainingFile:
        user, country, sex = line.split(':::')
        userVector = testDataPerUser[user]
        # `sex` is the last field of the input line, so it still carries the
        # line terminator that ends the output row.
        train.write(user + ',' + ''.join(str(feature) + ',' for feature in userVector) + sex)