In [1]:
import nltk
import sys
import re
import numpy
import pandas
from nltk.util import ngrams
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import QuadgramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.metrics import TrigramAssocMeasures
from nltk.metrics import QuadgramAssocMeasures
from nltk.sentiment.util import extract_unigram_feats
from nltk.sentiment.util import extract_bigram_feats
from nltk.corpus import stopwords
from pysentimiento.preprocessing import preprocess_tweet
from pysentimiento import analyzer
import json
import networkx as nx
import community as community_louvain
import networkx.algorithms.community.kernighan_lin as community_alg
import matplotlib.cm as cm
import matplotlib.pyplot as plt

In [2]:
def preprocess(doc):
    pdoc = preprocess_tweet(doc, shorten=2) #standarize retweets,urls,responses,laughs,emojis,etc
    pdoc = pdoc.lower() #lowercase
    pdoc = re.sub(r'[^\w\s]', '', pdoc) #remove punctuation
    pdoc = ' '.join([w for w in pdoc.split() if w not in stopwords])
    return pdoc

def remove_duplicates(x):
    return list(dict.fromkeys(x))

def get_ngrams(words, top_n, min_freq, option, measure):
    if (option==1):
        fdist = nltk.FreqDist(words)
        unigrams = []
        for (w,d) in fdist.most_common(top_n):
            unigrams.append(w)
        print("top "+str(top_n)+" unigrams:")
        print(unigrams)
        return unigrams
    if (option==2):
        bcf = BigramCollocationFinder.from_words(iter(words))
        bcf.apply_freq_filter(min_freq)
        bigrams = []
        if (measure=="pmi"):
            print ("Using pmi assoc. measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
        if (measure=="chi"):
            print ("Using chi-squared measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.chi_sq, top_n)
        if (measure=="lr"):
            print ("Using likelihood ratio assoc. measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.likelihood_ratio, top_n)
        if (measure=="ps"):
            print ("Using poisson stirling assoc. measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.poisson_stirling, top_n)
        if (measure=="j"):
            print ("Using jaccard assoc. measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.jaccard, top_n)
        if (measure=="st"):
            print ("Using student_t assoc. measure...")
            bigrams = bcf.nbest(BigramAssocMeasures.student_t, top_n)
        print("top "+str(top_n)+" bigrams:")
        print(bigrams)
        return bigrams
    if (option==3):
        tcf = TrigramCollocationFinder.from_words(iter(words))
        tcf.apply_freq_filter(min_freq)
        trigrams = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, top_n)
        print("top "+str(top_n)+" trigrams:")
        print(trigrams)
        return trigrams
    if (option==4):
        qcf = QuadgramCollocationFinder.from_words(iter(words))
        qcf.apply_freq_filter(min_freq)
        quadgrams = qcf.nbest(QuadgramAssocMeasures.likelihood_ratio, top_n)
        print("top "+str(top_n)+" quadgrams:")
        print(quadgrams)
        return quadgrams

def extract_ngram_feats(document, ngramss, option):
    features = {}
    for ngram in ngramss:
        if option==1:
            features['contains({0})'.format(ngram[0])] = ngram in document
        if option==2:
            features['contains({0} - {1})'.format(ngram[0], ngram[1])] = ngram in nltk.bigrams(document)
        if option==3:
            features['contains({0} - {1} - {2})'.format(ngram[0], ngram[1], ngram[2])] = ngram in nltk.trigrams(document)
        if option==4:
            features['contains({0} - {1} - {2} - {3})'.format(ngram[0], ngram[1], ngram[2], ngram[3])] = ngram in ngrams(document, 4)
    return features

def create_graph(words, option, top_n,measure):
    # Extract ngram features
    ngramss,matrix = create_matrix(words,option, top_n,measure)
    G = nx.Graph()
    for d in range(len(documents)):
        G.add_node(d,tweet=documents[d], ptweet=documents_processed[d])
    for d1 in range(len(documents)):
        for d2 in range(len(documents)):
            for ngram in ngramss:
                feat_key = ""
                if option==1:
                    feat_key = "contains("+ngram[0]+")"
                if option==2:
                    feat_key = "contains("+ngram[0]+" - "+ngram[1]+")"
                if option==3:
                    feat_key = "contains("+ngram[0]+" - "+ngram[1]+" - "+ngram[2]+")"
                if option==4:
                    feat_key = "contains("+ngram[0]+" - "+ngram[1]+" - "+ngram[2]+" - "+ngram[3]+")"
                if (matrix[d1][feat_key] and matrix[d2][feat_key]) and (d1!=d2):
                    oldEdge = (d1,d2) if (d1,d2) in G.edges else None
                    if oldEdge==None:
                        G.add_edge(d1,d2,weight=1)
                    else:
                        G.adj[d1][d2]['weight'] += 1
    return G

def create_matrix(words,option,top_n,measure):
    ngramss = get_ngrams(words,top_n,1,option,measure)
    matrix = []
    for d in documents_processed:
        matrix.append(extract_ngram_feats(d.split(),ngramss,option))
    return ngramss,matrix;


def draw_graph(G,partition):
    pos = nx.spring_layout(G)
    #color the nodes according to their partition
    cmap = cm.get_cmap('viridis', max(partition.values()) + 1)
    nx.draw_networkx_nodes(G, pos, partition.keys(), node_size=10,cmap=cmap, node_color=list(partition.values()))
    nx.draw_networkx_edges(G, pos, alpha=0.5)
    nx.write_gexf(G, "test.gexf")
    plt.show()

Abrimos el dataset de tweets, eliminamos tweets duplicados, esto nos deja 1607 tweets.

In [3]:
f = open('tweets.json',)
data = json.load(f)
documents = remove_duplicates([d['text'] for d in data])
f.close()

Adaptamos los stopwords tradicionales para que no incluyan palabras que puedan ayudar a detectar sentimiento, y que a su vez incluyan tags innecesarios.

Luego preprocesamos los tweets, esto implica: 
 *estandarizar retweets, urls, respuestas, risas y emojis
 *pasar todo a minuscula
 *eliminar signos de puntuación
 *eliminar stopwords

In [4]:
exceptions = ['no','pero','muy','sin','y','ni','contra','tanto','ellas','nosotras','nuestra','nuestras']
stopwords = [x for x in stopwords.words('spanish') if x not in exceptions]
stopwords.append('rt')
stopwords.append('usuario')
stopwords.append('url')
stopwords.append('re')
stopwords.append('emoji')
documents_processed = [preprocess(d) for d in documents]

"Ejemplo de tweet original:"+ documents[0]
"Ejemplo de tweet procesado:"+documents_processed[0]
"Cantidad de tweets:"+ str(len (documents))
"Cantidad de tweets (procesados):"+ str(len (documents_processed))

'Cantidad de tweets (procesados):1607'

Separamos tweets a favor y en contra (los primeros 785 tweets están etiquetados en contra, y el resto a favor). Esto nos sirve para luego validar los resultados.

In [5]:
tweets_en_contra = documents[:785]
tweets_a_favor = documents[785:]

Tokenizamos todos los tweets

In [6]:
doc = ' '.join(documents_processed)
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(doc)

Creamos un grafo donde los nodos son los tweets y los arcos representan n-gramas en común. Mientras más haya en común el arco se hace linearmente mas pesado. Aquí hay tres argumentos importantes:
* n
* número de n-gramas a considerar como features de acuerdo a la métrica de asociación
* metrica de asociación

In [7]:
G = create_graph(words,2,20,"st") #1:unigrams,#2:bigrams,3:trigrams
"Cantidad de nodos:" + str(G.number_of_nodes())
"Cantidad de arcos:" + str(G.number_of_edges())

Using student_t assoc. measure...
top 20 bigrams:
[('aborto', 'legal'), ('no', 'aborto'), ('ala', 'vida'), ('si', 'ala'), ('vida', 'no'), ('banco', 'avilma'), ('salvemos', 'las2'), ('las2', 'vidas'), ('aborto', 'si'), ('y', 'gratuito'), ('corazón', 'verde'), ('sia', 'vida'), ('noal', 'aborto'), ('dorso', 'mano'), ('mano', 'índice'), ('educación', 'sexual'), ('hacia', 'abajo'), ('índice', 'hacia'), ('legal', 'seguro'), ('tono', 'piel')]
ngramss:20
dict_keys(['contains(aborto - legal)'])
dict_keys(['contains(aborto - legal)'])


KeyError: 'contains(no - aborto)'

Realizamos una partición del grafo utilizando el algoritmo de bisección de kernighan. Esto nos parte el grafo en exactamente 2 conjuntos.
Luego evaluamos la precisión de cada partición respecto a los tweets que sabemos que estan a favor y en contra respectivamente.

In [None]:
partition = community_alg.kernighan_lin_bisection(G,max_iter=100,weight='weight')

en_contra_p0 = 0
a_favor_p0 = 0
en_contra_p1 = 0
a_favor_p1= 0

for d in partition[0]:
    if (documents[d] in tweets_en_contra):
        en_contra_p0 += 1
    if (documents[d] in tweets_a_favor):
        a_favor_p0 += 1
for d in partition[1]:
    if (documents[d] in tweets_en_contra):
        en_contra_p1 += 1
    if (documents[d] in tweets_a_favor):
        a_favor_p1 += 1
    
print("particion 0:")
print("  tweets a favor:" + str(a_favor_p0) + "("+str((a_favor_p0/(a_favor_p0+en_contra_p0))*100)+"%)")
print("  tweets en contra:" + str(en_contra_p0) + "("+str((en_contra_p0/(a_favor_p0+en_contra_p0))*100)+"%)")
print("particion 1:")
print("  tweets a favor:" + str(a_favor_p1) + "("+str((a_favor_p1/(a_favor_p1+en_contra_p1))*100)+"%)")
print("  tweets en contra:" + str(en_contra_p1) + "("+str((en_contra_p1/(a_favor_p1+en_contra_p1))*100)+"%)")