# Proyecto: Clasificación de tópicos de interés

## Entrega final


CC5113 - Aprendizaje Automático Bayesiano

Profesor: Pablo Guerrero

Autor: Martín Cornejo

## Paquetes necesarios

In [1]:
import numpy as np
import pdb
import itertools
import operator

import pandas as pd

from nltk.stem.snowball import SpanishStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import label_propagation
from sklearn.model_selection import train_test_split

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
datos=pd.read_csv('data_format.csv')
print(datos.shape)
datos.head()

(185, 2)


Unnamed: 0,Texto,Interes
0,"Ojalá obliguen a Piñera a cerrar Punta Peuco, ...",0
1,Piñera para crear base de apoyo moderada a su ...,1
2,@CNNChile MEMORIA 2014 Adimark: Piñera termina...,1
3,PPK y Piñera en privado habrían conversado alg...,0
4,Bachelet entregará el gobierno de Chile a Piñera,1


## Limpiando strings

### Reemplazar tildes, caracteres especiales, todo a minúsculas

In [3]:
strings = datos.ix[:,0]

def formatear(strings):
    tildes = ['á','é','í','ó','ú']
    vocales = ['a','e','i','o','u']

    # tildes
    for idx, vocal in enumerate(vocales):
        strings = strings.str.replace(tildes[idx],vocal)

    # caracteres especiales menos la ñ
    strings = strings.str.replace('[^a-zñA-Z ]', "")

    # todo a minusculas
    strings = pd.Series(list(map(lambda x: x.lower(), strings)))
    
    return strings

def oracionToStrArr(strings):
    strings_arr = list(map(lambda x: x.split(), strings))
    #pdb.set_trace()
    strings_arr = list(itertools.chain.from_iterable(strings_arr))    
    return strings_arr

print(formatear(strings).head())
formated_array_data = oracionToStrArr(formatear(strings))

0    ojala obliguen a piñera a cerrar punta peuco e...
1    piñera para crear base de apoyo moderada a su ...
2    cnnchile memoria  adimark piñera termina su go...
3    ppk y piñera en privado habrian conversado alg...
4     bachelet entregara el gobierno de chile a piñera
dtype: object


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [4]:
data_limpia = formatear(strings)
data_limpia = pd.concat([data_limpia, datos.ix[:,1]], axis=1)
data_limpia.head()
data_limpia.to_csv("data_limpia.csv", sep='\t')
datos_sin_et = pd.read_csv('data_format_unlabeled.csv')
formatear(datos_sin_et.ix[:,0]).to_csv("data_limpia_unlabeled.csv", sep='\t')

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


## Funciones auxiliares (conteo y ordenar repetidas)

In [5]:
def most_common(str_arr):
  # get an iterable of (item, iterable) pairs
  SL = sorted((x, i) for i, x in enumerate(str_arr))
  list_pairs = []
  #print('SL:', SL)
  groups = itertools.groupby(SL, key=operator.itemgetter(0))
    
  # auxiliary function to get "quality" for an item
  def _auxfun(g):
    item, iterable = g
    count = 0
    min_index = len(str_arr)
    for _, where in iterable:
      count += 1
      min_index = min(min_index, where)
    list_pairs.append((item, count))
    #print('item %r, count %r, minind %r' % (item, count, min_index))
    return count, -min_index

  return max(groups, key=_auxfun)[0], list_pairs

def aplicar_ordenar_str_arr(func, str_arr):
    arr = list(map(func, str_arr))
    common, pairs = most_common(arr)
    pares_filtrados = list(filter(lambda x: len(x[0]) > 3, pairs))
    common_sorted = sorted(pares_filtrados, key=lambda tup: tup[1], reverse=True)
    return common_sorted

## Separando data por clases

In [6]:
datos_interes = datos[datos.Interes == True]
#print(datos_interes.head())
str_interes = datos_interes.ix[:,0]

datos_no_interes = datos[datos.Interes == False]
#print(datos_no_interes.head())
str_no_interes = datos_no_interes.ix[:,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


## Stemming por clases

In [7]:
stemmer_es = lambda x: SpanishStemmer().stem(x)

stem_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(stemmer_es,oracionToStrArr(formatear(str_interes))))
stem_no_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(stemmer_es,oracionToStrArr(formatear(str_no_interes))))

stem_interes_ordenado

0           (piñer, 95)
1       (president, 31)
2         (sebasti, 26)
3         (gobiern, 25)
4            (chil, 24)
5        (bachelet, 21)
6            (este, 10)
7             (mand, 9)
8            (mañan, 9)
9             (pais, 9)
10            (esta, 8)
11         (reunion, 8)
12            (asum, 7)
13            (años, 7)
14          (doming, 7)
15           (activ, 6)
16           (cambi, 6)
17          (derech, 6)
18           (macri, 6)
19            (nuev, 6)
20            (tien, 6)
21        (asuncion, 5)
22          (chilen, 5)
23           (cierr, 5)
24          (gobern, 5)
25            (lleg, 5)
26           (polit, 5)
27            (pued, 5)
28          (termin, 5)
29            (ahor, 4)
             ...       
544         (todavi, 1)
545           (trag, 1)
546         (tramit, 1)
547    (transparent, 1)
548           (tras, 1)
549      (trasandin, 1)
550         (travaj, 1)
551          (trist, 1)
552           (twet, 1)
553          (twitt, 1)
554        (twit

In [8]:
stem_no_interes_ordenado

0               (piñer, 88)
1                (esta, 11)
2                (ahor, 10)
3                (punt, 10)
4              (sebasti, 9)
5                 (bien, 8)
6                 (peuc, 8)
7             (bachelet, 7)
8                 (cerr, 7)
9                 (cobr, 7)
10           (president, 7)
11                (chil, 6)
12               (cuand, 6)
13                (algo, 5)
14               (culia, 5)
15                (esto, 5)
16               (quier, 5)
17               (cierr, 4)
18                (este, 4)
19             (gobiern, 4)
20              (ladron, 4)
21                (mism, 4)
22                (pier, 4)
23                (tien, 4)
24                (trat, 4)
25             (academi, 3)
26                (buen, 3)
27     (cierrepuntapeuc, 3)
28               (cumpl, 3)
29              (derech, 3)
               ...         
544              (sujet, 1)
545             (tampoc, 1)
546               (tant, 1)
547              (tendr, 1)
548             (ter

## Lematización

In [9]:
def create_lemma_dict(filename):
   with open(filename, 'r') as document:
       lemma_dict = {}
       for line in document:
           if line.strip():  # avoid empty lines
               value, key = line.split(None, 1) # 'None' means 'all whitespace', which is the default
               key = key.rstrip() # rstrip() to get rid of \r and \n
               lemma_dict[key] = value # adding the flections as keys to the dict
               lemma_dict[value] = value # adding also the base word as a key
   return lemma_dict

def query_word(lemma_dict):
   word = input("\nDame una palabra en español -> ")
   try:
      lemma = lemma_dict[word]
      print("__your happy lemma is__: {}".format(lemma))
   except KeyError:
      print("This word is not in the dictionary!")
   return query_word(lemma_dict)

def lemmatiser(dict):
    def lookup(word):
        try:
            lemma = dict[word]
        except:
            lemma = word
        
        return lemma
    
    return lookup

## Lematizando por clases

In [10]:
resource_file = 'lemmatization-es.txt'
lemmatiser_es = lemmatiser(create_lemma_dict(resource_file))

#pdb.set_trace()

datos_interes = datos[datos.Interes == True]
#print(datos_interes.head())
str_interes = datos_interes.ix[:,0]

datos_no_interes = datos[datos.Interes == False]
#print(datos_no_interes.head())
str_no_interes = datos_no_interes.ix[:,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


In [11]:
lema_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(lemmatiser_es, oracionToStrArr(formatear(str_interes))))
lema_no_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(lemmatiser_es, oracionToStrArr(formatear(str_no_interes))))

lema_interes_ordenado

0           (piñera, 95)
1       (presidente, 29)
2         (gobierno, 26)
3        (sebastian, 26)
4            (chile, 24)
5             (este, 22)
6         (bachelet, 21)
7            (parir, 18)
8            (comer, 17)
9             (todo, 10)
10            (hacer, 9)
11            (mando, 9)
12           (mañana, 9)
13             (pais, 9)
14            (tener, 9)
15            (poder, 8)
16           (asumir, 7)
17          (domingo, 7)
18            (tomar, 7)
19        (actividad, 6)
20           (cambio, 6)
21           (entrar, 6)
22            (haber, 6)
23            (macri, 6)
24            (nuevo, 6)
25          (reunion, 6)
26             (sera, 6)
27         (asuncion, 5)
28          (chileno, 5)
29            (deber, 5)
             ...        
622        (tramitar, 1)
623    (transparente, 1)
624            (tras, 1)
625      (trasandino, 1)
626         (travajo, 1)
627          (triste, 1)
628          (tweter, 1)
629         (twitter, 1)
630       (twitteros, 1)


In [12]:
lema_no_interes_ordenado

0         (piñera, 88)
1           (este, 21)
2          (ahora, 10)
3          (haber, 10)
4          (parir, 10)
5       (sebastian, 9)
6            (bien, 8)
7           (comer, 8)
8           (peuco, 8)
9            (todo, 8)
10       (bachelet, 7)
11         (cerrar, 7)
12          (decir, 7)
13           (mano, 7)
14          (punta, 7)
15          (chile, 6)
16          (cobre, 6)
17         (cuando, 6)
18          (dejar, 6)
19     (presidente, 6)
20         (querer, 6)
21          (tener, 6)
22           (algo, 5)
23          (estar, 5)
24          (hacer, 5)
25         (culiao, 4)
26          (ganar, 4)
27       (gobierno, 4)
28          (menos, 4)
29          (mismo, 4)
            ...       
642         (total, 1)
643      (trabajar, 1)
644       (traidor, 1)
645        (trampa, 1)
646         (trato, 1)
647       (travajo, 1)
648      (tremendo, 1)
649         (trump, 1)
650       (twitter, 1)
651       (ultimar, 1)
652      (vacancia, 1)
653          (vaya, 1)
654      (v

## Tokenizing usando sklearn

In [13]:
stop_words_es = np.genfromtxt('stop_words_es.txt', dtype='str')
stop_words_es = formatear(pd.Series(stop_words_es))
stop_words_es = list(map(lambda x: x, stop_words_es))

In [14]:
tf_vectorizer = CountVectorizer(min_df=2, max_features=50, stop_words=stop_words_es)
data = formatear(strings)
dtm_tf = tf_vectorizer.fit_transform(data)
#pdb.set_trace()
#True

## Aplicando Latent Dirichlet Allocation

In [15]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        #pdb.set_trace()
        message += " ".join([feature_names[i] + " " + str(int(topic[i])) + " "
                             for i in topic.argsort()[:-n_top_words - 1:-1]])        
        print(message)
    print()

In [16]:
lda = LatentDirichletAllocation(n_components=2, max_iter=100,
                                learning_method='online',
                                learning_offset=10,
                                batch_size=10,
                                random_state=1)

In [17]:
n_top_words = 6
lda.fit(dtm_tf)
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: piñera 104  gobierno 28  bachelet 28  peuco 12  punta 11  años 8 
Topic #1: piñera 80  sebastian 34  presidente 31  chile 29  pais 13  da 11 



In [18]:
pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


## Obteniendo las frecuencias

In [19]:
X_train, X_test, y_train, y_test = train_test_split(datos.ix[:,0], datos.ix[:,1], test_size = 0.25, random_state = 4)

tf_vectorizer = CountVectorizer(stop_words=stop_words_es)
dtm_tf = tf_vectorizer.fit_transform(formatear(X_train))
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(dtm_tf)
X_train_tfidf.shape

(138, 1057)

## Entrenando con Naive Bayes

In [20]:
clf = BernoulliNB(fit_prior=False).fit(X_train_tfidf, y_train)

## Predicción del modelo

In [21]:
# new_tweets = ['Viva Piñera', 'Creo que Piñera no tendrá buen desempeño en su nuevo gobierno', 'Piñera y Bachelet lideran gobiernos', 'Piñera y puntapeuco no se rinden']

# Se extraen las características de los nuevos tweets
dtm_tf_test = tf_vectorizer.transform(formatear(X_test))
X_new_tfidf_test = tfidf_transformer.transform(dtm_tf_test)

predicted = clf.predict(X_new_tfidf_test)
#for tweet, prediction in zip(data_test, predicted):
#    print('%r => %s' % (tweet, str(prediction)))

np.mean(predicted == y_test)

0.6595744680851063

## Entrenando con SVM

In [22]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words_es)),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42,
                                            max_iter=5, tol=None))])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['a', 'aca'...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [23]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.8085106382978723

## Modelo Semi-supervisado

### Agregando 40 datos sin etiquetar

In [47]:
datos_semi=pd.read_csv('data_format_unlabeled.csv')
# X_train_semi, X_test_semi, y_train_semi, y_test_semi = train_test_split(datos_semi.ix[:,0], datos_semi.ix[:,1], test_size = 0.1, random_state = 4)
X_train_s = datos_semi[(datos_semi.Interes == -1)]

X_train_semi = pd.DataFrame(np.concatenate((X_train, X_train_s.ix[:,0]))).ix[:,0]
y_train_semi = np.concatenate((y_train, X_train_s.ix[:,1]))

tf_vectorizer_semi = CountVectorizer(stop_words=stop_words_es)
dtm_tf_semi = tf_vectorizer_semi.fit_transform(formatear(X_train_semi))
tfidf_transformer_semi = TfidfTransformer()
X_train_tfidf_semi = tfidf_transformer_semi.fit_transform(dtm_tf_semi)

dtm_tf_test_semi = tf_vectorizer_semi.transform(formatear(X_test))
X_new_tfidf_test_semi = tfidf_transformer_semi.transform(dtm_tf_test_semi)

text_clf_semi = label_propagation.LabelSpreading()
text_clf_semi.fit(X_train_tfidf_semi.toarray(), y_train_semi)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_jobs=1,
        n_neighbors=7, tol=0.001)

In [48]:
predicted_semi = text_clf_semi.predict(X_new_tfidf_test_semi)
np.mean(predicted_semi == y_test)

0.7446808510638298

### Agregando 80 datos sin etiquetar

In [57]:
datos_semi=pd.read_csv('data_format_unlabeled_extendida.csv')
# X_train_semi, X_test_semi, y_train_semi, y_test_semi = train_test_split(datos_semi.ix[:,0], datos_semi.ix[:,1], test_size = 0.1, random_state = 4)
X_train_s = datos_semi[(datos_semi.Interes == -1)]

X_train_semi = pd.DataFrame(np.concatenate((X_train, X_train_s.ix[:,0]))).ix[:,0]
y_train_semi = np.concatenate((y_train, X_train_s.ix[:,1]))

tf_vectorizer_semi = CountVectorizer(stop_words=stop_words_es)
dtm_tf_semi = tf_vectorizer_semi.fit_transform(formatear(X_train_semi))
tfidf_transformer_semi = TfidfTransformer()
X_train_tfidf_semi = tfidf_transformer_semi.fit_transform(dtm_tf_semi)

dtm_tf_test_semi = tf_vectorizer_semi.transform(formatear(X_test))
X_new_tfidf_test_semi = tfidf_transformer_semi.transform(dtm_tf_test_semi)

text_clf_semi = label_propagation.LabelSpreading()
text_clf_semi.fit(X_train_tfidf_semi.toarray(), y_train_semi)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_jobs=1,
        n_neighbors=7, tol=0.001)

In [58]:
predicted_semi = text_clf_semi.predict(X_new_tfidf_test_semi)
np.mean(predicted_semi == y_test)

0.723404255319149

### Agregando 1 dato sin etiquetar
El motivo de agregar solo 1 dato es en realidad observar como predice el modelo de propagacio de etiquetas solo con los datos etiquetados, para tener una referencia

In [54]:
datos_semi=pd.read_csv('data_format_unlabeled_reducida.csv')
# X_train_semi, X_test_semi, y_train_semi, y_test_semi = train_test_split(datos_semi.ix[:,0], datos_semi.ix[:,1], test_size = 0.1, random_state = 4)
X_train_s = datos_semi[(datos_semi.Interes == -1)]

X_train_semi = pd.DataFrame(np.concatenate((X_train, X_train_s.ix[:,0]))).ix[:,0]
y_train_semi = np.concatenate((y_train, X_train_s.ix[:,1]))

tf_vectorizer_semi = CountVectorizer(stop_words=stop_words_es)
dtm_tf_semi = tf_vectorizer_semi.fit_transform(formatear(X_train_semi))
tfidf_transformer_semi = TfidfTransformer()
X_train_tfidf_semi = tfidf_transformer_semi.fit_transform(dtm_tf_semi)

dtm_tf_test_semi = tf_vectorizer_semi.transform(formatear(X_test))
X_new_tfidf_test_semi = tfidf_transformer_semi.transform(dtm_tf_test_semi)

text_clf_semi = label_propagation.LabelSpreading()
text_clf_semi.fit(X_train_tfidf_semi.toarray(), y_train_semi)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


LabelSpreading(alpha=0.2, gamma=20, kernel='rbf', max_iter=30, n_jobs=1,
        n_neighbors=7, tol=0.001)

In [55]:
predicted_semi = text_clf_semi.predict(X_new_tfidf_test_semi)
np.mean(predicted_semi == y_test)

0.7021276595744681