# Proyecto: Clasificación de tópicos de interés

MA5203 - Aprendizaje de Máquinas Probabilístico

Profesor: Felipe Tobar

Autor: Martín Cornejo

## Paquetes necesarios

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pdb
import itertools
import operator

import pandas as pd

from nltk.stem.snowball import SpanishStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import label_propagation
from sklearn.model_selection import train_test_split

import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [2]:
datos=pd.read_csv('data_format.csv')
print(datos.shape)
datos.head()

(234, 2)


Unnamed: 0,Texto,Interes
0,"Ojalá obliguen a Piñera a cerrar Punta Peuco, ...",0
1,Piñera para crear base de apoyo moderada a su ...,1
2,@CNNChile MEMORIA 2014 Adimark: Piñera termina...,1
3,PPK y Piñera en privado habrían conversado alg...,0
4,Bachelet entregará el gobierno de Chile a Piñera,1


## Limpiando strings

### Reemplazar tildes, caracteres especiales, todo a minúsculas

In [3]:
strings = datos.ix[:,0]

def formatear(strings):
    tildes = ['á','é','í','ó','ú']
    vocales = ['a','e','i','o','u']

    # tildes
    for idx, vocal in enumerate(vocales):
        strings = strings.str.replace(tildes[idx],vocal)

    # caracteres especiales menos la ñ
    strings = strings.str.replace('[^a-zñA-Z ]', "")

    # todo a minusculas
    strings = pd.Series(list(map(lambda x: x.lower(), strings)))
    
    return strings

def oracionToStrArr(strings):
    strings_arr = list(map(lambda x: x.split(), strings))
    #pdb.set_trace()
    strings_arr = list(itertools.chain.from_iterable(strings_arr))    
    return strings_arr

print(formatear(strings).head())
formated_array_data = oracionToStrArr(formatear(strings))

0    ojala obliguen a piñera a cerrar punta peuco e...
1    piñera para crear base de apoyo moderada a su ...
2    cnnchile memoria  adimark piñera termina su go...
3    ppk y piñera en privado habrian conversado alg...
4     bachelet entregara el gobierno de chile a piñera
dtype: object


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


## Exportando data limpia para ser procesada por Stanford TMT

In [4]:
data_train, data_test = train_test_split(datos, test_size = 0.2, random_state = 4)

# pdb.set_trace()

data_limpia = formatear(data_train.ix[:,0])
data_limpia = pd.concat([data_limpia, pd.DataFrame(np.array(data_train.ix[:,1]))], axis=1)
data_limpia.head()
data_limpia.to_csv("data_limpia_train.csv", sep=';')


data_limpia_test = formatear(data_test.ix[:,0])
data_limpia_test = pd.concat([data_limpia_test, pd.DataFrame(np.array(data_test.ix[:,1]))], axis=1)
data_limpia_test.to_csv("data_limpia_test.csv", sep=';')

X_train = data_train.ix[:,0]
y_train = np.array(data_train.ix[:,1])

X_test = data_test.ix[:,0]
y_test = np.array(data_test.ix[:,1])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  # This is added back by InteractiveShellApp.init_path()


## Funciones auxiliares (conteo y ordenar repetidas)

In [5]:
def most_common(str_arr):
  # get an iterable of (item, iterable) pairs
  SL = sorted((x, i) for i, x in enumerate(str_arr))
  list_pairs = []
  #print('SL:', SL)
  groups = itertools.groupby(SL, key=operator.itemgetter(0))
    
  # auxiliary function to get "quality" for an item
  def _auxfun(g):
    item, iterable = g
    count = 0
    min_index = len(str_arr)
    for _, where in iterable:
      count += 1
      min_index = min(min_index, where)
    list_pairs.append((item, count))
    #print('item %r, count %r, minind %r' % (item, count, min_index))
    return count, -min_index

  return max(groups, key=_auxfun)[0], list_pairs

def aplicar_ordenar_str_arr(func, str_arr):
    arr = list(map(func, str_arr))
    common, pairs = most_common(arr)
    pares_filtrados = list(filter(lambda x: len(x[0]) > 3, pairs))
    common_sorted = sorted(pares_filtrados, key=lambda tup: tup[1], reverse=True)
    return common_sorted

## Separando data por clases

In [6]:
datos_interes = datos[datos.Interes == True]
#print(datos_interes.head())
str_interes = datos_interes.ix[:,0]

datos_no_interes = datos[datos.Interes == False]
#print(datos_no_interes.head())
str_no_interes = datos_no_interes.ix[:,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


## Stemming por clases

In [7]:
stemmer_es = lambda x: SpanishStemmer().stem(x)

stem_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(stemmer_es,oracionToStrArr(formatear(str_interes))))
stem_no_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(stemmer_es,oracionToStrArr(formatear(str_no_interes))))

stem_interes_ordenado

0          (piñer, 116)
1         (gobiern, 36)
2       (president, 36)
3            (chil, 28)
4        (bachelet, 27)
5         (sebasti, 26)
6            (esta, 13)
7            (pais, 12)
8            (asum, 11)
9            (mand, 11)
10           (años, 10)
11           (este, 10)
12          (mañan, 10)
13            (pued, 9)
14            (nuev, 8)
15         (reunion, 8)
16           (activ, 7)
17           (cambi, 7)
18          (derech, 7)
19          (doming, 7)
20          (gobern, 7)
21           (polit, 7)
22          (termin, 7)
23            (tien, 7)
24          (entreg, 6)
25           (macri, 6)
26           (mejor, 6)
27          (nuestr, 6)
28            (peuc, 6)
29            (punt, 6)
             ...       
725         (travaj, 1)
726        (tremend, 1)
727          (trist, 1)
728           (twet, 1)
729          (twitt, 1)
730        (twitter, 1)
731           (udse, 1)
732           (unas, 1)
733           (unic, 1)
734           (unos, 1)
735         (unt

In [8]:
stem_no_interes_ordenado

0          (piñer, 115)
1            (esta, 19)
2       (president, 14)
3            (ahor, 13)
4            (chil, 11)
5         (sebasti, 11)
6            (bien, 10)
7            (punt, 10)
8          (gobiern, 8)
9             (peuc, 8)
10            (tien, 8)
11        (bachelet, 7)
12            (cerr, 7)
13            (cobr, 7)
14        (comision, 7)
15           (cuand, 7)
16        (izquierd, 7)
17          (trabaj, 7)
18           (culia, 6)
19          (derech, 6)
20            (esto, 6)
21            (mism, 6)
22           (quier, 6)
23            (algo, 5)
24            (busc, 5)
25            (este, 5)
26            (falt, 5)
27            (gent, 5)
28           (grand, 5)
29          (nuestr, 5)
             ...       
777    (tramitacion, 1)
778          (tramp, 1)
779           (trav, 1)
780         (travaj, 1)
781        (tremend, 1)
782          (trump, 1)
783           (unic, 1)
784           (unid, 1)
785        (urresti, 1)
786          (usted, 1)
787        (vaca

## Lematización

In [9]:
def create_lemma_dict(filename):
   with open(filename, 'r') as document:
       lemma_dict = {}
       for line in document:
           if line.strip():  # avoid empty lines
               value, key = line.split(None, 1) # 'None' means 'all whitespace', which is the default
               key = key.rstrip() # rstrip() to get rid of \r and \n
               lemma_dict[key] = value # adding the flections as keys to the dict
               lemma_dict[value] = value # adding also the base word as a key
   return lemma_dict

def query_word(lemma_dict):
   word = input("\nDame una palabra en español -> ")
   try:
      lemma = lemma_dict[word]
      print("__your happy lemma is__: {}".format(lemma))
   except KeyError:
      print("This word is not in the dictionary!")
   return query_word(lemma_dict)

def lemmatiser(dict):
    def lookup(word):
        try:
            lemma = dict[word]
        except:
            lemma = word
        
        return lemma
    
    return lookup

## Lematizando por clases

In [10]:
resource_file = 'lemmatization-es.txt'
lemmatiser_es = lemmatiser(create_lemma_dict(resource_file))

#pdb.set_trace()

datos_interes = datos[datos.Interes == True]
#print(datos_interes.head())
str_interes = datos_interes.ix[:,0]

datos_no_interes = datos[datos.Interes == False]
#print(datos_no_interes.head())
str_no_interes = datos_no_interes.ix[:,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if sys.path[0] == '':


In [11]:
lema_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(lemmatiser_es, oracionToStrArr(formatear(str_interes))))
lema_no_interes_ordenado = pd.Series(aplicar_ordenar_str_arr(lemmatiser_es, oracionToStrArr(formatear(str_no_interes))))

lema_interes_ordenado

0          (piñera, 116)
1         (gobierno, 36)
2       (presidente, 33)
3            (chile, 28)
4             (este, 28)
5         (bachelet, 27)
6        (sebastian, 26)
7            (parir, 25)
8            (comer, 20)
9             (todo, 14)
10           (haber, 12)
11            (pais, 12)
12           (poder, 12)
13          (asumir, 11)
14           (tener, 11)
15           (hacer, 10)
16          (mañana, 10)
17            (sera, 10)
18            (mando, 9)
19             (pero, 9)
20           (cambio, 7)
21            (decir, 7)
22          (domingo, 7)
23           (entrar, 7)
24            (estar, 7)
25            (nuevo, 7)
26         (terminar, 7)
27            (tomar, 7)
28        (actividad, 6)
29            (deber, 6)
             ...        
830          (tweter, 1)
831         (twitter, 1)
832       (twitteros, 1)
833            (udse, 1)
834           (unico, 1)
835         (untongo, 1)
836         (urgente, 1)
837            (urna, 1)
838           (usach, 1)


In [12]:
lema_no_interes_ordenado

0         (piñera, 115)
1            (este, 32)
2           (parir, 23)
3           (comer, 15)
4           (haber, 15)
5           (ahora, 13)
6            (todo, 13)
7      (presidente, 12)
8           (chile, 11)
9       (sebastian, 11)
10          (tener, 11)
11           (bien, 10)
12           (hacer, 9)
13           (dejar, 8)
14        (gobierno, 8)
15           (peuco, 8)
16          (querer, 8)
17        (bachelet, 7)
18          (cerrar, 7)
19          (cuando, 7)
20           (decir, 7)
21            (mano, 7)
22           (punta, 7)
23           (votar, 7)
24           (cobre, 6)
25           (estar, 6)
26       (izquierdo, 6)
27           (mismo, 6)
28            (pero, 6)
29            (algo, 5)
             ...       
913         (traves, 1)
914       (tremendo, 1)
915          (trump, 1)
916          (unico, 1)
917           (unir, 1)
918        (urresti, 1)
919       (vacancia, 1)
920           (vaya, 1)
921       (ventilar, 1)
922       (viaticos, 1)
923           (v

## Tokenizing usando sklearn

In [13]:
stop_words_es = np.genfromtxt('stop_words_es.txt', dtype='str')
stop_words_es = formatear(pd.Series(stop_words_es))
stop_words_es = list(map(lambda x: x, stop_words_es))

In [14]:
tf_vectorizer = CountVectorizer(min_df=2, max_features=50, stop_words=stop_words_es)
data = formatear(strings)
dtm_tf = tf_vectorizer.fit_transform(data)
#pdb.set_trace()
#True

## Aplicando Latent Dirichlet Allocation

In [15]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        #pdb.set_trace()
        message += " ".join([feature_names[i] + " " + str(int(topic[i])) + " "
                             for i in topic.argsort()[:-n_top_words - 1:-1]])        
        print(message)
    print()

In [16]:
lda = LatentDirichletAllocation(n_components=2, max_iter=100,
                                learning_method='online',
                                learning_offset=10,
                                batch_size=10,
                                random_state=1)

In [17]:
n_top_words = 6
lda.fit(dtm_tf)
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: piñera 97  chile 38  presidente 37  sebastian 35  pais 17  bachelet 14 
Topic #1: piñera 134  gobierno 44  bachelet 22  todos 13  hay 11  izquierda 11 



In [18]:
pyLDAvis.sklearn.prepare(lda, dtm_tf, tf_vectorizer)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


## Obteniendo las frecuencias

In [19]:
tf_vectorizer = CountVectorizer(stop_words=stop_words_es)
dtm_tf = tf_vectorizer.fit_transform(formatear(X_train))
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(dtm_tf)
X_train_tfidf.shape

(187, 1525)

## Entrenando con Naive Bayes

In [20]:
clf = BernoulliNB(fit_prior=False).fit(X_train_tfidf, y_train)

## Predicción del modelo

In [21]:
# Se extraen las características de los nuevos tweets
dtm_tf_test = tf_vectorizer.transform(formatear(X_test))
X_new_tfidf_test = tfidf_transformer.transform(dtm_tf_test)

predicted = clf.predict(X_new_tfidf_test)
#for tweet, prediction in zip(data_test, predicted):
#    print('%r => %s' % (tweet, str(prediction)))

np.mean(predicted == y_test)

0.7446808510638298

## Entrenando con SVM

In [22]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stop_words_es)),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42,
                                            max_iter=5, tol=None))])
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['a', 'aca'...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [23]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.723404255319149

## Resultados de Labeled LDA (Stanford Topic Modeling Toolbox)

In [56]:
data_test = pd.read_csv('data_limpia_test.csv')
inferencia = pd.read_csv('data_infered.csv')

inferencia = inferencia.ix[:,1]
inferencia = list(map(lambda x: 1 if x >=0.5 else 0, inferencia))

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.


In [57]:
correct_cases = 0

for idx, _ in enumerate(inferencia):
    correct_cases += 1 if (inferencia[idx] == data_test.ix[:,2][idx]) else 0
        
print('El accuracy es {0:.2f}'.format(correct_cases/len(inferencia)))

El accuracy es 0.66


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.
