# Proyecto Final: Clasificación de tópicos de interés

## Exploración de datos


CC5113 - Aprendizaje Automático Bayesiano

Profesor: Pablo Guerrero

Autor: Martín Cornejo

## Paquetes necesarios

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pdb
import itertools
import operator

import pandas as pd
from nltk.stem.snowball import SpanishStemmer

In [2]:
#data = np.genfromtxt('data_format.csv', delimiter='')
datos=pd.read_csv('data_format.csv')
print(datos.shape)
#pdb.set_trace()
datos.head()

(138, 2)


Unnamed: 0,Texto,Interes
0,"Ojalá obliguen a Piñera a cerrar Punta Peuco, ...",False
1,Piñera para crear base de apoyo moderada a su ...,True
2,@CNNChile MEMORIA 2014 Adimark: Piñera termina...,True
3,PPK y Piñera en privado habrían conversado alg...,False
4,Bachelet entregará el gobierno de Chile a Piñera,True


## Limpiando strings

### Reemplazar tildes, caracteres especiales, todo a minúsculas

In [69]:
strings = datos.ix[:,0]

def formatear(strings):
    tildes = ['á','é','í','ó','ú']
    vocales = ['a','e','i','o','u']

    # tildes
    for idx, vocal in enumerate(vocales):
        strings = strings.str.replace(tildes[idx],vocal)

    # caracteres especiales menos la ñ
    strings = strings.str.replace('[^a-zñA-Z ]', "")

    # todo a minusculas
    strings = pd.Series(list(map(lambda x: x.lower(), strings)))
    
    return strings

def oracionToStrArr(strings):
    strings_arr = list(map(lambda x: x.split(), strings))
    strings_arr = list(itertools.chain.from_iterable(strings_arr))    
    return strings_arr

print(formatear(strings).head())
formated_array_data = oracionToStrArr(formatear(strings))

0    ojala obliguen a piñera a cerrar punta peuco e...
1    piñera para crear base de apoyo moderada a su ...
2    cnnchile memoria  adimark piñera termina su go...
3    ppk y piñera en privado habrian conversado alg...
4     bachelet entregara el gobierno de chile a piñera
dtype: object


## Stemming

In [73]:
stemmer = SpanishStemmer()

root_arr = list(map(lambda x: stemmer.stem(x), formated_array_data))

def most_common(str_arr):
  # get an iterable of (item, iterable) pairs
  SL = sorted((x, i) for i, x in enumerate(str_arr))
  list_pairs = []
  #print('SL:', SL)
  groups = itertools.groupby(SL, key=operator.itemgetter(0))
    
  # auxiliary function to get "quality" for an item
  def _auxfun(g):
    item, iterable = g
    count = 0
    min_index = len(str_arr)
    for _, where in iterable:
      count += 1
      min_index = min(min_index, where)
    list_pairs.append((item, count))
    #print('item %r, count %r, minind %r' % (item, count, min_index))
    return count, -min_index

  return max(groups, key=_auxfun)[0], list_pairs

stem_common, list_pairs = most_common(root_arr)

pares_filtrados = list(filter(lambda x: len(x[0]) > 2, list_pairs))

common_roots_sorted = pd.Series(sorted(pares_filtrados, key=lambda tup: tup[1], reverse=True))
common_roots_sorted.head()

0     (piñer, 133)
1        (que, 65)
2        (con, 32)
3    (sebasti, 30)
4        (por, 27)
dtype: object

## Lematización

In [75]:
def create_lemma_dict(filename):
   with open(filename, 'r') as document:
       lemma_dict = {}
       for line in document:
           if line.strip():  # avoid empty lines
               value, key = line.split(None, 1) # 'None' means 'all whitespace', which is the default
               key = key.rstrip() # rstrip() to get rid of \r and \n
               lemma_dict[key] = value # adding the flections as keys to the dict
               lemma_dict[value] = value # adding also the base word as a key
   return lemma_dict

def query_word(lemma_dict):
   word = input("\nDame una palabra en español -> ")
   try:
      lemma = lemma_dict[word]
      print("__your happy lemma is__: {}".format(lemma))
   except KeyError:
      print("This word is not in the dictionary!")
   return query_word(lemma_dict)

def lemmatiser(dict):
    def lookup(word):
        try:
            lemma = dict[word]
        except:
            lemma = word
        
        return lemma
    
    return lookup

resource_file = 'lemmatization-es.txt'
lemmatiser_es = lemmatiser(create_lemma_dict(resource_file))
#pdb.set_trace()

def lematizar_ordenar_str_arr(str_arr):
    lemma_arr = list(map(lemmatiser_es, str_arr))
    common_lemma, pairs_lemma = most_common(lemma_arr)
    pares_filtrados = list(filter(lambda x: len(x[0]) > 3, pairs_lemma))
    common_lemma_sorted = sorted(pares_filtrados, key=lambda tup: tup[1], reverse=True)
    return common_lemma_sorted

pd.Series(lematizar_ordenar_str_arr(formated_array_data)).head()

0       (piñera, 133)
1     (sebastian, 30)
2         (chile, 24)
3          (este, 24)
4    (presidente, 24)
dtype: object

### Lematizando por clases

In [60]:
datos_interes = datos[datos.Interes == True]
#print(datos_interes.head())
str_interes = datos_interes.ix[:,0]

datos_no_interes = datos[datos.Interes == False]
#print(datos_no_interes.head())
str_no_interes = datos_no_interes.ix[:,0]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  import sys


In [80]:
lema_interes_ordenado = pd.Series(lematizar_ordenar_str_arr(oracionToStrArr(formatear(str_interes))))
lema_no_interes_ordenado = pd.Series(lematizar_ordenar_str_arr(oracionToStrArr(formatear(str_no_interes))))

lema_interes_ordenado

0           (piñera, 75)
1       (presidente, 23)
2        (sebastian, 23)
3         (bachelet, 20)
4            (chile, 20)
5             (este, 15)
6         (gobierno, 15)
7            (comer, 13)
8            (parir, 13)
9             (mando, 9)
10        (actividad, 6)
11           (asumir, 6)
12           (cambio, 6)
13          (domingo, 6)
14            (macri, 6)
15           (mañana, 6)
16            (nuevo, 6)
17             (pais, 6)
18          (reunion, 6)
19            (tener, 6)
20             (todo, 6)
21         (asuncion, 5)
22            (haber, 5)
23           (llegar, 5)
24            (poder, 5)
25             (sera, 5)
26            (ahora, 4)
27          (chileno, 4)
28            (decir, 4)
29          (derecho, 4)
             ...        
435      (suramerica, 1)
436         (tambien, 1)
437           (tanto, 1)
438    (tempranisimo, 1)
439          (tendra, 1)
440           (tirar, 1)
441         (titeres, 1)
442         (todavia, 1)
443           (toser, 1)


In [81]:
lema_no_interes_ordenado

0          (piñera, 58)
1             (este, 9)
2        (sebastian, 7)
3             (bien, 6)
4            (comer, 6)
5           (cuando, 6)
6            (haber, 6)
7            (peuco, 6)
8           (cerrar, 5)
9            (punta, 5)
10           (ahora, 4)
11           (chile, 4)
12           (ganar, 4)
13           (parir, 4)
14          (querer, 4)
15        (academia, 3)
16            (algo, 3)
17        (bachelet, 3)
18          (cierre, 3)
19           (decir, 3)
20           (dejar, 3)
21     (diplomatica, 3)
22           (estar, 3)
23          (llegar, 3)
24           (penal, 3)
25           (piera, 3)
26           (tener, 3)
27           (votar, 3)
28          (abogar, 2)
29           (abrir, 2)
             ...       
346        (tampoco, 1)
347          (tanto, 1)
348           (tele, 1)
349         (tiempo, 1)
350          (tirar, 1)
351          (tonda, 1)
352          (toque, 1)
353          (total, 1)
354       (trabajar, 1)
355        (traidor, 1)
356         (tra