# TP6 - Procesamiento de Lenguaje Natural

Imports

In [1]:
import pandas as pd
import scipy.io
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style("darkgrid")
import scipy
import itertools
import random
import math
import operator
import re
import xml.etree.ElementTree as ET
from lxml import etree
from collections import Counter
sns.plt = plt

## 1 - Asociacion de palabras
1.1 Levantar el corpus AP, separando cada noticia como un elemento distinto en un diccionario ( < doc_no > : < text > ).

Librerias necesarias: html5lib, lxml, bs4, re

In [19]:
ap_xml_data = open('data/ap.txt').read()

def xml2df(xml_data):
    root = ET.XML(xml_data, parser=etree.XMLParser(recover=True)) # element tree
    all_records = []
    for i, child in enumerate(root):
        record = {}
        for subchild in child:
            if subchild.tag=='DOCNO':
                record[subchild.tag] = subchild.text
            else:
                record[subchild.tag] = re.sub('[^a-zA-Z*]', ' ', subchild.text).lower()
            # Aquella regex elimina todo lo que no son letras.
            # Para aceptar numeros, puntos y guiones: [^a-zA-Z0-9-_*.]
            all_records.append(record)
    return pd.DataFrame(all_records).drop_duplicates().reset_index(drop=True)

ap_df = xml2df(ap_xml_data)

print "AP cargado."
print "Resumen de los primeros 10 documentos:"
print ap_df[0:10]

AP cargado.
Resumen de los primeros 10 documentos:
             DOCNO                                               TEXT
0   AP881218-0003     a    year old student at a private baptist s...
1   AP880224-0195     the bechtel group inc  offered in      to se...
2   AP881017-0144     a gunman took a    year old woman hostage af...
3   AP881017-0219     today is saturday  oct      the    rd day of...
4   AP900117-0022     cupid has a new message for lovers this vale...
5   AP880405-0167     the reagan administration is weighing whethe...
6   AP880825-0239     more than         skins of a protected speci...
7   AP880325-0232     there will be no organized union boost behin...
8   AP880908-0056     here is a summary of developments in forest ...
9   AP881105-0097     jean pierre stirbois  the no    man in the e...


1.2 Calcular el tamano del vocabulario.

In [20]:
freq_vocab = dict()
for index, row in ap_df.iterrows():
    for word in row[1].split():
        freq_vocab[word] = freq_vocab.get(word, 0) + 1

In [21]:
print "El vocabulario tiene: " + str(len(freq_vocab)) + " palabras."

El vocabulario tiene: 33841 palabras.


1.3 Para las 500 palabras con mas apariciones, calcular el par mas asociado segun la medida presentada.

In [22]:
max_freq_vocab = dict(sorted(freq_vocab.iteritems(), key=operator.itemgetter(1), reverse=True)[:500])
print "Se obtuvieron las " + str(len(max_freq_vocab)) + " palabras con mas apariciones."

Se obtuvieron las 500 palabras con mas apariciones.


Sean X,Y palabras, N la cantidad de palabras de todos los textos, W la ventana de co-ocurrencia:

f(X)=occurs(X). f(X,Y)=occurs(Y despues de X, a distancia <= W)/(W-1).

P(X)=f(X)/N. P(X,Y)=f(X,Y)/N.

I(X,Y)=log2(P(X,Y) / (P(X) x P(Y)))=log2( ( f(X,Y) x N ) / ( f(X) x f(Y) ))

In [33]:
def calculate_mutual_association(txts, relevant_words, W):
    N=0
    mutual_assoc = dict()
    for txt in txts:
        words = txt.split()
        N = N + len(words)
        for i in range(len(words)-W):
            word = words[i]
            if word in relevant_words:
                for j in range(1,W):
                    other = words[i+j]
                    mutual_assoc[word] = mutual_assoc.get(word, dict())
                    mutual_assoc[word][other] = mutual_assoc[word].get(other, 0) + 1
    for word, freqs in mutual_assoc.iteritems():
        for other, freq in freqs.iteritems():
            fxy = float(freq / (W-1))
            fx = float(freq_vocab[word])
            fy = float(freq_vocab[other])
            if fxy > 0:
                mutual_assoc[word][other] = math.log((fxy*N)/(fx*fy),2)
            else:
                mutual_assoc[word][other] = 0
    return mutual_assoc

In [34]:
mutual_assoc = calculate_mutual_association(ap_df.iloc[:,1], max_freq_vocab.keys(), 5)

Ejemplo: 10 palabras mas asociadas a "whose"

In [37]:
sorted(mutual_assoc["whose"].iteritems(), key=operator.itemgetter(1), reverse=True)[:10]

[('name', 5.3744053437100066),
 ('includes', 5.050866198916659),
 ('body', 4.708474001469581),
 ('parents', 4.668396562094245),
 ('son', 4.47451682850021),
 ('include', 4.164734163474938),
 ('included', 4.110028271384802),
 ('district', 3.734099809145075),
 ('members', 3.7110161960320336),
 ('found', 3.047649486233116)]

Respuesta: las palabras mas asociadas a cada una de las 500 que mas aparecen:

In [36]:
for word, freqs in mutual_assoc.items():
    print "(" + word + "," + str(max(freqs.iteritems(), key=operator.itemgetter(1))[0]) + ")"

(all,sudden)
(dollar,midmorning)
(month,extension)
(four,networks)
(go,beyond)
(children,adults)
(issues,outnumbered)
(whose,name)
(thursday,zurich)
(to,microcom)
(program,vremya)
(th,anniversary)
(under,auspices)
(must,submit)
(street,gainers)
(outside,womb)
(far,reaching)
(every,palestinian)
(condition,anonymity)
(school,dances)
(did,elaborate)
(companies,considered)
(wednesday,night)
(small,caliber)
(says,leonard)
(leaders,issued)
(past,decade)
(talks,brussels)
(rate,mortgages)
(cost,taxpayers)
(n,y)
(even,though)
(index,arbitrage)
(what,happening)
(business,machines)
(near,border)
(spokesman,gennady)
(capital,gains)
(new,jersey)
(order,until)
(public,defender)
(told,reporters)
(friday,night)
(led,multinational)
(exchange,index)
(men,women)
(here,excerpts)
(hours,minutes)
(reported,editions)
(groups,including)
(vice,president)
(iraq,invaded)
(change,mind)
(employees,buy)
(reports,aoun)
(trial,gesell)
(action,committees)
(military,commanders)
(ago,food)
(family,courts)
(reagan,admini

## 2 -  Informacion Lexica
Bajar de Project Gutenberg el libro de Darwin ON THE ORIGIN OF SPECIES.

2.1 Procesar el texto, tokenizando eliminando signos de puntuacion.

2.2 Siguiendo el artıculo de la seccion, calcular la autocorrelacion para estimar la distribucion de la palabra a lo largo del texto.

2.3 Armar una funcion que reciba una lista de tokens, una lista de palabras y un tamano de ventana y devuelva una lista de probabilidades de encontrar la palabra en cada ventana para cada palabra pasada por parametro.

2.4 Calcular la entropıa de la distribucion de palabras seleccionadas para distintos tamanos de ventana

2.5 Generar una version randomizada del texto, y medir la entropia de las palabras randomizadas.

2.6 Distinguir las palabras del texto en artıculos, sustantivos y adjetivos usando un POS-tagger. Verificar si las medidas separan a estos grupos de palabras.

## 3 - Word embeddings, distancia semantica y Word- Net
3.1 Utilizando el test WordSim3531, comparar el rendimiento entre LSA[3] y Word2Vec2 [4].

3.2 Comparar los distintos word embeddings con las medidas definidas en WordNet.