# Tema 2: Recursos lingüísticos

## Ejercicio 1: Análisis con WordNet

In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from pprint import pprint
from tqdm import tqdm

### Apartado 1.1: Synsets y relaciones en WordNet

a) Sentidos de "fight"

In [2]:
# List every sense of "fight" (any part of speech) with its definition, lemmas and examples.
for sense in wn.synsets('fight'):
    # One print call per synset: four report lines followed by a blank separator line.
    print(
        f"Synset: {sense.name()}",
        f"Def: {sense.definition()}",
        f"Lemmas: {list(sense.lemma_names())}",
        f"Examples: {sense.examples()}",
        sep="\n",
        end="\n\n",
    )

Synset: battle.n.01
Def: a hostile meeting of opposing military forces in the course of a war
Lemmas: ['battle', 'conflict', 'fight', 'engagement']
Examples: ['Grant won a decisive victory in the battle of Chickamauga', 'he lost his romantic ideas about war when he got into a real engagement']

Synset: fight.n.02
Def: the act of fighting; any contest or struggle
Lemmas: ['fight', 'fighting', 'combat', 'scrap']
Examples: ['a fight broke out at the hockey game', 'there was fighting in the streets', 'the unhappy couple got into a terrible scrap']

Synset: competitiveness.n.01
Def: an aggressive willingness to compete
Lemmas: ['competitiveness', 'fight']
Examples: ['the team was full of fight']

Synset: fight.n.04
Def: an intense verbal dispute
Lemmas: ['fight']
Examples: ['a violent fight over the bill is expected in the Senate']

Synset: fight.n.05
Def: a boxing or wrestling match
Lemmas: ['fight']
Examples: ['the fight was on television last night']

Synset: contend.v.06
Def: be engaged

b) Synsets de "fight" como verbo

In [3]:
# Restrict the lookup to the verb senses of "fight".
synsets = wn.synsets('fight', 'v')

for verb_sense in synsets:
    # Three report lines per sense, then an empty line between senses.
    print(
        f"Def: {verb_sense.definition()}",
        f"Lemmas: {list(verb_sense.lemma_names())}",
        f"Examples: {verb_sense.examples()}",
        sep="\n",
        end="\n\n",
    )

Def: be engaged in a fight; carry on a fight
Lemmas: ['contend', 'fight', 'struggle']
Examples: ['the tribesmen fought each other', 'Siblings are always fighting', 'Militant groups are contending for control of the country']

Def: fight against or resist strongly
Lemmas: ['fight', 'oppose', 'fight_back', 'fight_down', 'defend']
Examples: ['The senator said he would oppose the bill', "Don't fight it!"]

Def: make a strenuous or labored effort
Lemmas: ['fight', 'struggle']
Examples: ['She struggled for years to survive without welfare', 'He fought for breath']

Def: exert oneself continuously, vigorously, or obtrusively to gain an end or engage in a crusade for a certain cause or person; be an advocate for
Lemmas: ['crusade', 'fight', 'press', 'campaign', 'push', 'agitate']
Examples: ['The liberal party pushed for reforms', "She is crusading for women's rights", 'The Dean is pushing for his favorite candidate']



c) Sentidos de "bank" como nombre

In [4]:
# Noun senses of "bank": print how many there are, then describe each one.
synsets = wn.synsets('bank', 'n')

print(f"'bank' tiene {len(synsets)} sentidos como nombre:\n")
for noun_sense in synsets:
    details = [
        f"Def: {noun_sense.definition()}",
        f"Lemmas: {list(noun_sense.lemma_names())}",
        f"Examples: {noun_sense.examples()}",
    ]
    # The extra "\n" yields the blank line that separates consecutive senses.
    print("\n".join(details) + "\n")

'bank' tiene 10 sentidos como nombre:

Def: sloping land (especially the slope beside a body of water)
Lemmas: ['bank']
Examples: ['they pulled the canoe up on the bank', 'he sat on the bank of the river and watched the currents']

Def: a financial institution that accepts deposits and channels the money into lending activities
Lemmas: ['depository_financial_institution', 'bank', 'banking_concern', 'banking_company']
Examples: ['he cashed a check at the bank', 'that bank holds the mortgage on my home']

Def: a long ridge or pile
Lemmas: ['bank']
Examples: ['a huge bank of earth']

Def: an arrangement of similar objects in a row or in tiers
Lemmas: ['bank']
Examples: ['he operated a bank of switches']

Def: a supply or stock held in reserve for future use (especially in emergencies)
Lemmas: ['bank']
Examples: []

Def: the funds held by a gambling house or the dealer in some gambling games
Lemmas: ['bank']
Examples: ['he tried to break the bank at Monte Carlo']

Def: a slope in the turn 

In [5]:
# Look up the hypernyms of a word.
def get_hypernyms(word, pos):
    """
    Collect the lemma names of the hypernyms of every sense of a word.

    For each synset returned by ``wn.synsets(word, pos)``, the synsets given
    by its ``hypernyms()`` method are visited and the names of all their
    lemmas are gathered into a single set (duplicates collapse).

    Parameters
    ----------
    word : str
        Word to look up in WordNet.
    pos : str
        Part-of-speech tag forwarded to ``wn.synsets``.

    Returns
    -------
    set[str]
        Lemma names of the hypernym synsets; empty if nothing is found.
    """
    return {
        lemma.name()
        for sense in wn.synsets(word, pos)
        for parent in sense.hypernyms()
        for lemma in parent.lemmas()
    }

# Show the hypernym lemma names gathered across the noun senses of "bank".
print("Hiperónimos de 'bank':")
bank_hypernyms = get_hypernyms('bank', 'n')
pprint(bank_hypernyms)

Hiperónimos de 'bank':
{'airplane_maneuver',
 'array',
 'backlog',
 'cash_in_hand',
 'container',
 'deposit',
 'depositary',
 'depository',
 'finances',
 'financial_institution',
 'financial_organisation',
 'financial_organization',
 'flight_maneuver',
 'funds',
 'incline',
 'monetary_resource',
 'pecuniary_resource',
 'repository',
 'reserve',
 'ridge',
 'side',
 'slope',
 'stockpile'}


In [6]:
# Look up the hyponyms of a word.
def get_hyponyms(word, pos):
    """
    Collect the lemma names of the hyponyms of every sense of a word.

    For each synset returned by ``wn.synsets(word, pos)``, the synsets given
    by its ``hyponyms()`` method are visited and the names of all their
    lemmas are added to one set (duplicates collapse).

    Parameters
    ----------
    word : str
        Word to look up in WordNet.
    pos : str
        Part-of-speech tag forwarded to ``wn.synsets``.

    Returns
    -------
    set[str]
        Lemma names of the hyponym synsets; empty if nothing is found.
    """
    collected = set()
    for sense in wn.synsets(word, pos):
        for child in sense.hyponyms():
            collected.update(lemma.name() for lemma in child.lemmas())
    return collected

# Show the hyponym lemma names gathered across the noun senses of "bank".
print("Hipónimos de 'bank':")
bank_hyponyms = get_hyponyms('bank', 'n')
pprint(bank_hyponyms)

Hipónimos de 'bank':
{'Federal_Reserve_Bank',
 'Home_Loan_Bank',
 'acquirer',
 'agent_bank',
 'blood_bank',
 'bluff',
 'commercial_bank',
 'credit_union',
 'eye_bank',
 'food_bank',
 'full_service_bank',
 'lead_bank',
 'member_bank',
 'merchant_bank',
 'penny_bank',
 'piggy_bank',
 'reserve_bank',
 'riverbank',
 'riverside',
 'sandbank',
 'soil_bank',
 'state_bank',
 'thrift_institution',
 'vertical_bank',
 'waterside'}


### Apartado 1.2: Similitud semántica y desambiguación

a) Similitud semántica entre synsets

In [None]:
# List every noun sense of each word so the most reasonable one can be picked by eye.
# A single data-driven loop replaces six copies of the same loop that differed
# only in the word being looked up.
candidate_words = ('dog', 'cat', 'car', 'eagle', 'whale', 'vehicle')

for position, word in enumerate(candidate_words):
    if position:
        # Blank line between consecutive words, but none after the last one.
        print()
    for syn in wn.synsets(word, 'n'):
        print(syn.name(), syn.definition())

dog.n.01 a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
frump.n.01 a dull unattractive unpleasant girl or woman
dog.n.03 informal term for a man
cad.n.01 someone who is morally reprehensible
frank.n.02 a smooth-textured sausage of minced beef or pork usually smoked; often served on a bread roll
pawl.n.01 a hinged catch that fits into a notch of a ratchet to move a wheel forward or prevent it from moving backward
andiron.n.01 metal supports for logs in a fireplace

cat.n.01 feline mammal usually having thick soft fur and no ability to roar: domestic cats; wildcats
guy.n.01 an informal term for a youth or man
cat.n.03 a spiteful woman gossip
kat.n.01 the leaves of the shrub Catha edulis which are chewed like tobacco or used to make tea; has the effect of a euphoric stimulant
cat-o'-nine-tails.n.01 a whip with nine knotted cords
caterpillar.n.02 a large tracked vehicle that is propelled

In [8]:
# Fix the sense of each word that the similarity cells below work with.
# NOTE(review): 'whale' uses sense 02 while the others use sense 01 — presumably
# sense 01 is not the animal; confirm against the sense listing.
(
    dog_synset,
    cat_synset,
    car_synset,
    eagle_synset,
    whale_synset,
    vehicle_synset,
) = map(
    wn.synset,
    ('dog.n.01', 'cat.n.01', 'car.n.01', 'eagle.n.01', 'whale.n.02', 'vehicle.n.01'),
)

In [9]:
# Path-based measures (shortest path in the taxonomy).
# The pairs and the measures are declared once and iterated over, instead of
# repeating the same print statement twelve times.
synset_pairs = (
    ("dog - cat", dog_synset, cat_synset),
    ("vehicle - car", vehicle_synset, car_synset),
    ("dog - car", dog_synset, car_synset),
    ("whale - eagle", whale_synset, eagle_synset),
)

# (heading to print, name of the Synset method that computes the measure).
# Headings after the first carry a leading newline to separate the sections.
taxonomy_measures = (
    ("Path Similarity:", "path_similarity"),
    ("\nLeacock-Chodorow Similarity:", "lch_similarity"),
    ("\nWu-Palmer Similarity:", "wup_similarity"),
)

for heading, method_name in taxonomy_measures:
    print(heading)
    for pair_label, first, second in synset_pairs:
        similarity = getattr(first, method_name)(second)
        print(f"  {pair_label}: {similarity:.4f}")

Path Similarity:
  dog - cat: 0.2000
  vehicle - car: 0.2000
  dog - car: 0.0769
  whale - eagle: 0.1111

Leacock-Chodorow Similarity:
  dog - cat: 2.0281
  vehicle - car: 2.0281
  dog - car: 1.0726
  whale - eagle: 1.4404

Wu-Palmer Similarity:
  dog - cat: 0.8571
  vehicle - car: 0.8000
  dog - car: 0.4000
  whale - eagle: 0.6923


In [10]:
from nltk.corpus import wordnet_ic

# Information-content based measure; the IC table is loaded from 'ic-brown.dat'.
brown_ic = wordnet_ic.ic('ic-brown.dat')

# Pairs are declared once and iterated over instead of repeating the print call.
resnik_pairs = (
    ("dog - cat", dog_synset, cat_synset),
    ("vehicle - car", vehicle_synset, car_synset),
    ("dog - car", dog_synset, car_synset),
    ("whale - eagle", whale_synset, eagle_synset),
)

print("Resnik Similarity (corpus Brown):")
for pair_label, first, second in resnik_pairs:
    print(f"  {pair_label}: {first.res_similarity(second, brown_ic):.4f}")

Resnik Similarity (corpus Brown):
  dog - cat: 7.9117
  vehicle - car: 5.9218
  dog - car: 1.5318
  whale - eagle: 5.2176


b) Desambiguación de "bank" por algoritmo de Lesk

In [11]:
from nltk.wsd import lesk

# Two contexts in which "bank" is expected to take different senses.
sentences = [
    "I went to the bank to deposit my money.",
    "The land along the river bank has vegetation."
]

for sent in sentences:
    context_tokens = word_tokenize(sent)
    # NOTE(review): no part of speech is passed to lesk, so the candidate senses
    # are presumably not limited to nouns — confirm whether pos='n' is wanted.
    sense = lesk(context_tokens, 'bank')
    print(f"Frase: {sent}\nSentido: {sense}")
    print(f"Definición: {sense.definition()}\n")

Frase: I went to the bank to deposit my money.
Sentido: Synset('depository_financial_institution.n.01')
Definición: a financial institution that accepts deposits and channels the money into lending activities

Frase: The land along the river bank has vegetation.
Sentido: Synset('bank.n.01')
Definición: sloping land (especially the slope beside a body of water)



## Ejercicio 2: Análisis de sentimiento

### Apartado 2.1: Lexicones de sentimiento

In [12]:
from nltk.corpus import opinion_lexicon

# Materialise both word lists of NLTK's opinion_lexicon corpus as sets, which is
# the form the membership tests in sentiment_lexicon work against.
positive_words = {*opinion_lexicon.positive()}
negative_words = {*opinion_lexicon.negative()}

def sentiment_lexicon(text):
    """
    Score a text by counting opinion-lexicon hits.

    The text is tokenized with ``nltk.word_tokenize`` and every token is
    lower-cased before being looked up in ``positive_words`` and
    ``negative_words``. A token present in both sets contributes +1 and -1,
    i.e. nothing.

    Parameters
    ----------
    text : str
        Raw text to score.

    Returns
    -------
    int
        Number of positive tokens minus number of negative tokens.
    """
    lowered_tokens = [token.lower() for token in nltk.word_tokenize(text)]
    positive_hits = sum(1 for token in lowered_tokens if token in positive_words)
    negative_hits = sum(1 for token in lowered_tokens if token in negative_words)
    return positive_hits - negative_hits


# Sample reviews scored with sentiment_lexicon in the loop below.
# Each long review is built from adjacent string literals that Python concatenates.
opinion_1 = ("Visceral, stunning and relentless film making. Dicaprio's Herculean, almost "
            "purely physical performance and Hardy's wide-eyed intensity coupled with the almost "
            "overwhelming beauty of the landscape - those trees, the natural light, the sun peeking "
            "through the clouds, rendered the proceedings down to savage poetry. A hypnotic, "
            "beautiful, exhausting film.")

opinion_2 = ("I saw this film on Friday. For the first 40 minutes involving spoken dialogue "
            "they need not have bothered. For me the dialogue was totally unintelligible with "
            "grunting, southern states drawl, and coarse accent that made it impossible to "
            "understand what they were saying.")

# Short sentence that mixes an insult ("idiotic") with praise ("magnificent").
opinion_3 = "It was an idiotic film that produces a magnificent fascination."

# Display name -> review text; iterated in insertion order by the loop below.
opinions = {"Opinión 1": opinion_1, "Opinión 2": opinion_2, "Opinión 3": opinion_3}

# Score each sample review and map the sign of the score to a label
# (positive score -> "Positiva", negative -> "Negativa", zero -> "Neutra").
for name, text in opinions.items():
    score = sentiment_lexicon(text)
    label = "Positiva" if score > 0 else "Negativa" if score < 0 else "Neutra"
    print(f"{name}: score={score} -> {label}")

Opinión 1: score=1 -> Positiva
Opinión 2: score=-4 -> Negativa
Opinión 3: score=1 -> Positiva


### Apartado 2.2: Entrenamiento de modelos

In [13]:
from nltk.corpus import movie_reviews

# File ids of each category.
negids, posids = (movie_reviews.fileids(category) for category in ('neg', 'pos'))

# Split each category into train (first 75%) and test (remaining 25%),
# keeping the order in which the corpus reader lists the files.
negcutoff, poscutoff = (int(len(ids) * 0.75) for ids in (negids, posids))

train_negids, test_negids = negids[:negcutoff], negids[negcutoff:]
train_posids, test_posids = posids[:poscutoff], posids[poscutoff:]

print(f"Total documentos negativos: {len(negids)}")
print(f"Total documentos positivos: {len(posids)}")
print(f"Train: {len(train_negids) + len(train_posids)}, Test: {len(test_negids) + len(test_posids)}")

Total documentos negativos: 1000
Total documentos positivos: 1000
Train: 1500, Test: 500


a) Accuracy con lexicón en movie_reviews

In [14]:
# Evaluate the lexicon on the test set.
labelled_groups = (('neg', test_negids), ('pos', test_posids))
test_fileids = [(fileid, label) for label, group in labelled_groups for fileid in group]

correct = 0
for fileid, true_label in tqdm(test_fileids):
    score = sentiment_lexicon(movie_reviews.raw(fileid))
    # Only strictly negative scores are predicted 'neg'; a score of 0 counts as 'pos'.
    pred_label = 'neg' if score < 0 else 'pos'
    correct += pred_label == true_label

accuracy = correct / len(test_fileids)
print(f"\nAccuracy del lexicón en test: {accuracy:.4f}")

  0%|          | 0/500 [00:00<?, ?it/s]

100%|██████████| 500/500 [00:01<00:00, 345.15it/s]


Accuracy del lexicón en test: 0.7040





b) Naive Bayes

In [15]:
from nltk.classify import NaiveBayesClassifier


def features(tokens: list[str]) -> dict[str, bool]:
    """
    Turn a sequence of tokens into a feature dictionary for NLTK classifiers.

    Every distinct token becomes a key whose value is True, marking its
    presence in the document; repeated tokens collapse into a single key.

    Parameters
    ----------
    tokens : list[str]
        Tokens of the document.

    Returns
    -------
    dict[str, bool]
        Dictionary mapping each token to True.
    """
    return dict.fromkeys(tokens, True)


def labelled_featuresets(fileids, label):
    """
    Build the (feature dict, label) pairs for a group of reviews.

    Parameters
    ----------
    fileids : list[str]
        File ids of the ``movie_reviews`` documents to convert.
    label : str
        Class label attached to every document of the group.

    Returns
    -------
    list[tuple[dict[str, bool], str]]
        One ``(features(words), label)`` pair per file id, in the same order.
    """
    return [(features(movie_reviews.words(fileids=[fileid])), label) for fileid in fileids]


# Build the feature sets from the same train/test file ids as the earlier split.
# One helper replaces four copies of the same list comprehension.
negfeats_train = labelled_featuresets(train_negids, 'neg')
posfeats_train = labelled_featuresets(train_posids, 'pos')
negfeats_test = labelled_featuresets(test_negids, 'neg')
posfeats_test = labelled_featuresets(test_posids, 'pos')

trainfeats = negfeats_train + posfeats_train
testfeats = negfeats_test + posfeats_test

print(f"Train: {len(trainfeats)}, Test: {len(testfeats)}")

classifier = NaiveBayesClassifier.train(trainfeats)

accuracy_nb = nltk.classify.util.accuracy(classifier, testfeats)
print(f"Accuracy Naive Bayes: {accuracy_nb:.4f}")

classifier.show_most_informative_features()

Train: 1500, Test: 500
Accuracy Naive Bayes: 0.7280
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0
