# IIC-3670 NLP UC

# PROCESAMIENTO DE LENGUAJE NATURAL

- Versiones de librerías, python 3.8.10

- numpy 1.20.3
- nltk 3.7
- gensim 4.1.2


Skip-grams

In [1]:
import pandas as pd

# Read the dataset from 'mbti_1.csv' (path is relative to the working
# directory) and preview its first rows.
data_df = pd.read_csv('mbti_1.csv')
data_df.head()


Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [2]:
# Drop rows containing missing values and renumber the index from zero,
# rebinding the name instead of mutating the frame in place.
data_df = data_df.dropna().reset_index(drop=True)

# Keep a single copy of each distinct post text.
posts = data_df['posts'].drop_duplicates()

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer

# Load the English stop-word list as a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Initialize tokenizer: each token is a maximal run of ASCII letters and
# apostrophes, so digits, punctuation and whitespace act as separators
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
# It's also possible to try a stemmer instead, or to mix a stemmer and a lemmatizer
lemmatizer = WordNetLemmatizer()

def tokenize(document):
    """Turn a raw document into a flat list of lemmatized, lower-cased tokens.

    The document is split into sentences, each sentence is tokenized with the
    module-level ``tokenizer``, and every surviving token is lower-cased and
    lemmatized with the module-level ``lemmatizer``.

    A token is discarded when it is two characters long or shorter, or when
    its lower-cased form is in ``stop_words``.

    Parameters
    ----------
    document : str
        Raw text to process.

    Returns
    -------
    list of str
        Lemmas in order of appearance, across all sentences.
    """
    lemmas = []

    for sentence in sent_tokenize(document):
        for raw_token in tokenizer.tokenize(sentence):
            lowered = raw_token.lower()
            # Skip very short tokens and stop-words before lemmatizing.
            if len(raw_token) <= 2 or lowered in stop_words:
                continue
            lemmas.append(lemmatizer.lemmatize(lowered))

    return lemmas


In [4]:
# Tokenize every post; each corpus entry is the token list of one post.
corpus = [tokenize(raw_text) for raw_text in posts]


In [5]:
from gensim.models import Word2Vec

# Train Word2Vec on the tokenized posts. sg=1 selects skip-gram training
# (the default, sg=0, is CBOW); min_count=1 keeps every word, however rare.
model = Word2Vec(sentences=corpus, vector_size=100, window=5, min_count=1, workers=4, sg=1)

class gensim.models.word2vec.Word2Vec(sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), comment=None, max_final_vocab=None, shrink_windows=True)

Ver documentación en https://radimrehurek.com/gensim/models/word2vec.html

In [6]:
# Analogy query (king - man + woman), ranked with the multiplicative
# combination objective (3CosMul).
model.wv.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('lion', 0.8198302984237671),
 ('xavier', 0.797498345375061),
 ('luther', 0.797051727771759),
 ('leon', 0.7965139150619507),
 ('men', 0.7946487069129944),
 ('soldier', 0.7900184392929077),
 ('disney', 0.7893798351287842),
 ('princess', 0.7873115539550781),
 ('alysaria', 0.7868064641952515),
 ('misfit', 0.7865715026855469)]

In [7]:
# Words most similar to 'king' according to the model trained on the posts.
model.wv.similar_by_word('king')

[('lion', 0.7856584191322327),
 ('fisher', 0.7732495665550232),
 ('luther', 0.7261788845062256),
 ('alysaria', 0.7155086398124695),
 ("king's", 0.7119490504264832),
 ('leon', 0.7080392837524414),
 ('requiem', 0.7021111845970154),
 ('stephen', 0.6926408410072327),
 ('narnia', 0.687126100063324),
 ('braveheart', 0.6800704598426819)]

In [8]:
# Ask the model which word in the list fits least with the others.
model.wv.doesnt_match(['king', 'george', 'stephen', 'truck'])

'truck'

In [9]:
# Save the trained model to 'word2vec.model', then reload it from that file
# to show the round trip.
model.save("word2vec.model")
model = Word2Vec.load("word2vec.model")

In [10]:
import numpy as np

# Expose the embedding matrix and the vocabulary as NumPy arrays;
# labels[i] is the key whose embedding is vectors[i].
vectors = np.asarray(model.wv.vectors)
labels = np.asarray(model.wv.index_to_key)

In [11]:
# Find the position of 'king' in the vocabulary array.
np.where(labels == 'king')

(array([1419], dtype=int64),)

In [12]:
# Show the embedding of 'king'. The row is looked up by key instead of using
# a hardcoded position: the position depends on the vocabulary built during
# training and changes whenever the corpus or preprocessing changes.
vectors[model.wv.key_to_index['king']]

array([ 0.00305053,  0.07751658,  0.19334257, -0.16354838, -0.3084254 ,
       -0.7531376 ,  0.14012639,  0.09179023,  0.2704122 , -0.09401646,
       -0.09045535, -0.50629413,  0.05695203,  0.39294928,  0.02368007,
       -0.30303207,  0.3199876 , -0.17817433,  0.14174691, -0.3060423 ,
       -0.42470118,  0.37403706,  0.05370767, -0.2862552 , -0.00318105,
        0.43385172, -0.3831685 , -0.17883936, -0.24828139,  0.11514985,
        0.11100342, -0.087402  ,  0.6498078 , -0.7976212 , -0.2301595 ,
        0.35756162, -0.03907526,  0.31204155, -0.14215145, -0.56176907,
       -0.2477252 , -0.5503688 ,  0.13246572, -0.11576701,  0.43428642,
        0.24874106, -0.26845032, -0.26166075,  0.13088122, -0.0218265 ,
        0.70565796, -0.29254815,  0.9138555 , -0.1660255 , -0.6853122 ,
        0.07010581, -0.07306837, -0.05875859, -0.11203361, -0.18895835,
        0.05033844, -0.20773807,  0.19312263, -0.70105815, -0.0266077 ,
        0.2633894 ,  0.47297943,  0.03159411, -0.1244204 ,  0.20

In [13]:
# Score the trained vectors on the analogy questions in 'questions-words.txt'
# (path is relative to the working directory).
score, results=model.wv.evaluate_word_analogies('questions-words.txt')

In [14]:
# Overall analogy score of the model trained on the posts.
score

0.13290740574102403

Pretrained models

In [15]:
import gensim.downloader

# List the names of the models available through gensim's downloader.
print(list(gensim.downloader.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [16]:
# Load the pretrained 'word2vec-google-news-300' vectors via gensim's downloader.
# NOTE(review): presumably triggers a large download on first use — confirm.
w2v_google = gensim.downloader.load('word2vec-google-news-300')



In [17]:
# Same analogy query (king - man + woman), now on the pretrained vectors.
w2v_google.most_similar_cosmul(positive=['king', 'woman'], negative=['man'])

[('queen', 0.9314123392105103),
 ('monarch', 0.858533501625061),
 ('princess', 0.8476566076278687),
 ('Queen_Consort', 0.8150269985198975),
 ('queens', 0.8099815249443054),
 ('crown_prince', 0.8089976906776428),
 ('royal_palace', 0.8027306795120239),
 ('monarchy', 0.8019613027572632),
 ('prince', 0.800979733467102),
 ('empress', 0.7958389520645142)]

In [18]:
# Words most similar to 'king' according to the pretrained vectors.
w2v_google.similar_by_word('king')

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474),
 ('sultan', 0.5864824056625366),
 ('ruler', 0.5797567367553711),
 ('princes', 0.5646552443504333),
 ('Prince_Paras', 0.5432944297790527),
 ('throne', 0.5422105193138123)]

In [19]:
# Same odd-one-out query, now on the pretrained vectors.
w2v_google.doesnt_match(['king', 'george', 'stephen', 'truck'])

'truck'

In [20]:
# Score the pretrained vectors on the same analogy file; this rebinds
# `score` and `results` from the earlier evaluation.
score, results = w2v_google.evaluate_word_analogies('questions-words.txt')

In [21]:
# Overall analogy score of the pretrained vectors.
score

0.7401448525607863