In [None]:
# !pip install fasttext
# !pip install gensim

In [None]:
import gensim 
import logging

### NPR Media Dialog Dataset Overview

* Dataset Specifications (npr.org archives):
  * 140,000+ NPR radio interview transcripts
  * 20-year temporal coverage
  * 10,000+ hours of transcribed audio content



* Available via Kaggle platform
  * [kaggle.com/datasets/shuyangli94/interview-npr-media-dialog-transcripts](https://www.kaggle.com/datasets/shuyangli94/interview-npr-media-dialog-transcripts)

In [None]:
# Peek at the first four physical lines of the raw CSV file.
# `enumerate` already supplies the counter, so no manual initialisation or increment is needed.
# The encoding is given explicitly so the preview reads the file the same way `read_input` does.
with open("./media/npr_1000_utterances.csv", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print(line)
        if i == 3:
            break
        

In [None]:
import csv

# Parse only the first row with the csv module to see how the fields are split.
# The file is opened in a `with` block so the handle is closed once the row has been printed,
# and with newline='' as the csv module documentation recommends for files passed to csv.reader.
with open("./media/npr_1000_utterances.csv", newline='', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in reader:
        print(row)
        break

We will use `simple_preprocess` to lowercase and tokenize a string (pass `deacc=True` to also strip accents).
The output of `simple_preprocess` is a list of tokens (unicode strings).



In [None]:
# A deliberately messy sample (mixed case, URL-like text, punctuation and symbols)
# to see what `simple_preprocess` returns for it.
some_text = "WWW.google.com So #$test in ~every! time! What?"

gensim.utils.simple_preprocess(some_text) 

In [None]:
def read_input(input_file):
    """Lazily read a CSV file and yield one token list per data row.

    The first row is skipped (assumed to be the header). For every remaining
    non-empty row, the last column is passed through
    `gensim.utils.simple_preprocess` and the resulting token list is yielded.

    Parameters
    ----------
    input_file : str or path-like
        Path of a UTF-8 encoded, comma-separated file with `"` as quote character.

    Yields
    ------
    list of str
        Tokens produced by `simple_preprocess` for the last field of a row.
    """
    with open(input_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader, None)  # skip the first row; the default avoids StopIteration on an empty file
        for row in reader:
            # csv.reader returns an empty list for a blank line; row[-1] would raise IndexError.
            if not row:
                continue
            yield gensim.utils.simple_preprocess(row[-1])

# `read_input` is a generator; `list(...)` consumes it so all token lists are held in memory.
utterances = list(read_input("./media/npr_1000_utterances.csv"))


In [None]:
# Number of token lists that were read (one per non-header CSV row).
len(utterances)


In [None]:
# First ten tokens of the utterance at index 20.
utterances[20][0:10]

We will use `gensim` to train a `Word2Vec` model on the 1000 utterances

https://radimrehurek.com/gensim/models/word2vec.html

In [None]:
# Train a Word2Vec model on the tokenized utterances.
# NOTE(review): no `seed` is passed and training uses 10 worker threads, so the learned vectors
# (and the neighbours shown further down) will presumably differ between runs — confirm whether
# reproducible output matters for this notebook.
model = gensim.models.Word2Vec(utterances, window=10, min_count=2, workers=10)
model

In [None]:
# Mapping from each vocabulary word to its integer index in the trained model.
# NOTE(review): this displays the whole mapping; consider showing only a slice if the output is too long.
model.wv.key_to_index 

In [None]:
# Vocabulary size: number of entries in the word -> index mapping.
len(model.wv.key_to_index)

In [None]:
# Index assigned to "washington"; `.get` returns None instead of raising when the key is absent.
model.wv.key_to_index.get("washington")

In [None]:
# Index assigned to "chicago"; `.get` returns None instead of raising when the key is absent.
model.wv.key_to_index.get("chicago")

In [None]:
# True when "tokyo" has no entry in the mapping. Identity comparison (`is None`) is the
# idiomatic None check and cannot be affected by a custom `__eq__`.
model.wv.key_to_index.get("tokyo") is None

### Question 1.
In the code above, we see that Washington is set to index 406, and Chicago is set to index 124. Why is Tokyo set to None?

In [None]:
# Number of components in the embedding vector learned for "washington".
model.wv["washington"].size

In [None]:
# The embedding vector itself.
model.wv["washington"]

In [None]:
# Ask the model for the six words it ranks as most similar to the query word.
w1 = ["washington"]
model.wv.most_similar(positive=w1, topn=6)

### Question 2.
When searching for the words most similar to 'Washington', we get results like 'his', 'from', etc., which are clearly not semantically similar to the word 'Washington'. Why is that? Didn't we show that Word2Vec does a good job of grouping semantically similar words, like city names?

### Some issues with the embeddings

**TODO:** Add text after the practical.

In [None]:
# Repeat the same pipeline on the larger 100,000-utterance file.
# Note: this rebinds `utterances` and `model`, so every cell below uses the larger model and
# the earlier 1,000-utterance model is no longer reachable.
utterances = list(read_input("./media/npr_100000_utterances.csv"))
model = gensim.models.Word2Vec(utterances, window=10, min_count=2, workers=10)
model

In [None]:
# Six nearest neighbours of "peace" according to the retrained model.
w1 = ["peace"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# Six nearest neighbours of "france" according to the retrained model.
w1 = ["france"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# Six nearest neighbours of "clean" according to the retrained model.
w1 = ["clean"]
model.wv.most_similar (positive=w1,topn=6)

In [None]:
# Question 3.
#
# Why do the embeddings seem more specific now? Can you explain?

### Facebook's FastText
> FastText is an open-source, free, lightweight library that allows users to learn text representations and text classifiers. It works on standard, generic hardware. Models can later be reduced in size to even fit on mobile devices.


![](https://www.dropbox.com/s/i74guibnv5mxx2h/fasttext.png?dl=1)

https://fasttext.cc/

In [None]:
# Download the pre-trained fastText English vector archive into the current working directory.
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

In [None]:
!mv wiki-news-300d-1M.vec.zip media/
!unzip -f media/wiki-news-300d-1M.vec.zip

In [None]:
# Show the first lines of the word list used to filter the embeddings.
!head ./media/words_to_keep

In [None]:
# Read the word list (one entry per line), stripping trailing whitespace/newlines.
# The `with` block closes the file handle as soon as the list is built.
with open("./media/words_to_keep", encoding="utf-8") as words_file:
    keep_words = [x.rstrip() for x in words_file]
keep_words

In [None]:
import numpy as np

# Membership tests on a list scan the whole list, and this loop visits every line of the
# vector file, so the lookup is done against a set built once up front.
keep_set = set(keep_words)

# word -> embedding vector, restricted to the words in `keep_words`.
words_embeds = {}
with open("media/wiki-news-300d-1M.vec", encoding="utf-8") as vec_file:
    for line in vec_file:
        data = line.split()
        # data[0] is the token; the remaining fields are parsed as the vector components.
        # `data` is empty for a blank line, which would otherwise raise IndexError.
        if data and data[0] in keep_set:
            words_embeds[data[0]] = np.array([float(value) for value in data[1:]])


In [None]:
# Number of components in the stored vector for "big".
words_embeds["big"].size


In [None]:
# Words for which a vector was actually found in the .vec file.
words_embeds.keys()

In [None]:
# Vector arithmetic: take the girl - boy offset and add it to "brother".
res = words_embeds["girl"] - words_embeds["boy"] + words_embeds["brother"]
res

In [None]:
# Component-wise difference between the computed vector and the stored "sister" vector,
# rounded to two decimals; values near zero mean the two vectors are close in that component.
(res - words_embeds["sister"]).round(2)

In [None]:
# Offset vector between "big" and "bigger".
words_embeds["big"] - words_embeds["bigger"]

In [None]:
# Offset vector between "bad" and "worse".
words_embeds["bad"] - words_embeds["worse"]

In [None]:
# Difference between the two offset vectors above; components near zero mean the
# bad -> worse and big -> bigger offsets agree in that dimension.
(words_embeds["bad"] - words_embeds["worse"]) - (words_embeds["big"] - words_embeds["bigger"])

In [None]:
# Faiss source and documentation: https://github.com/facebookresearch/faiss
# NOTE(review): on PyPI the CPU build is typically published as `faiss-cpu` — verify the
# package name for your environment before uncommenting.
# !pip install faiss-cpu
# NOTE(review): `dbutils` is not defined anywhere in this notebook; presumably it is the global
# that Databricks injects. Outside such an environment this line raises NameError — confirm
# whether the cell is still needed.
dbutils.fs.ls("dbfs:/FileStore/")

In [None]:
import faiss

In [None]:
# Parallel arrays: words[i] is the token whose vector is embeds[i]. Both are taken from the
# same dict, so keys() and values() are in matching order.
words = np.array(list(words_embeds.keys()))
# Faiss stores and searches float32 matrices, so build the matrix with that dtype up front
# rather than relying on an implicit float64 -> float32 conversion.
embeds = np.array(list(words_embeds.values()), dtype="float32")

In [None]:
# Exact (brute-force) L2 index. The dimensionality is read from the data instead of being
# hard-coded, so the cell keeps working if vectors of another size are loaded.
index = faiss.IndexFlatL2(embeds.shape[1])
index.add(embeds)

In [None]:
# A single query is still passed as a 2-D array: wrapping the vector in a list gives shape (1, d).
np.array([words_embeds["bad"]])

In [None]:
# Look up the 3 stored vectors closest to "bad". The query is built as a float32 matrix of
# shape (1, d), the dtype Faiss works with. The result is a (distances, indices) pair;
# the indices refer to rows of `embeds`, i.e. positions in `words`.
query = np.array([words_embeds["bad"]], dtype="float32")
index.search(query, k=3)

In [None]:
# The token array; positions correspond to the row indices returned by `index.search`.
words

In [None]:
# Token stored at row 0 of the index.
words[0]

In [None]:
# Whether the index reports itself as trained.
# NOTE(review): flat indexes presumably need no training step — confirm against the Faiss docs.
index.is_trained

### Demo from Faiss: The Missing Manual
- Demo posted on Pinecone's website

  - [Faiss: The Missing Manual](https://www.pinecone.io/learn/series/faiss/faiss-tutorial/)