In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import Image

In [None]:
!pip install fasttext spacy-transformers spacy

In [None]:
#%reload_ext autoreload
#%autoreload 2
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans
from sklearn import metrics
import nmslib
import spacy
import fasttext
import fasttext.util
# 20 Newsgroups corpus: http://qwone.com/~jason/20Newsgroups/
dataset = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    download_if_missing=True,
)

np.random.seed(123)  # seed NumPy's global random state
texts, target = dataset.data, dataset.target  # documents and their labels
display(len(texts))

### GloVe


In [None]:
Image('../images/glove.png', width=840, height=640)



### Word2Vec
[Word2Vec paper](https://arxiv.org/pdf/1301.3781v3.pdf)

In [None]:
Image('../images/CBOW.png', width=320, height=320)

In [None]:
Image('../images/SkipGram.png', width=420, height=420)

## FastText
[FastText paper](https://arxiv.org/pdf/1607.04606.pdf). 
But a more approachable explanation can be found [here](https://amitness.com/2020/06/fasttext-embeddings/):

In [None]:
Image('../images/FastText1.png')

In [None]:
Image('../images/FastText2.png')

In [None]:
# I skip this step because it takes too long
# fasttext.util.download_model('en', if_exists='ignore')  # English


In [None]:
# consumes too much RAM for me
# ft = fasttext.load_model('cc.en.300.bin')

In [None]:
# this is how you can decrease the embedding size
# fasttext.util.reduce_model(ft, 100)
# ft.get_dimension()
# ft.save_model('cc.en.100.bin')

In [None]:
# Load the reduced (100-dimensional) English fastText model from the working directory.
# No active cell creates this file: it is the result of the commented-out
# download_model / reduce_model / save_model steps above.
ft = fasttext.load_model('cc.en.100.bin')

In [None]:
# get_word_vector() treats its argument as ONE token, so a multi-word string would be
# embedded as a single unknown "word". For a whole sentence use the sentence-level API.
ft.get_sentence_vector('I am very smart but nobody knows')

In [None]:
# Re-bind the documents and their labels from the loaded corpus.
texts, target = dataset.data, dataset.target

In [None]:
# One fastText vector per document.
# get_word_vector() would treat each complete post as a single token; get_sentence_vector()
# splits the text into tokens and combines their vectors instead. It accepts a single line
# only (it raises on '\n'), hence the newline replacement.
X = np.array([ft.get_sentence_vector(t.replace('\n', ' ')) for t in texts])

In [None]:
import numpy as np
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
# 3-fold cross-validated micro-F1 of an SGD classifier (default loss) on X.
# NOTE(review): the features are passed in unscaled; no scaling pipeline is used here.
clf = cross_val_score(
    SGDClassifier(max_iter=1000, tol=1e-3),
    X,
    target,
    cv=3,
    scoring='f1_micro',
)

In [None]:
np.mean(clf)

In [None]:
# Same evaluation with the perceptron loss; the cell displays the mean of the fold scores.
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    X,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

In [None]:
# Drop the reference to the fastText model, then remove the name from the namespace.
# Assigning None first keeps the cell re-runnable: `del ft` alone would fail on a second run.
ft=None
del ft

### BERT embeddings

In [None]:
Image('../images/BERT1.png')

In [None]:
# Download the spaCy pipeline package that is loaded in the next cell.
# NOTE(review): `!python` runs whichever interpreter is first on PATH, which may not be the kernel's; confirm.
!python -m spacy download en_core_web_trf

In [None]:
# Load the transformer pipeline with the listed components disabled.
nlp = spacy.load(
    'en_core_web_trf',
    disable=["tagger", "ner", "parser", "textcat"],
)

In [None]:
# Run every document through the pipeline and keep the last tensor of its transformer output.
# NOTE(review): later cells take element [0] and average over axis 0, so each entry is
# presumably 2-D; confirm against the installed spacy-transformers version.
# This rebinds X, which held the fastText features before.
X = [doc._.trf_data.tensors[-1] for doc in nlp.pipe(texts)]

In [None]:
import pickle
# Write the transformer outputs to disk so they can be reloaded without re-running the pipeline.
with open('spacy_embeddings.pkl', 'wb') as sink:
    pickle.dump(X, sink)

In [None]:
import pickle
# Reload the transformer outputs from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('spacy_embeddings.pkl', 'rb') as tap:
    X = pickle.load(tap)

In [None]:
# Use only the first row of each document's tensor as its feature vector.
X0 = np.array([doc_tensor[0] for doc_tensor in X])
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    X0,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

In [None]:
# Average each document's tensor over its first axis to get one vector per document.
X1 = np.array([np.mean(doc_tensor, axis=0) for doc_tensor in X])
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    X1,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

### Universal Sentence Encoder
https://pypi.org/project/spacy-universal-sentence-encoder/

In [None]:
!pip install spacy-universal-sentence-encoder

In [None]:
import spacy_universal_sentence_encoder
# Load the 'en_use_md' model. This rebinds nlp, which held the en_core_web_trf pipeline before.
nlp = spacy_universal_sentence_encoder.load_model('en_use_md')

In [None]:
doc1 = nlp('Hi there, how are you?')
doc2 = nlp('Hello there, how are you doing today?')

In [None]:
import numpy as np
np.dot(doc1.vector, doc2.vector)

In [None]:
doc1.similarity(doc2)

In [None]:
import pickle

# One document vector per text, written to disk right after it is computed.
X_md = [nlp(text).vector for text in texts]
with open('universal_sentence_encoder_embeddings.pkl', 'wb') as sink:
    pickle.dump(X_md, sink)

In [None]:
import numpy as np
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
# 3-fold cross-validated micro-F1 (perceptron loss) on the X_md document vectors.
# NOTE(review): the features are passed in unscaled; no scaling pipeline is used here.
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    X_md,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the default norm ('l2'); English stop words are removed and
# max_df=0.3 drops terms that are too frequent across documents.
# This rebinds X1, which held the averaged transformer vectors before.
vectorizer = TfidfVectorizer(
    max_df=0.3,
    stop_words='english',
)
X1 = vectorizer.fit_transform(texts)

# documents, vocabulary size
print(f'{X1.shape[0]}, {X1.shape[1]}')


In [None]:
# 3-fold cross-validated micro-F1 (perceptron loss) on the TF-IDF matrix.
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    X1,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

### Sentence-Transformers
The initial [paper](https://arxiv.org/pdf/1908.10084.pdf)

In [None]:
Image('../images/sentence-transformers1.png')

In [None]:
Image('../images/triplet_loss.png')

In [None]:
!pip install nltk sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict
import json
import os
import pickle
from itertools import chain
from nltk.tokenize import sent_tokenize
import torch
import nltk
import re
nltk.download('punkt')

### multilingual sentence transformer
This is the [publication](https://arxiv.org/abs/2004.09813) on this ingenious idea.

In [None]:
Image('../images/multi_sbert.png')

In [None]:
# Model cache directory, relative to the notebook like the ../images paths used for the
# figures (the previous absolute home-directory path only worked on one machine).
source = '../data'
model_dir = os.path.join(source, 'multi-qa-mpnet-base-dot-v1.pth')
try:
    # Prefer the locally saved copy.
    model = SentenceTransformer(model_dir, device='cpu')
except Exception:
    # No usable local copy: load the model by name and save it for the next run.
    # `Exception` instead of a bare `except:` so that a kernel interrupt (KeyboardInterrupt)
    # during the slow load is not swallowed and turned into a download.
    model = SentenceTransformer('multi-qa-mpnet-base-dot-v1', device='cpu')
    model.save(model_dir)

model = model.to('cpu')
model.eval()

In [None]:
# Encode every document on the CPU with gradient tracking switched off.
with torch.no_grad():
    embeddings = model.encode(texts, device='cpu')

In [None]:
import pickle
# Write the sentence-transformer embeddings to disk so they can be reloaded without re-encoding.
with open('sentence_transformer_embeddings.pkl', 'wb') as sink:
    pickle.dump(embeddings, sink)

In [None]:
import pickle
# Reload the embeddings from disk.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('sentence_transformer_embeddings.pkl', 'rb') as tap:
    embeddings = pickle.load(tap)

In [None]:
# 3-fold cross-validated micro-F1 (perceptron loss) on the sentence-transformer embeddings.
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    embeddings,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

Can we combine two different approaches: TF-IDF and the sentence embeddings?

In [None]:
from scipy import sparse

# Column-wise concatenation of the dense sentence embeddings and the sparse TF-IDF matrix.
# hstack() returns a COO matrix by default, which cannot be row-indexed; request CSR
# explicitly so the result can be sliced (e.g. into cross-validation folds) directly.
bigX = sparse.hstack([embeddings, X1], format='csr')

In [None]:
# 3-fold cross-validated micro-F1 (perceptron loss) on the combined feature matrix.
clf = cross_val_score(
    SGDClassifier(loss="perceptron", max_iter=1000, tol=1e-3),
    bigX,
    target,
    cv=3,
    scoring='f1_micro',
)
np.mean(clf)

In [None]:
!pip install bs4

In [None]:
import requests

# The same article in five languages; only the language code in the URL differs.
PAGE_URL = 'https://www.eda.admin.ch/aboutswitzerland/{lang}/home/gesellschaft/sprachen/die-sprachen---fakten-und-zahlen.html'


def fetch_page(lang, timeout=30):
    """Download the article for the language code ``lang``.

    A timeout is passed because requests otherwise waits indefinitely for a
    stalled server, and HTTP error statuses raise instead of letting an error
    page be parsed as if it were the article.

    Args:
        lang: language code inserted into the URL, e.g. 'de'.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.RequestException: on connection errors, timeouts and 4xx/5xx responses.
    """
    response = requests.get(PAGE_URL.format(lang=lang), timeout=timeout)
    response.raise_for_status()
    return response


response_de = fetch_page('de')
response_it = fetch_page('it')
response_fr = fetch_page('fr')
response_en = fetch_page('en')
response_es = fetch_page('es')

In [None]:
from bs4 import BeautifulSoup

# Parse each downloaded page with the built-in HTML parser (order: de, it, fr, en, es).
soup_de, soup_it, soup_fr, soup_en, soup_es = (
    BeautifulSoup(page.text, features="html.parser")
    for page in (response_de, response_it, response_fr, response_en, response_es)
)

In [None]:
import re


def _clean_paragraphs(soup):
    """Return the text of every <p> element in ``soup``, whitespace runs collapsed to one space and stripped."""
    return [
        re.sub(r'\s{1,}', ' ', tag.get_text().replace('\n', ' ')).strip()
        for tag in soup.find_all("p")
    ]


paragraphs_de = _clean_paragraphs(soup_de)
paragraphs_it = _clean_paragraphs(soup_it)
paragraphs_fr = _clean_paragraphs(soup_fr)
paragraphs_en = _clean_paragraphs(soup_en)
paragraphs_es = _clean_paragraphs(soup_es)

In [None]:
# Model cache directory, relative to the notebook like the ../images paths used for the
# figures (the previous absolute home-directory path only worked on one machine).
source = '../data'
model_dir = os.path.join(source, 'paraphrase-multilingual-mpnet-base-v2.pth')
try:
    # Prefer the locally saved copy.
    model = SentenceTransformer(model_dir, device='cpu')
except Exception:
    # No usable local copy: load the model by name and save it for the next run.
    # `Exception` instead of a bare `except:` so that a kernel interrupt (KeyboardInterrupt)
    # during the slow load is not swallowed and turned into a download.
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2', device='cpu')
    model.save(model_dir)

model = model.to('cpu')
model.eval()

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

# Punkt sentence models are language specific. sent_tokenize() defaults to the English
# model, which can mis-split text in the other languages, so every page is tokenized
# with the model of its own language.
paragraphs_by_language = {
    'german': paragraphs_de,
    'italian': paragraphs_it,
    'french': paragraphs_fr,
    'english': paragraphs_en,
    'spanish': paragraphs_es,
}

# All paragraphs in one list, in the same order as the mapping above (de, it, fr, en, es).
all_paragraphs = paragraphs_de + paragraphs_it + paragraphs_fr + paragraphs_en + paragraphs_es

# Paragraphs with at most two space-separated tokens are skipped.
sentences = [
    sentence
    for language, paragraphs in paragraphs_by_language.items()
    for paragraph in paragraphs
    if len(paragraph.split(' ')) > 2
    for sentence in sent_tokenize(paragraph, language=language)
]

with torch.no_grad():
    multi_lang_embeddings = model.encode(sentences, device='cpu')

In [None]:
multi_lang_embeddings.shape

### Visualization via UMAP

In [None]:
import umap.umap_ as umap
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

# UMAP is stochastic; a fixed random_state makes the 2-D layout reproducible between runs
# (same seed value as the np.random.seed call at the top of the notebook).
reducer = umap.UMAP(random_state=123)

In [None]:
# Scale every embedding (row) to unit Euclidean length.
h, w = multi_lang_embeddings.shape
normalized = multi_lang_embeddings / np.linalg.norm(multi_lang_embeddings, axis=1).reshape(h, 1)

In [None]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper
from bokeh.palettes import Spectral10

output_notebook()

In [None]:
# Fit the reducer on the unit-length embeddings and project them; the result is read as (x, y) columns below.
umap_transformed = reducer.fit_transform(normalized)

In [None]:
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html
from bokeh.plotting import figure, output_file, save

# 2-D UMAP coordinates plus the sentence text shown in the hover tooltip.
embedding_df = pd.DataFrame(umap_transformed, columns=('x', 'y'))
embedding_df['sentences'] = sentences

# Language code -> paragraphs of that page, in the order the pages were concatenated.
_paragraphs_by_code = {
    'de': paragraphs_de,
    'it': paragraphs_it,
    'fr': paragraphs_fr,
    'en': paragraphs_en,
    'es': paragraphs_es,
}


def _language_of(sentence):
    """Return the code of the first page that has a paragraph containing ``sentence``, else 'unknown'."""
    for code, paragraphs in _paragraphs_by_code.items():
        if any(sentence in paragraph for paragraph in paragraphs):
            return code
    return 'unknown'


# Colour by language. The previous mapper was fed the sentence text itself while its factors
# were the strings '0'..'8', so no point ever matched a factor and all got the fallback colour.
embedding_df['language'] = [_language_of(sentence) for sentence in sentences]

datasource = ColumnDataSource(embedding_df)
color_mapping = CategoricalColorMapper(factors=list(_paragraphs_by_code),
                                       palette=Spectral10)
# Written next to the other figures, relative to the notebook (was an absolute home-directory path).
output_file(filename='../images/multilang_umap.html', title = 'sentence similarity')
plot_figure = figure(
    title='UMAP projection of sentence embeddings',
    # width/height replace plot_width/plot_height, which were removed in Bokeh 3.
    width=1000,
    height=800,
    tools=('pan, wheel_zoom, reset')
)

# The tooltip no longer contains an <img src='@image'> element: the data source has no
# 'image' column, so it rendered as a broken image.
plot_figure.add_tools(HoverTool(tooltips="""
<div>
    <div>
        <span style='font-size: 16px; color: #224499'>sentence:</span>
        <span style='font-size: 18px'>@sentences</span>
    </div>
    <div>
        <span style='font-size: 16px; color: #224499'>language:</span>
        <span style='font-size: 18px'>@language</span>
    </div>
</div>
"""))

plot_figure.circle(
    'x',
    'y',
    source=datasource,
    color=dict(field='language', transform=color_mapping),
    line_alpha=0.6,
    fill_alpha=0.6,
    size=4
)
save(plot_figure)
# show(plot_figure)

### Monolingual models can be found on the Hugging Face Hub:
Best [German model](https://huggingface.co/T-Systems-onsite/german-roberta-sentence-transformer-v2)