# Purpose
This notebook tests loading pre-trained fastText embeddings for German and converting German-language posts into sentence embeddings.


# Notebook setup

In [1]:
# Auto-reload imported modules before each cell runs, so edits to project
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [155]:
import gc
import os

import fse
from fse.models import uSIF
import gensim
from gensim.models.fasttext import FastText, load_facebook_vectors

import numpy as np
import pandas as pd
import plotly
import plotly.express as px

from subclu.data.fasttext import (
    download_ft_pretrained_model,
    get_df_for_most_similar
)
from subclu.utils import set_working_directory
from subclu.utils.eda import (
    setup_logging, counts_describe, value_counts_and_pcts,
    notebook_display_config, print_lib_versions,
    style_df_numeric
)
from subclu.eda.aggregates import (
    compare_raw_v_weighted_language
)
from subclu.utils.data_irl_style import (
    get_colormap, theme_dirl
)


# Record the library versions used for this run (helps reproduce the results).
print_lib_versions([fse, gensim, np, pd, plotly])

python		v 3.7.10
===
fse		v: 0.1.15
gensim		v: 3.8.3
numpy		v: 1.19.5
pandas		v: 1.2.4
plotly		v: 4.14.3


In [30]:
# plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib.dates as mdates
# Use matplotlib's built-in 'default' style sheet.
plt.style.use('default')

# Project helpers whose definitions are not visible here; presumably they
# configure logging and the notebook/pandas display options -- TODO confirm.
setup_logging()
notebook_display_config()

# Load posts

In [31]:
%%time

project_id='data-science-prod'
bucket_name = 'i18n-subreddit-clustering'
folder_posts = 'posts/2021-05-10'

l_cols_load = [
    # IDs
    'subreddit_name',
    'subreddit_id',
    'post_id',
#     'user_id',
    'thing_type',
    
    # Meta
#     'submit_date',
#     'removed',
#     'upvotes',
#     'successful',
#     'app_name',
    'post_type',
    'post_nsfw',
    'geolocation_country_code',
    
    # Language & text content
    'post_url',
    'language',
    'probability',
    'weighted_language',
    'weighted_language_probability',
    'text_len',
    'text_word_count_estimate',
    'text'
]

df_posts = pd.read_parquet(path=f"gs://{bucket_name}/{folder_posts}",
                           columns=l_cols_load)

CPU times: user 522 ms, sys: 89.4 ms, total: 611 ms
Wall time: 6.63 s


In [11]:
# Sanity check: number of rows and columns loaded.
df_posts.shape

(26567, 15)

In [32]:
# df_posts.info(memory_usage='deep')

In [34]:
# Per-column summary: dtype, non-null count, unique values and nulls.
counts_describe(df_posts)

Unnamed: 0,dtype,count,unique,unique-percent,null-count,null-percent
subreddit_name,object,26567,84,0.32%,0,0.00%
subreddit_id,object,26567,84,0.32%,0,0.00%
post_id,object,26567,26567,100.00%,0,0.00%
thing_type,object,26567,1,0.00%,0,0.00%
post_type,object,26567,9,0.03%,0,0.00%
post_nsfw,object,26366,2,0.01%,201,0.76%
geolocation_country_code,object,26567,108,0.41%,0,0.00%
post_url,object,26055,25646,98.43%,512,1.93%
language,object,26567,72,0.27%,0,0.00%
probability,float64,26567,22145,83.36%,0,0.00%


# Download pre-trained model

**WARNING** If this is the first time running the notebook, **it can take 10+ minutes to download** the pretrained fastText model.

fastText documentation:
- https://fasttext.cc/docs/en/crawl-vectors.html
- https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md
> We distribute pre-trained word vectors for 157 languages, trained on Common Crawl and Wikipedia using fastText. These models were trained using CBOW with position-weights, in dimension 300, with character n-grams of length 5, a window of size 5 and 10 negatives.


In [35]:
%%time

# Fetch the German ('de') pre-trained fastText model and keep the returned path.
# NOTE(review): `if_exists='ignore'` presumably skips the download when the
# file is already present -- confirm in subclu.data.fasttext.
f_ft_model_de = download_ft_pretrained_model(lang_id='de', if_exists='ignore')
f_ft_model_de

Saving embeddings to:
  /home/jupyter/subreddit_clustering_i18n/data/embeddings/fasttext


PosixPath('/home/jupyter/subreddit_clustering_i18n/data/embeddings/fasttext/cc.de.300.bin')

# Load embeddings

ETA for loading model on instance: `N1-standard, 16 CPUs 60GB RAM`:

| Language | Function | Time  | RAM  |
|---|---|---|---|
| German (de)  | `load_facebook_vectors` | 1min 30s | ~12GB  |
| English (en) | `load_facebook_vectors` |  ?? | ??  |


---
### Reference / Docs:
- `gensim` fasttext API reference: https://radimrehurek.com/gensim/models/fasttext.html
- `gensim` fasttext Tutorial/training new model: https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html
- `fse` tutorial: https://towardsdatascience.com/vis-amz-83dea6fcb059
- `fse` github: https://github.com/oborchers/Fast_Sentence_Embeddings

**Note**: `load_fasttext_format` is deprecated. When using fse+gensim, best to go with `load_facebook_vectors` to reduce required RAM unless/until we fine-tune existing model(s).

> DeprecationWarning: Call to deprecated `load_fasttext_format`. Instead, use: 
> - `load_facebook_vectors` (to use pretrained embeddings) or 
> - `load_facebook_model` (to continue training with the loaded full model, more RAM).

In [45]:
%%time

# Load only the pre-trained vectors (not the full trainable model) to keep RAM
# usage down; the path is passed to the gensim loader as a plain string.
ft_de = load_facebook_vectors(str(f_ft_model_de))

CPU times: user 1min 22s, sys: 8.04 s, total: 1min 30s
Wall time: 1min 30s


In [196]:
# Force a garbage-collection pass; the value shown is the number of
# unreachable objects the collector found.
gc.collect()

6989

# Basic checks
- vocab size
- any difference between upper case & lower case?
- how do emojis get handled? 
  - i.e., do we need to filter them out before embeddings?

## This model has 2,000,000 words in its vocabulary

In [211]:
# Vocabulary size, rendered with thousands separators.
format(len(ft_de.vocab), ",.0f")

'2,000,000'

## Check generic words & compare case
This fastText model is case-sensitive, so `Hallo` and `hallo` have different vectors, but the two are still closely related.

In [204]:
# Similarity between the lower-case and capitalised forms of the same greeting.
ft_de.similarity('hallo', 'Hallo')

0.8284326

In [210]:
# Nearest neighbours for each casing, side by side; also prints whether each
# query word is reported as in-vocabulary.
get_df_for_most_similar(ft_de, ['Hallo', 'hallo'], print_oov_check=True)

True -> Hallo in vocabulary?
True -> hallo in vocabulary?


Unnamed: 0,'Hallo' similar_words,'Hallo' similarity_score,Unnamed: 3,'hallo' similar_words,'hallo' similarity_score
0,Hallöchen,0.829399,|,Hallo,0.828433
1,hallo,0.828433,|,hallöchen,0.779361
2,Moin,0.814899,|,hallo.,0.731166
3,Huhu,0.768263,|,Hallöchen,0.723373
4,Hallihallo,0.760667,|,hallöle,0.723245
5,Hey,0.748436,|,Hey,0.707068
6,.Hallo,0.736457,|,huhu,0.703696
7,Hallöle,0.728758,|,hallihallo,0.696369
8,Hallo.,0.718392,|,moin,0.688673
9,hallöchen,0.713206,|,halloich,0.685797


### `Halo` v `Hallo`
We see that `Halo` is associated with other video games. Even the lower-case form (`halo`) is still mostly associated with video games -- one of the known drawbacks of static embeddings.

In [206]:
# Compare the greeting with the similarly spelled 'Halo'.
ft_de.similarity('Hallo', 'Halo')

0.21595238

In [209]:
# Nearest neighbours for 'Halo' in both casings.
get_df_for_most_similar(ft_de, ['Halo', 'halo'], print_oov_check=True)

True -> Halo in vocabulary?
True -> halo in vocabulary?


Unnamed: 0,'Halo' similar_words,'Halo' similarity_score,Unnamed: 3,'halo' similar_words,'halo' similarity_score
0,Halo-Serie,0.655669,|,halos,0.573554
1,Halo-Universum,0.650042,|,Halo,0.571563
2,Halo3,0.637668,|,xbox360,0.52677
3,Halo-Reihe,0.636786,|,xbox,0.525676
4,Killzone,0.605346,|,Halo3,0.519317
5,Halo2,0.601156,|,farcry,0.510365
6,Battlefront,0.595721,|,bioshock,0.507828
7,Gears,0.587074,|,fallout,0.498994
8,TMCC,0.580518,|,tekken,0.496574
9,ODST,0.580247,|,skyrim,0.493982


## The model includes names of some political figures

### Countries & political figures

In [230]:
# Nearest neighbours for the names of three political figures.
get_df_for_most_similar(ft_de, ['Merkel', 'Biden', 'Trump'], print_oov_check=True)

True -> Merkel in vocabulary?
True -> Biden in vocabulary?
True -> Trump in vocabulary?


Unnamed: 0,'Merkel' similar_words,'Merkel' similarity_score,Unnamed: 3,'Biden' similar_words,'Biden' similarity_score,Unnamed: 6,'Trump' similar_words,'Trump' similarity_score
0,Kanzlerin,0.823135,|,Bidens,0.643363,|,Trumps,0.834379
1,Merkels,0.773008,|,Obama,0.590702,|,Obama,0.769192
2,Bundeskanzlerin,0.76059,|,Clinton,0.577105,|,Clinton,0.710054
3,Steinmeier,0.747596,|,McCain,0.557189,|,Trump-,0.702948
4,Schäuble,0.740701,|,Bush,0.548116,|,Anti-Trump,0.691796
5,Gauck,0.707773,|,Poroschenko,0.519384,|,Trump-Regierung,0.6903
6,Seehofer,0.703044,|,Cheney,0.514759,|,Trump-Lager,0.686202
7,.Merkel,0.696072,|,US-Vizepräsident,0.509718,|,Obamas,0.67244
8,Westerwelle,0.673326,|,Donilon,0.509373,|,US-Präsident,0.6722
9,Putin,0.671824,|,US-Vize,0.506996,|,Trump.,0.669235


In [177]:
# Same country, German name vs. English name.
ft_de.similarity('Deutschland', 'Germany')

0.5478157

In [189]:
# Nearest neighbours for each country name, wrapped in the project's numeric
# styling helper for display.
style_df_numeric(get_df_for_most_similar(ft_de, ['Deutschland', 'Germany'], print_oov_check=True))

True -> Deutschland in vocabulary?
True -> Germany in vocabulary?


Unnamed: 0,Deutschland similar_words,similarity_score-Deutschland,Unnamed: 3,Germany similar_words,similarity_score-Germany
0,Österreich,0.74,|,Germany-,0.72
1,Europa,0.74,|,germany,0.68
2,Bundesrepublik,0.71,|,Germany.,0.65
3,Schweiz,0.7,|,GERMANY,0.63
4,Frankreich,0.67,|,Europe,0.62
5,Deuschland,0.65,|,Gemany,0.62
6,Großbritannien,0.65,|,Switzerland,0.61
7,Süddeutschland,0.63,|,West-Germany,0.6
8,Italien,0.63,|,Germany4,0.59
9,Amerika,0.63,|,Germany2,0.59


## The model also includes _some_ English words

### Sample English v. German sport terms

In [178]:
# English vs. German word for the same sport.
ft_de.similarity('soccer', 'Fußball')

0.43577126

In [212]:
# Nearest neighbours for the English term and both casings of the German term.
style_df_numeric(get_df_for_most_similar(ft_de, ['soccer', 'Fußball', 'fußball'], print_oov_check=True))

True -> soccer in vocabulary?
True -> Fußball in vocabulary?
True -> fußball in vocabulary?


Unnamed: 0,'soccer' similar_words,'soccer' similarity_score,Unnamed: 3,'Fußball' similar_words,'Fußball' similarity_score,Unnamed: 6,'fußball' similar_words,'fußball' similarity_score
0,football,0.65,|,Fussball,0.82,|,fußballer,0.7
1,Soccer,0.61,|,Fußballs,0.69,|,Fußball,0.65
2,footballer,0.58,|,Fußbal,0.69,|,fussball,0.65
3,rugby,0.58,|,Fußballsport,0.68,|,fußballspiel,0.64
4,soccer.at,0.54,|,FußballFußball,0.66,|,fußballverein,0.63
5,Streetsoccer,0.53,|,fußball,0.65,|,ballsport,0.63
6,athletic,0.52,|,Fußballer,0.65,|,fußballspieler,0.63
7,sporting,0.52,|,Handball,0.65,|,fußballspielen,0.62
8,sports,0.52,|,Fußball.,0.64,|,Fussball,0.61
9,Football,0.52,|,Fußballspiel,0.64,|,handball,0.61


### Some more recent terms are not part of the vocabulary

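`Corona` & `coronavirus` have been around for a while, but `COVID` does not appear to have been in the training data: it shows up as "similar" to long and odd strings of text. Note that the in-vocabulary check below still prints `True` for `covid`. This may be because fastText can build a vector for an unseen word from its character n-grams, so a membership test on the model can succeed for words that were never seen in training (worth confirming against `ft_de.vocab`).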

In [226]:
# Temporarily narrow the column width so the very long "similar" tokens for
# `covid` are truncated. `option_context` restores the previous setting when
# the block exits (even if `display` raises), instead of resetting to a
# hard-coded value.
with pd.option_context('display.max_colwidth', 50):
    display(get_df_for_most_similar(ft_de, ['corona', 'coronavirus', 'covid'], print_oov_check=True))

True -> corona in vocabulary?
True -> coronavirus in vocabulary?
True -> covid in vocabulary?


Unnamed: 0,'corona' similar_words,'corona' similarity_score,Unnamed: 3,'coronavirus' similar_words,'coronavirus' similarity_score,Unnamed: 6,'covid' similar_words,'covid' similarity_score
0,coronae,0.699097,|,Coronavirus,0.755018,|,sehensw,0.476563
1,coronata,0.67023,|,Coronaviren,0.701213,|,2270800024h,0.452691
2,corone,0.651416,|,Alphavirus,0.689634,|,20171Sa2So3Mo4Di5Mi6Do7Fr8Sa9So10Mo11Di12Mi13D...,0.452116
3,coronaria,0.635078,|,Coronaviridae,0.685821,|,STANDARDJUNKERSKALDEWEIKERAMAGKEUCOLAUFENOVENT...,0.444858
4,coronation,0.633618,|,Bocavirus,0.657691,|,20171Di2Mi3Do4Fr5Sa6So7Mo8Di9Mi10Do11Fr12Sa13S...,0.442462
5,coronatus,0.610565,|,Parainfluenzavirus,0.655467,|,2RgYT2iGt9hGht0RHc3xiDib3o8,0.441883
6,coronare,0.606693,|,Bornavirus,0.649085,|,e34de00e-f716-4448-abfb-5d911c3859c6,0.439386
7,coronarium,0.585234,|,Cytomegalovirus,0.648879,|,ARIArieteArrowheadAthenaBeruBMWBoschBuzettiCha...,0.429309
8,bronce,0.541351,|,Arenaviren,0.647243,|,ROMEO5ARO8AUDI9AUSTIN12AUTOBIANCHI18BMW22CADIL...,0.427667
9,coronary,0.540825,|,Papillomavirus,0.642728,|,20181Do2Fr3Sa4So5Mo6Di7Mi8Do9Fr10Sa11So12Mo13D...,0.427303


### Emoji

In [227]:
# A flag emoji on its own: see which neighbours (if any) the model returns.
ft_de.most_similar("🇩🇪")

[('Methodenauswahl', 0.0),
 ('US-Notenbankchefin', 0.0),
 ('Johann-Friedrich-Danneil-Museum', 0.0),
 ('Klaus-Harms-Schule', 0.0),
 ('Issole', 0.0),
 ('Zwischenschalten', 0.0),
 ('entkoppelter', 0.0),
 ('Emmily', 0.0),
 ('Zeitschriftenausgaben', 0.0),
 ('Junqueiro', 0.0)]

In [195]:
# A non-flag emoji on its own, for comparison with the flag emoji.
ft_de.most_similar("🐸")

[('Methodenauswahl', 0.0),
 ('US-Notenbankchefin', 0.0),
 ('Johann-Friedrich-Danneil-Museum', 0.0),
 ('Klaus-Harms-Schule', 0.0),
 ('Issole', 0.0),
 ('Zwischenschalten', 0.0),
 ('entkoppelter', 0.0),
 ('Emmily', 0.0),
 ('Zeitschriftenausgaben', 0.0),
 ('Junqueiro', 0.0)]

In [228]:
# Emoji embedded between letters, the same shape as some post texts
# (e.g. 'ich🍺iel' in the sample further down).
ft_de.most_similar("ich🇩🇪iel")

[('081', 0.5792489051818848),
 ('041', 0.5722817182540894),
 ('071', 0.5150038003921509),
 ('079', 0.5042275190353394),
 ('062', 0.4915590286254883),
 ('Schnebli', 0.48472845554351807),
 ('044', 0.4835275411605835),
 ('055', 0.4824221432209015),
 ('CH-1712', 0.4817127585411072),
 ('Schäfli', 0.48153847455978394)]

In [92]:
# Same letters around a different emoji: do the neighbours change?
ft_de.most_similar("ich🐸iel")

[('HelmRitterhelm', 0.5055542588233948),
 ('Halbschalenhelm', 0.4618237316608429),
 ('Dachhelm', 0.45038658380508423),
 ('Sporthelm', 0.44350433349609375),
 ('KaplaKaspar', 0.44235432147979736),
 ('AccessWissenstransferKarl', 0.4404061436653137),
 ('Kammhelm', 0.44026464223861694),
 ('mattReflektor', 0.43612435460090637),
 ('Lampin', 0.4357553720474243),
 ('volkmar', 0.4348251223564148)]

In [68]:
# The trailing letters alone, for comparison with the emoji-containing strings.
ft_de.most_similar("iel")

[('v', 0.4977104663848877),
 ('rn', 0.48465555906295776),
 ('nd', 0.4633084535598755),
 ('wn', 0.45827677845954895),
 ('iele', 0.45819616317749023),
 ('ar', 0.45698654651641846),
 ('ebt', 0.455821692943573),
 ('z', 0.4551096558570862),
 ('eiß', 0.4541691541671753),
 ('ery', 0.45227232575416565)]

In [69]:
# Same letters in upper case.
ft_de.most_similar("IEL")

[('OML', 0.5013827085494995),
 ('JEL', 0.4898484945297241),
 ('CIAA', 0.479758083820343),
 ('NHE', 0.4766197204589844),
 ('IEW', 0.46671175956726074),
 ('IDI', 0.4656735956668854),
 ('ACPA', 0.4605969786643982),
 ('CIFE', 0.45994144678115845),
 ('CEPE', 0.4564463794231415),
 ('IAL', 0.4564042091369629)]

In [71]:
# Analogy query with English words: king - man + woman.
ft_de.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.5639194846153259),
 ('women', 0.5232468843460083),
 ('lady', 0.5213222503662109),
 ('kings', 0.5168982744216919),
 ('clad', 0.5126449465751648),
 ('wives', 0.510648250579834),
 ('eyed', 0.5080440044403076),
 ('fiery', 0.4963497221469879),
 ('crown', 0.4944778084754944),
 ('masterpiece', 0.49412137269973755)]

In [72]:
# The same analogy with German words: König - Mann + Frau.
ft_de.most_similar(positive=['Frau', 'König'], negative=['Mann'])

[('Königin', 0.6669619679450989),
 ('Königs', 0.606950044631958),
 ('Elisabeth', 0.5830625295639038),
 ('Prinzessin', 0.5826852321624756),
 ('Beatrix', 0.5708266496658325),
 ('Katharina', 0.5664329528808594),
 ('Margarete', 0.5610044002532959),
 ('Ludwig', 0.557565450668335),
 ('Heinrich', 0.5511189699172974),
 ('Kaiserin', 0.548440158367157)]

# Prepare text for embedding conversion

> If you import the FAST_VERSION variable as follows, you can ensure that the compilation of the cython routines worked correctly:
>
> `print(FAST_VERSION) == 1 -> The fast version works`



In [None]:
# TODO: placeholder left by the author. Kept as a comment because a bare
# `TODO` expression raises NameError and breaks "Restart Kernel -> Run All".

In [234]:
from fse import SplitCIndexedList
from fse.models.average import FAST_VERSION, MAX_WORDS_IN_BATCH

# Print the batch-size constant and the build flag, one per line.
# FAST_VERSION == 1 -> the fast (compiled cython) version works.
print(MAX_WORDS_IN_BATCH, FAST_VERSION, sep="\n")

10000
1


In [233]:
# uSIF sentence-embedding model on top of the German fastText vectors.
# `lang_freq` selects the language whose word frequencies fse induces into the
# vocabulary for word weighting. The vectors and posts are German, so it must
# be "de" -- "en" would apply English frequencies to German words.
# NOTE(review): `length=11` is passed as the expected average sentence length;
# confirm it suits post titles/text or let fse estimate it from the data.
fse_usif = uSIF(ft_de, workers=4, length=11, lang_freq="de")

## Convert the sentences to the fse expected format

In [248]:
# df_posts.head()

By default, `SplitCIndexedList` only splits by spaces. Consider a better tokenizer, like the one used in the STS benchmarks: https://github.com/oborchers/Fast_Sentence_Embeddings/blob/master/notebooks/STS-Benchmarks.ipynb

In [241]:
# Prototype on a small sample of posts.
n_sample = 30
sample_post_ids = df_posts['post_id'].head(n_sample)
sample_texts = df_posts['text'].head(n_sample).values

# Map each post_id to its position within the sample.
d_post_id_to_ix = dict(zip(sample_post_ids, range(len(sample_post_ids))))

# Wrap the raw texts for fse; `custom_index` supplies the integer index that
# is paired with each post's tokens.
indexed_posts = SplitCIndexedList(
    sample_texts,
    custom_index=list(map(d_post_id_to_ix.__getitem__, sample_post_ids)),
)

In [242]:
# Inspect one converted item: a (tokens, index) tuple.
indexed_posts[1]

(['ich🍺iel'], 1)

In [243]:
# Another short item that contains emoji and stays a single token.
indexed_posts[3]

(['ich\U0001f971💤iel'], 3)

In [247]:
# Second-to-last item of the sample: a longer text split into several tokens.
indexed_posts[-2]

(['VOR',
  'EU-TREFFEN',
  'MIT',
  'ERDOGAN',
  'Manfred',
  'Weber',
  'nennt',
  'türkischen',
  'EU-Beitritt',
  '„Illusion'],
 28)

In [246]:
# Raw (unsplit) text of the same item, for comparison with its tokens.
indexed_posts.items[-2]

'VOR EU-TREFFEN MIT ERDOGAN Manfred Weber nennt türkischen EU-Beitritt „Illusion '

## Train on a sample of posts

In [249]:
%%time

# Fit the uSIF model on the indexed sample.
# NOTE(review): the returned tuple looks like (sentences, words) processed --
# confirm against the fse documentation.
fse_usif.train(indexed_posts)

CPU times: user 8.64 s, sys: 324 ms, total: 8.97 s
Wall time: 8.44 s


(30, 410)

In [251]:
%%time

# Compute one sentence embedding per indexed post with the trained model.
vectorized_posts = fse_usif.infer(indexed_posts)

CPU times: user 33.3 ms, sys: 560 µs, total: 33.8 ms
Wall time: 31.6 ms


In [252]:
# Expect one row per sampled post and one column per embedding dimension.
vectorized_posts.shape

(30, 300)