This notebook downloads the English FastText model from **Facebook Research** and reduces its dimensionality from 300 to 32.

*IMPORTANT:* run the following commands to create the model directory `{PROJECT_ROOT}/ext_models/facebookresearch/fastText` and make it writable:
```shell
sudo mkdir -p {PROJECT_ROOT}/ext_models/facebookresearch/fastText
sudo chmod -R 777 {PROJECT_ROOT}/ext_models/facebookresearch
```

In [None]:
!pip install fasttext
!pip install ipywidgets

In [1]:
import fasttext.util
import sys
from ipywidgets import IntProgress, HTML, VBox
from IPython.display import display

# Build the download-progress UI: a status caption stacked above a 0-100 bar
progress_bar = IntProgress(max=100, min=0)
progress_label = HTML(value='Downloading model...')
progress_box = VBox(children=[progress_label, progress_bar])
display(progress_box)

# Custom progress handler
class JupyterProgress:
    """Minimal stdout stand-in that forwards progress readings to the widgets.

    Only ``write`` and ``flush`` are provided. Text written to an instance is
    never echoed anywhere; fragments that contain a percentage update the
    module-level ``progress_bar`` and ``progress_label`` widgets instead.
    """

    def write(self, message):
        """Update the widgets when ``message`` holds a reading such as "(12.34%)".

        Messages lacking either "(" or "%" are dropped, as are messages whose
        text between the first "(" and the following "%" is not a number.
        """
        # Guard: without both markers there is no percentage to extract.
        if '(' not in message or '%' not in message:
            return

        try:
            after_paren = message.split('(')[1]
            number_text = after_paren.split('%')[0]
            percentage = float(number_text)
            progress_bar.value = percentage
            progress_label.value = f'Downloading model... {percentage:.2f}%'
        except ValueError:
            # Not a numeric reading: leave the widgets as they are.
            return

    def flush(self):
        """Do nothing; present because writers to stdout call flush()."""
        return None

# Temporarily route stdout through the progress handler. The stream that was
# active before the redirect is saved and put back afterwards: inside a Jupyter
# kernel sys.stdout is the kernel's own stream object rather than
# sys.__stdout__, so restoring sys.__stdout__ would send the output of every
# later print() away from the notebook.
previous_stdout = sys.stdout
sys.stdout = JupyterProgress()  # Redirect stdout to the progress handler
try:
    fasttext.util.download_model('en', if_exists='ignore')  # Download English model
finally:
    # Restore stdout even if the download raises, so the kernel is never left muted
    sys.stdout = previous_stdout

# Completion message
progress_label.value = 'Download completed!'

VBox(children=(HTML(value='Downloading model...'), IntProgress(value=0)))

In [2]:
from os import path
import fasttext

from utils.lm_components import DSMODELS_PREFIX, FQFP_MODEL_FASTTEXT_D32, MODEL_FASTTEXT_D32, MODEL_FASTTEXT_D300, FAST_TEXT_MODEL

2025-02-06 12:00:30,092 - tensorcraft - INFO - XLA Device Not Supported: No module named 'torch_xla'
2025-02-06 12:00:30,104 - tensorcraft - INFO - Pytorch version=2.6.0 preferred device=mps build with MPS support=True


In [3]:
# Load the full-size model and report its vector width before reduction
ft = fasttext.load_model(MODEL_FASTTEXT_D300)
dimension_before = ft.get_dimension()
print(f'original model dimension={dimension_before}')

# reduce_model is applied to `ft` itself; the same object is queried again afterwards
fasttext.util.reduce_model(ft, 32)
dimension_after = ft.get_dimension()
print(f'reduced model dimension={dimension_after}')

In [6]:
# Persist the model held in `ft` (reduced to 32 dimensions by the preceding cell)
# to the path given by FQFP_MODEL_FASTTEXT_D32
ft.save_model(FQFP_MODEL_FASTTEXT_D32)

In [7]:
# this cell moves the `MODEL_FASTTEXT_D300` model file from the current folder to `{PROJECT_ROOT}/ext_models/facebookresearch/fastText`
import shutil

target_location = path.join(DSMODELS_PREFIX, FAST_TEXT_MODEL, MODEL_FASTTEXT_D300)
try:
    destination = shutil.move(MODEL_FASTTEXT_D300, target_location)
    print(f'File {MODEL_FASTTEXT_D300} successfully moved to {destination}')
except FileNotFoundError:
    # shutil.move raises FileNotFoundError both when the source is missing and
    # when the destination directory does not exist; check which case applies
    # so the message points at the real problem.
    if path.exists(MODEL_FASTTEXT_D300):
        print(f'Target directory {path.dirname(target_location)} not found! Create it before running this cell.')
    else:
        print(f'Source file {MODEL_FASTTEXT_D300} not found!')
except PermissionError:
    print(f'Permission denied. Ensure you have write access to {target_location}.')

## Now that the FastText model has been written, load it via standard tools

In [2]:
from xplainproj.classifier.textcode_features import compute_ft_embeddings
from utils.lm_components import load_ft_model, FQFP_MODEL_FASTTEXT_D32

# Load the model stored at FQFP_MODEL_FASTTEXT_D32 through the project's loader helper
ft_model = load_ft_model(FQFP_MODEL_FASTTEXT_D32)

2025-02-06 12:12:05,346 - tensorcraft - INFO - XLA Device Not Supported: No module named 'torch_xla'
2025-02-06 12:12:05,359 - tensorcraft - INFO - Pytorch version=2.6.0 preferred device=mps build with MPS support=True


In [3]:
# dimensionality of the model output; shown as the cell result because it is the
# last expression in the cell
ft_model.get_dimension()

32

In [4]:
# Sample input: a Windows-style executable path (backslashes are escaped in the literal)
text = 'C:\\Program Files\\NewApp\\app.exe'
# compute_ft_embeddings is a project helper; presumably it returns one vector for
# the whole string, sized to the model dimension (TODO confirm against its source)
embedding = compute_ft_embeddings(text, ft_model)
print('Generated Embedding:', embedding)


Generated Embedding: [-0.12611352 -0.10338781  0.21851552 -0.09466213 -0.00374112  0.03546005
  0.12628756  0.02672722  0.15132128 -0.07531264 -0.01375438  0.00943745
 -0.02829693  0.15084867 -0.09292743  0.02572016  0.01687725 -0.03520642
  0.00847062  0.01207607 -0.01216534  0.12901102 -0.00711891 -0.0695353
 -0.03802507  0.06194637 -0.06529724 -0.02901707  0.05913709 -0.00373161
  0.09857754 -0.0314456 ]


In [5]:
# Bare expression: displays the repr of the loaded model object
ft_model

<fasttext.FastText._FastText at 0x12ac17aa0>