# Downloading and Building fastText

In [1]:
# Downloading the fastText v0.9.2 source archive from GitHub.
# -nc (no-clobber) skips the download when v0.9.2.zip is already present, so
# re-running this cell does not leave a second copy named v0.9.2.zip.1.
!wget -nc https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
# Unpacking the archive into ./fastText-0.9.2/.
# -n never overwrites files that were already extracted, which avoids unzip's
# interactive "replace?" prompt on a re-run; -q drops the long per-file listing.
!unzip -n -q v0.9.2.zip

--2024-10-09 10:16:17--  https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
Resolving github.com (github.com)... 4.237.22.38
Connecting to github.com (github.com)|4.237.22.38|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/refs/tags/v0.9.2 [following]
--2024-10-09 10:16:18--  https://codeload.github.com/facebookresearch/fastText/zip/refs/tags/v0.9.2
Resolving codeload.github.com (codeload.github.com)... 4.237.22.35
Connecting to codeload.github.com (codeload.github.com)|4.237.22.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: 'v0.9.2.zip'

v0.9.2.zip              [       <=>          ]   4.17M  2.91MB/s    in 1.4s    

2024-10-09 10:16:19 (2.91 MB/s) - 'v0.9.2.zip' saved [4369852]

Archive:  v0.9.2.zip
5b5943c118b0ec5fb9cd8d20587de2b2d3966dfe
   creating: fastText-0.9.2/
   creating: fastText-0.9.2/.circleci/
  infla

In [2]:
# Moving to the fastText directory and building it.
# %cd changes the kernel's working directory, so the following cells run from
# inside fastText-0.9.2 until a later `%cd ..` returns to the project folder.
%cd fastText-0.9.2
# Compiling the sources with the project's Makefile; the resulting `fasttext`
# binary is invoked from this directory in the next cell.
!make

/Users/ruslankhissamiyev/Desktop/ADS Final Presentation/ADSFinal/fastText-0.9.2
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/args.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/autotune.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/matrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/dictionary.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/loss.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/productquantizer.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/densematrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/quantmatrix.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/vector.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c src/model.cc
c++ -pthread -std=c++11 -march=native -O3 -funroll-loops -DNDEBUG -c 

In [3]:
# Testing if fastText is working: running the freshly built binary with no
# arguments prints its usage text and the list of supported commands.
# The relative path ./fasttext relies on the working directory still being
# fastText-0.9.2 (set by the %cd in the build cell).
!./fasttext

usage: fasttext <command> <args>

The commands supported by fasttext are:

  supervised              train a supervised classifier
  quantize                quantize a model to reduce the memory usage
  test                    evaluate a supervised classifier
  test-label              print labels with precision and recall scores
  predict                 predict most likely labels
  predict-prob            predict most likely labels with probabilities
  skipgram                train a skipgram model
  cbow                    train a cbow model
  print-word-vectors      print word vectors given a trained model
  print-sentence-vectors  print sentence vectors given a trained model
  print-ngrams            print ngrams given a trained model and word
  nn                      query for nearest neighbors
  analogies               query for analogies
  dump                    dump arguments,dictionary,input/output vectors



# Downloading and Preparing the Data

In [4]:
# Go back to the parent directory (the build cell moved into fastText-0.9.2)
%cd ..

# Downloading the sentence export from Tatoeba.
# The URL uses https directly instead of relying on wget's HSTS upgrade of a
# plain-http request; -nc skips the 186M download when the archive is already
# present, so the cell can be re-run cheaply.
!wget -nc https://downloads.tatoeba.org/exports/sentences.tar.bz2
# Decompressing and extracting in one step (j = bzip2). Unlike a separate
# bunzip2 call this keeps sentences.tar.bz2 on disk and does not fail when a
# previous run already left files behind; it produces sentences.csv.
!tar xvjf sentences.tar.bz2

/Users/ruslankhissamiyev/Desktop/ADS Final Presentation/ADSFinal
URL transformed to HTTPS due to an HSTS policy
--2024-10-09 10:18:21--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 195102325 (186M) [application/octet-stream]
Saving to: 'sentences.tar.bz2'


2024-10-09 10:19:02 (4.86 MB/s) - 'sentences.tar.bz2' saved [195102325/195102325]

x sentences.csv


In [5]:
# Preparing the data for fastText.
# sentences.csv is read as tab-separated; field 2 (the language code, e.g.
# "kaz") becomes the label and field 3 the text, giving one line per sentence
# of the form "__label__<field2> <field3>".
# The lines are then shuffled and written to all.txt. shuf is called without a
# fixed random source, so the order (and any split taken from it) differs
# between runs.
!awk -F"\t" '{print "__label__"$2" "$3}' sentences.csv | shuf > all.txt

In [6]:
# Splitting the shuffled data into validation and training sets.
# The first 10,000 lines of all.txt become the validation set.
!head -n 10000 all.txt > valid.txt
# tail -n +10001 starts output at line 10001, so every remaining line goes to
# the training set and no line position is shared with valid.txt.
!tail -n +10001 all.txt > train.txt

# Training the Initial Model

In [7]:
# Training the baseline supervised classifier on train.txt.
# -output langdetect is the model file prefix (the test cells load
# langdetect.bin); -dim 16 sets the vector dimension to 16. No other options
# are passed on this command line.
!fastText-0.9.2/fasttext supervised -input train.txt -output langdetect -dim 16

Read 100M words
Number of words:  4556997
Number of labels: 420
Progress: 100.0% words/sec/thread:  253825 lr:  0.000000 avg.loss:  0.138132 ETA:   0h 0m 0sm 3ss


In [8]:
# Testing the baseline model on the 10,000-line validation set.
# The output lists N (number of evaluated examples), P@1 (precision at 1) and
# R@1 (recall at 1).
!fastText-0.9.2/fasttext test langdetect.bin valid.txt

N	10000
P@1	0.953
R@1	0.953


# Adding Kazakh Language Support

In [9]:
# Checking the number of Kazakh sentences in the raw export.
# awk prints every row of sentences.csv whose second tab-separated field is
# exactly "kaz", and wc -l counts those rows.
!awk -F"\t" '$2 == "kaz"' sentences.csv | wc -l

    4335


In [10]:
# Extracting Kazakh sentences in fastText format ("__label__kaz <text>")
!awk -F"\t" '$2 == "kaz" {print "__label__"$2" "$3}' sentences.csv > kazakh_sentences.txt

# Combining the base dataset with the Kazakh sentences and shuffling the result.
# NOTE: all.txt was built from every row of sentences.csv, so it already holds
# these Kazakh lines once; adding them here gives each Kazakh sentence a second
# copy in the combined dataset.
# The two files are streamed into shuf rather than appended to all.txt with
# `>>`: all.txt stays untouched, so re-running this cell always produces the
# same dataset instead of adding yet another copy of the Kazakh lines.
!cat all.txt kazakh_sentences.txt | shuf > all_shuffled.txt

In [11]:
# Use the first 10,000 lines of the shuffled dataset for validation.
# This overwrites the valid.txt written by the earlier split of all.txt.
!head -n 10000 all_shuffled.txt > valid.txt

# Use the rest (line 10001 onward) for training; train.txt is likewise
# overwritten.
!tail -n +10001 all_shuffled.txt > train.txt

# Retraining the Model with Kazakh

In [12]:
# Retraining the model on the re-split train.txt that includes the extra
# Kazakh sentences. The options match the initial training run, and the same
# -output prefix is reused, so the earlier langdetect model files are replaced.
!fastText-0.9.2/fasttext supervised -input train.txt -output langdetect -dim 16

Read 100M words
Number of words:  4556997
Number of labels: 420
Progress: 100.0% words/sec/thread:  191641 lr:  0.000000 avg.loss:  0.167677 ETA:   0h 0m 0s  0h 1m29s lr:  0.037260 avg.loss:  0.233439 ETA:   0h 1m21s 63.6% words/sec/thread:  190871 lr:  0.036431 avg.loss:  0.231452 ETA:   0h 1m20s 69.7% words/sec/thread:  190304 lr:  0.030346 avg.loss:  0.218402 ETA:   0h 1m 6s  0h 0m53s avg.loss:  0.203070 ETA:   0h 0m49sm49s  0h 0m47s  0h 0m39s0h 0m25ss 92.3% words/sec/thread:  191393 lr:  0.007660 avg.loss:  0.177667 ETA:   0h 0m16ss0h 0m 5s


In [13]:
# Testing the retrained model on the validation set (the valid.txt taken from
# all_shuffled.txt); reports N, P@1 and R@1.
!fastText-0.9.2/fasttext test langdetect.bin valid.txt

N	10000
P@1	0.954
R@1	0.954


In [14]:
# Preparing a Kazakh-only test set: every "kaz" row of sentences.csv converted
# to fastText format and shuffled.
# NOTE(review): these rows come from the same sentences.csv that the training
# data was built from, and only 10,000 lines overall were held out, so most of
# these sentences were seen during training. The score on this file shows fit
# on known sentences rather than generalization to unseen Kazakh text.
!awk -F"\t" '$2 == "kaz" {print "__label__"$2" "$3}' sentences.csv | shuf > kazakh_test.txt

In [15]:
# Testing the model on the Kazakh-only file; N in the output is the number of
# Kazakh sentences evaluated.
!fastText-0.9.2/fasttext test langdetect.bin kazakh_test.txt

N	4335
P@1	1
R@1	1


# Improving the Model

In [17]:
# Retraining the model with all improvements applied
import subprocess

# fastText command line with all of the improvements applied.
# It is passed to subprocess as an argument list, so no shell quoting is involved.
command = [
    './fastText-0.9.2/fasttext', 'supervised',
    '-input', 'train.txt',        # Training data
    '-output', 'langdetect',      # Output model name prefix
    '-dim', '16',                 # Set vector dimension to 16
    '-epoch', '15',               # Increase the number of epochs to 15
    '-lr', '1.0',                 # Increase the learning rate to 1.0
    '-loss', 'hs',                # Use hierarchical softmax loss function
    '-minn', '2', '-maxn', '4',   # Use character n-grams from 2 to 4 characters (subword features)
    '-wordNgrams', '2'            # Include word bigrams (word n-grams of length 2)
]

# Run the training. check=True raises subprocess.CalledProcessError when
# fasttext exits with a non-zero status, so a failed run cannot pass silently
# and leave the previous langdetect.bin in place for the evaluation that follows.
subprocess.run(command, check=True)

Read 100M words
Number of words:  4556997
Number of labels: 420
Progress: 100.0% words/sec/thread: 1124473 lr:  0.000000 avg.loss:  0.054848 ETA:   0h 0m 0s  3.3% words/sec/thread: 1115505 lr:  0.967184 avg.loss:  0.222146 ETA:   0h 1m49s 19.8% words/sec/thread: 1126442 lr:  0.801674 avg.loss:  0.124475 ETA:   0h 1m29s 65.4% words/sec/thread: 1125356 lr:  0.346496 avg.loss:  0.069721 ETA:   0h 0m38s 86.1% words/sec/thread: 1124540 lr:  0.138686 avg.loss:  0.059719 ETA:   0h 0m15s


CompletedProcess(args=['./fastText-0.9.2/fasttext', 'supervised', '-input', 'train.txt', '-output', 'langdetect', '-dim', '16', '-epoch', '15', '-lr', '1.0', '-loss', 'hs', '-minn', '2', '-maxn', '4', '-wordNgrams', '2'], returncode=0)

In [18]:
# Testing the improved model on the validation set
# (first N / P@1 / R@1 group in the output).
!fastText-0.9.2/fasttext test langdetect.bin valid.txt

# Testing the improved model on Kazakh sentences
# (second N / P@1 / R@1 group in the output).
!fastText-0.9.2/fasttext test langdetect.bin kazakh_test.txt

N	10000
P@1	0.976
R@1	0.976
N	4335
P@1	1
R@1	1
