In [3]:
import fasttext
import pandas as pd
import numpy as np
from gensim.utils import simple_preprocess
import re, string

## Creating corpus to be used for pre-training FastText model
The corpus contains 100,000 messages randomly sampled from the GitHub Commit Messages Dataset: https://www.kaggle.com/datasets/dhruvildave/github-commit-messages-dataset

In [7]:
# Load the raw commit dataset and store the 'message' column with pandas'
# nullable string dtype; every other column keeps the dtype read_csv inferred.
messages = pd.read_csv('commits_dataset.csv').astype({'message': pd.StringDtype()})

In [8]:
# `messages` is rebound here from the loaded DataFrame to its 'message' column.
# The isinstance guard keeps the cell re-runnable: on a second run `messages`
# is already a Series, and indexing it with 'message' would raise a KeyError.
if isinstance(messages, pd.DataFrame):
    messages = messages['message']

# Drop rows without a commit message, then preview the first few entries.
messages = messages.dropna()
messages.head()

0    DOC: add example for plotting asymmetrical err...
1             Add keyword sort to pivot_table (#40954)
2     ENH: `Styler.highlight_quantile` method (#40926)
3    ENH: add `decimal` and `thousands` args to `St...
4    [ArrowStringArray] Use utf8_upper and utf8_low...
Name: message, dtype: string

In [9]:
# Summarise the cleaned Series: entry count, non-null count, dtype and memory use.
messages.info()

<class 'pandas.core.series.Series'>
Int64Index: 4335970 entries, 0 to 4336298
Series name: message
Non-Null Count    Dtype 
--------------    ----- 
4335970 non-null  string
dtypes: string(1)
memory usage: 66.2 MB


In [10]:
# Draw the 100,000-message sample used for the corpus. The fixed random_state
# makes the draw reproducible across kernel restarts; without it every run
# would produce a different corpus.
message_test = messages.sample(n=100000, random_state=42).to_frame()

In [11]:
# Patterns are compiled once when the cell runs instead of on every call.
_TAG_RE = re.compile(r'<.*?>')                                    # <...> spans on one line
_ASCII_PUNCT_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')                             # leftover (non-ASCII) symbols
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise one commit message into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. delete ``<...>`` spans (tag-like markup);
      3. replace every ASCII punctuation character with a space;
      4. delete any remaining character that is neither a word character
         nor whitespace (e.g. non-ASCII punctuation);
      5. replace digits with spaces;
      6. collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    text : str
        Raw commit message.

    Returns
    -------
    str
        The cleaned message; an empty string when nothing but punctuation,
        digits or whitespace was present.
    """
    text = text.lower()
    text = _TAG_RE.sub('', text)
    # Punctuation becomes a space (not '') so that "foo.bar" splits into two words.
    text = _ASCII_PUNCT_RE.sub(' ', text)
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # Trimming last matters: digit removal can leave a space at either end.
    return _WHITESPACE_RE.sub(' ', text).strip()

In [12]:
# Clean every sampled message; the function is passed directly, no wrapper needed.
message_test['message'] = message_test['message'].apply(preprocess)

In [13]:
# Tokenization: replace each cleaned string in the 'message' column with the
# token list returned by gensim's simple_preprocess (called with its defaults).
message_test['message'] = message_test['message'].apply(simple_preprocess)

In [14]:
# Summarise the tokenised frame; the 'message' column is now reported as dtype
# `object` because each cell holds a token list rather than a string.
message_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 1566412 to 402158
Data columns (total 1 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   message  100000 non-null  object
dtypes: object(1)
memory usage: 1.5+ MB


In [21]:
# Write the corpus as plain text: one commit message per line, tokens separated
# by single spaces. This is the file the fastText training cell reads.
#
# Mode 'x' (exclusive creation) refuses to touch an existing file, so re-running
# the notebook never silently replaces the corpus an existing model was trained
# on. Delete the file first to regenerate it from the current sample.
try:
    corpus_file = open('commit_corpus_100k.txt', 'x', encoding='UTF-8')
except FileExistsError:
    print('commit_corpus_100k.txt already exists - keeping the existing corpus')
else:
    # The with-block closes the file; no explicit close() is needed.
    with corpus_file:
        corpus_file.writelines(
            ' '.join(tokens) + '\n' for tokens in message_test['message']
        )

## Creating pretrained embeddings from a corpus of 100,000 pre-processed commit messages

In [4]:
# Train 100-dimensional CBOW embeddings on the corpus file, using 6 threads.
# NOTE(review): wordNgrams appears to matter only for supervised training -
# confirm against the fastText docs whether it has any effect here.
# NOTE(review): multi-threaded training is presumably not bit-for-bit
# reproducible between runs - verify if exact reproducibility is required.
model = fasttext.train_unsupervised(
    'commit_corpus_100k.txt',
    model='cbow',
    dim=100,
    wordNgrams=2,
    thread=6,
)

Read 5M words
Number of words:  28309
Number of labels: 0
Progress: 100.0% words/sec/thread:   94936 lr:  0.000000 avg.loss:  1.713304 ETA:   0h 0m 0s


## Converting the model to a .vec file for use in classification

In [6]:
# Persist the trained model to disk as 'fasttext_embeds.bin'; the conversion
# cell below loads it back from this file.
model.save_model('fasttext_embeds.bin')

In [7]:
# Convert the saved binary model to a plain-text .vec file.
# `fasttext` is already imported at the top of the notebook, so the loader is
# reached through the module instead of a second import inside this cell.
f = fasttext.load_model('fasttext_embeds.bin')

# Every word in the model's vocabulary.
words = f.get_words()

# UTF-8 is requested explicitly. With the platform default encoding a non-ASCII
# word can fail to encode; the previous bare `except: pass` then dropped that
# word silently, leaving fewer vector lines than the header announces. Any
# write error now surfaces instead of producing an inconsistent file.
with open('fasttext_embeds.vec', 'w', encoding='utf-8') as file_out:

    # The first line must contain the total number of words and the vector dimension.
    file_out.write(str(len(words)) + " " + str(f.get_dimension()) + "\n")

    # One line per word: the word followed by its space-separated vector components.
    for w in words:
        components = ' '.join(str(vi) for vi in f.get_word_vector(w))
        file_out.write(w + ' ' + components + '\n')

