In [1]:
import sys
sys.path.append('/home/trukhinmaksim/src')

In [2]:
import numpy as np

In [3]:
from src.utils.DatabaseConnect import DatabaseConnect

# single machine setup (mongo is running here localy)
# "ip a" for ip address
MY_DATABASE_LINK = 'mongodb://172.26.234.237:27020/' #'mongodb://192.168.100.57:27020/'
# multiple mechine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

DatabaseConnect.DB_LINK = MY_DATABASE_LINK

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['172.26.234.237:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
from src.utils.CacheAdapter import JSONAdapter, JSONMultiFileAdapter, EXP_END_OF_DATA
from src.utils.DatasetManager import ProjectsDatasetManager
from src.utils.validators import projectDataIsSufficient

In [10]:
def feedTextData(manager, batchSize = 1):
    # feeds text data by batches
    tempStorage = []

    i = 0
    while True:
        try:
            while len(tempStorage) >= batchSize:
                if batchSize > 1:
                    yield tempStorage[:batchSize]
                else:
                    yield tempStorage[:batchSize][0] # if I'm requesting only one item per time (training one by one) -> just yield it
                tempStorage = tempStorage[batchSize:]

            manager.fromDB()
            data = flatternData(manager.getTextOnly())
            tempStorage.extend(data)

            i += 1

        except EXP_END_OF_DATA:
            break

    yield tempStorage


In [5]:
def flatternData(data : dict[str, list]) -> np.array(dict):
    # takes in data in form of dict, where each key is a user id and each value is a list of that user's projects
    # returns just flat list of these projects 
    result = []

    for projectsArray in data.values():
        for project in projectsArray:
            result.append(project)

    return result

In [6]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [7]:
CACHE_FILE_NAME = "cache__31-03-2025__(sufficient)_{index}.json"

In [8]:
# using adapter to load data from the cache files

# TODO: place implementation of the 'Corpus' class into a separate file
class Corpus:
    # base class for every data corpus, that will be used by model
    def __init__(self):
        pass
    def __iter__(self):
        pass
    def __getitem__(self, index : int):
        pass

# TODO: change to use adapter, that reads data from cache continously without need for changing collection name maually
class CacheCorpus(Corpus):
    def __init__(self, manager, cacheFileNameTemplate = CACHE_FILE_NAME):
        self.cacheFileNameTemplate = cacheFileNameTemplate
        self.manager = manager # manager is needed not only for interaction with adapter, but also if I want to use unpreprocessed dataset and preprocess it on the way
        
    def __iter__(self):
        # will feed preprocessed projects data as TaggedDocument instances one by one
        cacheFileName = self.cacheFileNameTemplate
        tempStorage = [] # temporary storage for data, that was read from files

        i = 0
        while True:
            try:
                while len(tempStorage) >= 1:
                    doc = tempStorage[0]
                    yield TaggedDocument(words = doc["tokens"], tags = doc["tags"])
                    tempStorage = tempStorage[1:]

                self.manager.cacheAdapter.collectionName = cacheFileName.format(index = i)
                data = flatternData(self.manager.fromCache())
                tempStorage.extend(data)

                i += 1

            except EXP_END_OF_DATA:
            # no data left
                break


In [65]:
class EXP_FEEDER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.corpus' object must be an iterable structure, inherited from 'Corpus' class!")

class EXP_MANAGER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.manager' object must be a DatasetManager instance!")


class Model(gensim.models.doc2vec.Doc2Vec):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.corpus = None # corpus is an iterator(iterable class object), that will be used in "train" method of Doc2Vec model for data extraction

    def train(self):
        # will build vocabulary and train the model on corpus (corpus will be fed by corpus)
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        if not isinstance(self.corpus, Corpus): raise EXP_FEEDER_IS_NONE

        self.build_vocab(self.corpus)

        super().train(
            self.corpus, 
            total_examples = self.corpus_count, 
            epochs = self.epochs
        )

    def assess(self, sampleNum = 5):
        # simple test of model performance
        # take multiple documents from the training corpus and tries to find simillar in the dataset

        from random import sample
        from collections import defaultdict
        from numpy import mean

        totalDocuments = self.corpus_count
        indexes = sample(range(totalDocuments), sampleNum)

        i = 0
        docs = []

        for doc in self.corpus:
            if i >= totalDocuments: break
            if i in indexes:
                vector = self.infer_vector(doc.words)
                sims = defaultdict(lambda: 0, self.dv.most_similar([vector], topn = totalDocuments))

                print(f"Assessing document {i} ({doc.tags}). Similarities by tags:")

                for tag in doc.tags:
                    print(f"  {tag} : {sims[tag]}")

                print(f"\n  Average similarity value: {mean([sims[tag] for tag in doc.tags])}\n")
            i += 1


In [9]:
adapter = JSONAdapter()
ProjectsDatasetManager.usersCollection = usersCollection
ProjectsDatasetManager.projectsCollection = projectsCollection
manager = ProjectsDatasetManager(25, validate = projectDataIsSufficient, cacheAdapter = adapter)

In [66]:
# creating model

VECTOR_SIZE = 5
EPOCH_NUMBER = 1
WORD_MIN_AMOUNT = 3
WINDOW_SIZE = 3 # 7

model = Model(vector_size = VECTOR_SIZE, window = WINDOW_SIZE, min_count = WORD_MIN_AMOUNT, epochs = EPOCH_NUMBER)
model.corpus = CacheCorpus(manager)
#model.assess()
model.train()
print(model.epochs)
#model.build_vocab(documentsCorpus)
#model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

2025-04-02 13:23:23,186 : INFO : Model lifecycle event {'params': 'Model<dm/m,d5,n5,w3,mc3,s0.001,t3>', 'datetime': '2025-04-02T13:23:23.186760', 'gensim': '4.3.3', 'python': '3.11.11 (main, Feb  4 2025, 07:29:35) [GCC 12.2.0]', 'platform': 'Linux-6.13.8-200.fc41.x86_64-x86_64-with-glibc2.36', 'event': 'created'}
2025-04-02 13:23:23,197 : INFO : collecting all words and their counts
2025-04-02 13:23:23,201 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-04-02 13:23:23,472 : INFO : PROGRESS: at example #10000, processed 90782 words (336934 words/s), 12220 word types, 24596 tags
2025-04-02 13:23:23,764 : INFO : collected 20813 word types and 46575 unique tags from a corpus of 19852 examples and 178966 words
2025-04-02 13:23:23,765 : INFO : Creating a fresh vocabulary
2025-04-02 13:23:23,786 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 retains 6256 unique words (30.06% of original 20813, drops 14557)', 'datetime': '2025-04-02T13:

1


In [67]:
model.assess(3)

Assessing document 6711 (['github:swanson/swanson.github.com', 'swanson.github.com', 'JavaScript', 'blog', 'jekyll']). Similarities by tags:
  github:swanson/swanson.github.com : 0.32625705003738403
  swanson.github.com : 0
  JavaScript : 0.8020398020744324
  blog : 0.8252984285354614
  jekyll : 0.7698740363121033

  Average similarity value: 0.5446938633918762

Assessing document 12031 (['github:MohFahmi27/pariwisata_tourism_balikpapan', 'pariwisata_tourism_balikpapan', 'JavaScript']). Similarities by tags:
  github:MohFahmi27/pariwisata_tourism_balikpapan : 0.14818449318408966
  pariwisata_tourism_balikpapan : 0.4639551639556885
  JavaScript : 0

  Average similarity value: 0.20404655237992605

Assessing document 15304 (['github:Hell13Cat/Minecraft-Behavior-MW', 'IMoO', 'Python']). Similarities by tags:
  github:Hell13Cat/Minecraft-Behavior-MW : 0
  IMoO : 0
  Python : 0.683768093585968

  Average similarity value: 0.22792269786198935



In [40]:
i = 1
words = []
tags = []
for proj in model.corpus:
    if i <= 0: break
    words = proj.words
    tags = proj.tags
    i -= 1

print(tags)
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']


array([ 0.00190285,  0.1701992 , -0.00449251,  0.0121613 , -0.11262569],
      dtype=float32)

In [None]:
model.dv.most_similar([vector], topn=len(model.dv))