In [1]:
import sys
sys.path.append('/home/trukhinmaksim/src')

In [2]:
import numpy as np

In [3]:
from src.utils.DatabaseConnect import DatabaseConnect

# Connection string for the MongoDB instance that holds the project/user data.
# Single-machine setup (mongo is running locally on this host);
# run "ip a" to look up the current IP address.
MY_DATABASE_LINK = 'mongodb://172.26.234.237:27020/' #'mongodb://192.168.100.57:27020/'
# Multiple-machine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

# The link is assigned on the DatabaseConnect class before any collection is requested.
DatabaseConnect.DB_LINK = MY_DATABASE_LINK

# Handles to the "projects" and "users" collections of "mini_database".
projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['172.26.234.237:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
from src.utils.CacheAdapter import JSONAdapter, JSONMultiFileAdapter, EXP_END_OF_DATA
from src.utils.DatasetManager import ProjectsDatasetManager
from src.utils.validators import projectDataIsSufficient

In [5]:
def flatternData(data : dict[str, list]) -> list:
    """Flatten a mapping of per-user project lists into one flat list.

    Args:
        data: dict in which each key is a user id and each value is a list
            of that user's projects.

    Returns:
        A new list containing every project of every user, in the dict's
        iteration order; the order inside each user's list is preserved.
    """
    # NOTE: the return annotation used to be ``np.array(dict)``, which is a
    # function call executed at definition time rather than a type. The
    # function has always returned a plain list, so it is annotated as such.
    return [project for projectsArray in data.values() for project in projectsArray]

In [6]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [7]:
# Template for the names of the numbered cache files; "{index}" is filled in
# through .format(index = ...) by the code that walks the files one by one.
CACHE_FILE_NAME = "cache__31-03-2025__(sufficient)_{index}.json"

In [8]:
# using adapter to load data from the cache files

class DataFeeder(ProjectsDatasetManager):
    """Feeds cached projects to a consumer in batches of ``batchSize``."""

    def __init__(self, batchSize = 32):
        # NOTE(review): the base-class initialiser is not called (the call
        # below was already disabled), so only ``batchSize`` is set here.
        #super().__init__(batchSize, validator, adapter)
        self.batchSize = batchSize

    def feedFromCache(self, cacheAdapter):
        """Generate lists of projects read through ``cacheAdapter``.

        Data is pulled with ``cacheAdapter.load(1)`` (one user at a time),
        flattened with ``flatternData`` and buffered; every time the buffer
        holds at least ``self.batchSize`` projects a full batch is yielded.
        Reading stops when ``EXP_END_OF_DATA`` is raised.

        Args:
            cacheAdapter: object exposing ``load(n)``; per the original
                comment this is meant to be a "JSONMultifileAdapter" that
                parses several cache files.

        Yields:
            Lists of exactly ``self.batchSize`` projects, followed by one
            shorter list with the leftovers if there are any. An empty
            trailing batch is never yielded.
        """
        tempStorage = []

        while True:
            try:
                # drain every full batch that is already buffered
                while len(tempStorage) >= self.batchSize:
                    yield tempStorage[:self.batchSize]
                    tempStorage = tempStorage[self.batchSize:]

                data = cacheAdapter.load(1) # load users one by one
                data = flatternData(data)
                tempStorage.extend(data)

            except EXP_END_OF_DATA:
                break

        # Previously the leftovers were yielded unconditionally, which handed
        # the consumer an empty list whenever the data divided evenly.
        if tempStorage:
            yield tempStorage

def feedProjectsFromCache(manager, batchSize = 32):
    """Generate batches of projects read from the numbered cache files.

    For file index 0, 1, 2, ... the manager's cache adapter is pointed at
    ``CACHE_FILE_NAME.format(index = ...)``, the file is read through
    ``manager.fromCache()`` and flattened with ``flatternData``. Reading
    stops when ``EXP_END_OF_DATA`` is raised (presumably by ``fromCache``
    once the indexed file does not exist - confirm in the manager).

    Args:
        manager: dataset manager exposing ``cacheAdapter.collectionName``
            and ``fromCache()``. Note that ``collectionName`` is overwritten
            as a side effect.
        batchSize: number of projects per yielded batch.

    Yields:
        Lists of exactly ``batchSize`` projects, followed by one shorter list
        with the leftovers if there are any. An empty trailing batch is never
        yielded.
    """
    tempStorage = []

    fileIndex = 0
    while True:
        try:
            # drain every full batch that is already buffered
            while len(tempStorage) >= batchSize:
                yield tempStorage[:batchSize]
                tempStorage = tempStorage[batchSize:]

            manager.cacheAdapter.collectionName = CACHE_FILE_NAME.format(index = fileIndex)
            tempStorage.extend(flatternData(manager.fromCache()))

            fileIndex += 1

        except EXP_END_OF_DATA:
            # no data left
            break

    # Previously the leftovers were yielded unconditionally, which produced
    # an empty list whenever the data divided evenly into batches.
    if tempStorage:
        yield tempStorage

class Corpus:
    """Common base type of every data corpus that the model consumes.

    All methods are placeholders that do nothing and return ``None``;
    concrete corpora are expected to override them.
    """

    def __init__(self):
        """Set up nothing; the base class carries no state."""

    def __iter__(self):
        """Placeholder for iteration; returns ``None`` in the base class."""

    def __getitem__(self, index : int):
        """Placeholder for item access; returns ``None`` for any ``index``."""

class CacheCorpus(Corpus):
    """Re-iterable corpus that streams projects from numbered cache files.

    Every call to ``__iter__`` starts again at file index 0, so the same
    instance can be iterated several times.
    """

    def __init__(self, manager, cacheFileNameTemplate = CACHE_FILE_NAME):
        """Store the manager and the file-name template.

        Args:
            manager: dataset manager exposing ``cacheAdapter.collectionName``
                and ``fromCache()``.
            cacheFileNameTemplate: template with an ``{index}`` placeholder
                that is formatted with the running file number.
        """
        self.cacheFileNameTemplate = cacheFileNameTemplate
        self.manager = manager

    def __iter__(self):
        """Yield the cached projects one by one as ``TaggedDocument``s.

        For file index 0, 1, 2, ... the manager's cache adapter is pointed at
        the formatted file name (overwriting ``collectionName`` as a side
        effect), the file is read with ``manager.fromCache()`` and flattened
        with ``flatternData``. Iteration ends when ``EXP_END_OF_DATA`` is
        raised while loading.

        Yields:
            ``TaggedDocument(words = project["tokens"], tags = project["tags"])``
            for every project, in file order.
        """
        fileIndex = 0
        while True:
            self.manager.cacheAdapter.collectionName = self.cacheFileNameTemplate.format(index = fileIndex)
            try:
                projects = flatternData(self.manager.fromCache())
            except EXP_END_OF_DATA:
                # no data left
                break

            # Walk the loaded projects directly; the previous version re-sliced
            # the buffer (``tempStorage[1:]``) after every single document,
            # copying the whole remaining list each time.
            for doc in projects:
                yield TaggedDocument(words = doc["tokens"], tags = doc["tags"])

            fileIndex += 1




In [9]:
# JSON cache adapter that is handed to the manager as its ``cacheAdapter``.
adapter = JSONAdapter()
# The Mongo collections are attached at class level, i.e. they are shared by
# every ProjectsDatasetManager instance created afterwards.
ProjectsDatasetManager.usersCollection = usersCollection
ProjectsDatasetManager.projectsCollection = projectsCollection
# NOTE(review): the meaning of the first positional argument (25) is not visible
# in this notebook - presumably a batch size / user count; confirm in ProjectsDatasetManager.
manager = ProjectsDatasetManager(25, validate = projectDataIsSufficient, cacheAdapter = adapter)

In [10]:
"""
manager.fromDB()
textData = manager.getTextOnly()
for text in flatternData(textData):
    print(text)
    #print(flatternData(data))
    #print(text)
"""
def feedTextData(manager, batchSize = 1):
    """Generate text data pulled from the database, in batches.

    Each round calls ``manager.fromDB()``, flattens ``manager.getTextOnly()``
    with ``flatternData`` and buffers the result. Reading stops when
    ``EXP_END_OF_DATA`` is raised.

    Args:
        manager: dataset manager exposing ``fromDB()`` and ``getTextOnly()``.
        batchSize: number of items per yielded batch. With ``batchSize == 1``
            the items are yielded individually instead of as 1-element lists.

    Yields:
        For ``batchSize > 1``: lists of exactly ``batchSize`` items, followed
        by one shorter list with the leftovers if there are any.
        For ``batchSize == 1``: the individual items only.
    """
    tempStorage = []

    while True:
        try:
            # drain everything that already fills a batch
            while len(tempStorage) >= batchSize:
                if batchSize > 1:
                    yield tempStorage[:batchSize]
                else:
                    # requesting one item at a time (training one by one) -> yield the bare item
                    yield tempStorage[0]
                tempStorage = tempStorage[batchSize:]

            manager.fromDB()
            tempStorage.extend(flatternData(manager.getTextOnly()))

        except EXP_END_OF_DATA:
            break

    # Previously the buffer was yielded unconditionally. In one-by-one mode it
    # is always empty at this point, so the consumer received a stray ``[]``
    # after a stream of single items; in batch mode it could be an empty batch.
    if tempStorage:
        yield tempStorage



In [11]:
"""
i = 2
corp = CacheCorpus(manager)
for item in corp:
    if i == 0: break
    print(item)
    i -= 1
"""

'\ni = 2\ncorp = CacheCorpus(manager)\nfor item in corp:\n    if i == 0: break\n    print(item)\n    i -= 1\n'

In [12]:
class EXP_FEEDER_IS_NONE(Exception):
    """Raised by ``Model.train`` when ``Model.corpus`` is not a ``Corpus`` instance."""

    def __init__(self):
        message = "'Model.corpus' object must be an iterable structure, inherited from 'Corpus' class!"
        super().__init__(message)

class EXP_MANAGER_IS_NONE(Exception):
    """Signals that ``Model.manager`` is not a DatasetManager instance."""

    def __init__(self):
        message = "'Model.manager' object must be a DatasetManager instance!"
        super().__init__(message)


class Model(gensim.models.doc2vec.Doc2Vec):
    """Doc2Vec model that can train itself from an attached ``Corpus``.

    Attributes:
        corpus: iterable ``Corpus`` instance used by the argument-less
            ``train()`` for both vocabulary building and training; must be
            assigned before that call.
        manager: placeholder for a dataset manager; stored but not used by
            this class (the corresponding check in ``train`` is disabled).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.corpus = None # corpus is an iterator (iterable class object), that will be used in "train" method of Doc2Vec model for data extraction
        self.manager = None

    def train(self, *args, **kwargs):
        """Build the vocabulary from ``self.corpus`` and train on it.

        Called without arguments (the notebook's usage) it configures INFO
        logging, builds the vocabulary from ``self.corpus`` and trains for
        ``self.epochs`` epochs over ``self.corpus_count`` examples.

        Called with any arguments it behaves exactly like ``Doc2Vec.train``.
        This keeps the override compatible with gensim itself, which calls
        ``self.train(corpus_iterable=..., ...)`` from the constructor when
        documents are passed at construction time; the previous zero-argument
        signature made that call fail with ``TypeError``.

        Raises:
            EXP_FEEDER_IS_NONE: argument-less call while ``self.corpus`` is
                not a ``Corpus`` instance.
        """
        if args or kwargs:
            return super().train(*args, **kwargs)

        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        if not isinstance(self.corpus, Corpus): raise EXP_FEEDER_IS_NONE
        #if not isinstance(self.manager, ProjectsDatasetManager) : raise EXP_MANAGER_IS_NONE

        # the corpus is iterated once for the vocabulary and then again by the training passes
        self.build_vocab(self.corpus)

        super().train(self.corpus, total_examples=self.corpus_count, epochs=self.epochs)

    def assess(self):
        """Placeholder for model evaluation; not implemented yet."""
        pass


In [13]:
"""
documentsCorpus = []

for project in manager.data:
    documentsCorpus.append(TaggedDocument(words=project["tokens"], tags=project["tags"]))

documentsCorpus
"""

'\ndocumentsCorpus = []\n\nfor project in manager.data:\n    documentsCorpus.append(TaggedDocument(words=project["tokens"], tags=project["tags"]))\n\ndocumentsCorpus\n'

In [14]:
# creating model

# Hyper-parameters handed to the Doc2Vec constructor below.
VECTOR_SIZE = 100     # passed as vector_size
EPOCH_NUMBER = 10     # passed as epochs
WORD_MIN_AMOUNT = 3   # passed as min_count (the training log reports words dropped by effective_min_count=3)
WINDOW_SIZE = 7       # passed as window

model = Model(vector_size = VECTOR_SIZE, window = WINDOW_SIZE, min_count = WORD_MIN_AMOUNT, epochs = EPOCH_NUMBER)
# the corpus must be attached before train(): Model.train() raises EXP_FEEDER_IS_NONE otherwise
model.corpus = CacheCorpus(manager)
# builds the vocabulary and trains in one call (see Model.train)
model.train()
print(model.epochs)
# earlier variant that trained on an in-memory list of documents:
#model.build_vocab(documentsCorpus)
#model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

2025-04-02 08:42:33,480 : INFO : collecting all words and their counts
2025-04-02 08:42:33,486 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-04-02 08:42:33,671 : INFO : PROGRESS: at example #10000, processed 90782 words (500403 words/s), 12220 word types, 24596 tags
2025-04-02 08:42:34,147 : INFO : collected 20813 word types and 46575 unique tags from a corpus of 19852 examples and 178966 words
2025-04-02 08:42:34,148 : INFO : Creating a fresh vocabulary
2025-04-02 08:42:34,184 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 retains 6256 unique words (30.06% of original 20813, drops 14557)', 'datetime': '2025-04-02T08:42:34.184280', 'gensim': '4.3.3', 'python': '3.11.11 (main, Feb  4 2025, 07:29:35) [GCC 12.2.0]', 'platform': 'Linux-6.13.8-200.fc41.x86_64-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
2025-04-02 08:42:34,186 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 leaves 160718 word corpus (89.80% of or

10


In [25]:
# Take a look at the first document of the corpus: with i starting at 1 the loop
# body runs once and the loop breaks on the second iteration.
i = 1
words = []
tags = []
for proj in model.corpus:
    if i <= 0: break
    words = proj.words
    tags = proj.tags
    i -= 1

print(tags)
# NOTE(review): `words` is collected above but not used in this cell; the vector is
# inferred from a hard-coded token list instead - presumably the tokens of that same
# first project; confirm, or pass `words` directly.
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']


array([-0.07697501,  0.00090919,  0.01700639,  0.00578002,  0.01141405,
        0.04065548,  0.04338071, -0.02093355, -0.02708795, -0.02501111,
       -0.05977037, -0.00149107, -0.03606845,  0.07533808,  0.06401943,
       -0.05219866, -0.03728019, -0.0886542 ,  0.05544928, -0.01964978,
       -0.01083034, -0.01184538,  0.03563146,  0.06505623,  0.00862272,
       -0.0398781 , -0.03055311,  0.05893732, -0.01128467, -0.01422103,
       -0.08515907,  0.04617144,  0.01707039,  0.07883204,  0.0035496 ,
        0.08765335,  0.02133695, -0.05843817,  0.05180668, -0.09003507,
        0.04678525, -0.0276979 , -0.01949998,  0.02562637, -0.10886063,
       -0.00514436, -0.02243493, -0.01411657,  0.0449055 ,  0.06223707,
        0.00301906,  0.01258529, -0.02442262,  0.05707555,  0.03562714,
       -0.08039601, -0.09243574,  0.02844349, -0.06473155,  0.03280161,
        0.00976344,  0.01631724, -0.01657622, -0.08422894, -0.02789286,
        0.07224481,  0.06430259,  0.027457  ,  0.02525555,  0.01

In [26]:
# Rank the stored document tags by similarity to the inferred vector;
# topn=len(model.dv) requests the complete ranking, so the output is very long.
model.dv.most_similar([vector], topn=len(model.dv))

[('profile-readme', 0.8977855443954468),
 ('awesome-github-profile-readme', 0.8938932418823242),
 ('readme-md', 0.8757578730583191),
 ('readme-dynamic', 0.8723273873329163),
 ('lambda-functions', 0.8723084926605225),
 ('cloudformation', 0.8706826567649841),
 ('readme-generator', 0.8695669770240784),
 ('github-readme', 0.8663167953491211),
 ('awesome-dynamic-readme', 0.8607762455940247),
 ('manager', 0.8588715195655823),
 ('github-profile-readme', 0.8571562767028809),
 ('intersection', 0.857123076915741),
 ('bash-scripting', 0.8565709590911865),
 ('shell-scripting', 0.8561950325965881),
 ('bash-commands', 0.8552331924438477),
 ('github:fatihacet/turkcekaynaklar-com', 0.8528872728347778),
 ('turkish', 0.8523991703987122),
 ('awesome-bash-commands', 0.8516043424606323),
 ('helper', 0.851505696773529),
 ('kaynaklar', 0.8504897356033325),
 ('graalvm', 0.8495941758155823),
 ('feedback-form', 0.8488668203353882),
 ('github:cucerdariancatalin/HarryPotter', 0.848709225654602),
 ('turkcekaynakla