In [1]:
import sys
# Makes the project sources importable from this notebook.
# NOTE(review): this is an absolute, machine-specific path; later cells import `src.utils...`,
# which needs the directory that CONTAINS `src` on sys.path - confirm this entry is what makes
# those imports resolve, and consider deriving it from a configurable project root.
sys.path.append('/home/trukhinmaksim/src')

In [2]:
import numpy as np

In [3]:
import os

from src.utils.DatabaseConnect import DatabaseConnect

# Connection string of the MongoDB instance used by this notebook.
# - single machine setup (mongo is running here locally); use "ip a" to look up the ip address.
#   Another local address used before: 'mongodb://192.168.100.57:27020/'
# - multiple machine setup (mongo is running on another machine): 'mongodb://192.168.43.78:27020/'
# The MY_DATABASE_LINK environment variable, when set, overrides the default below, so the
# address can be switched between setups without editing the notebook.
MY_DATABASE_LINK = os.environ.get('MY_DATABASE_LINK', 'mongodb://172.26.234.237:27020/')

# DatabaseConnect reads the link from this class attribute, so it is set before any collection is requested.
DatabaseConnect.DB_LINK = MY_DATABASE_LINK

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['172.26.234.237:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
from src.utils.CacheAdapter import JSONAdapter, JSONMultiFileAdapter, EXP_END_OF_DATA
from src.utils.DatasetManager import ProjectsDatasetManager
from src.utils.validators import projectDataIsSufficient

In [5]:
def flatternData(data : dict[str, list]) -> list:
    """Flatten a per-user mapping of projects into one flat list.

    Args:
        data: dict where each key is a user id and each value is the list of
            that user's projects.

    Returns:
        A new list holding every project, following the dict's iteration order
        and keeping each user's projects in their original order. An empty dict
        (or users with empty lists) simply contribute nothing.
    """
    # The previous return annotation was `np.array(dict)`, a call evaluated at definition
    # time that did not describe the actual return value (a plain list).
    return [project for projectsArray in data.values() for project in projectsArray]

In [6]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [7]:
# Name template of the cache files; the "{index}" placeholder is filled in with
# str.format(index = ...) by the cache readers defined in this notebook.
CACHE_FILE_NAME = "cache__31-03-2025__(sufficient)_{index}.json"

In [8]:
# using adapter to load data from the cache files

class DataFeeder(ProjectsDatasetManager):
    """Feeds projects read through a cache adapter in fixed-size batches.

    NOTE(review): the parent constructor is deliberately not called (the call is
    commented out below), so only `batchSize` is initialised here - confirm that no
    inherited method relies on state set up by ProjectsDatasetManager.__init__.
    """

    def __init__(self, batchSize = 32):
        #super().__init__(batchSize, validator, adapter)
        self.batchSize = batchSize

    def feedFromCache(self, cacheAdapter):
        """Generator yielding lists of projects, `self.batchSize` items at a time.

        Args:
            cacheAdapter: object exposing `load(n)`; each call is expected to return a
                dict of user id -> list of projects (the shape `flatternData` accepts).
                Iteration stops when loading raises EXP_END_OF_DATA.

        Yields:
            Lists of exactly `batchSize` projects, followed by one shorter list with the
            leftovers if any remain. No empty batch is ever yielded.

        Raises:
            ValueError: on first iteration, if `batchSize` is smaller than 1 (a
                non-positive size previously made the loop yield empty lists forever).
        """
        batchSize = self.batchSize
        if batchSize < 1:
            raise ValueError("batchSize must be a positive integer")

        pending = [] # projects loaded but not yet handed out

        while True:
            try:
                loaded = cacheAdapter.load(1) # load users one by one
            except EXP_END_OF_DATA:
                break

            pending.extend(flatternData(loaded))

            # Hand out every complete batch by moving an offset, then drop the consumed
            # prefix once; re-slicing the whole buffer after each batch was quadratic.
            consumed = 0
            while len(pending) - consumed >= batchSize:
                yield pending[consumed:consumed + batchSize]
                consumed += batchSize
            del pending[:consumed]

        if pending:
            yield pending

def feedProjectsFromCache(manager, batchSize = 32):
    """Generator yielding projects from the numbered cache files in batches.

    For index 0, 1, 2, ... the cache file name is built from CACHE_FILE_NAME, assigned to
    `manager.cacheAdapter.collectionName`, and the file is read with `manager.fromCache()`.
    Reading stops when EXP_END_OF_DATA is raised.

    Args:
        manager: object with a `cacheAdapter` attribute (whose `collectionName` is
            overwritten here) and a `fromCache()` method returning a dict of
            user id -> list of projects.
        batchSize: number of projects per yielded list; must be >= 1.

    Yields:
        Lists of exactly `batchSize` projects, followed by one shorter list with the
        leftovers if any remain. No empty batch is ever yielded.

    Raises:
        ValueError: on first iteration, if `batchSize` is smaller than 1 (a
            non-positive size previously made the loop yield empty lists forever).
    """
    if batchSize < 1:
        raise ValueError("batchSize must be a positive integer")

    nameTemplate = CACHE_FILE_NAME
    pending = [] # projects loaded but not yet handed out
    fileIndex = 0

    while True:
        manager.cacheAdapter.collectionName = nameTemplate.format(index = fileIndex)
        try:
            projects = flatternData(manager.fromCache())
        except EXP_END_OF_DATA:
            # no data left
            break

        fileIndex += 1
        pending.extend(projects)

        # Hand out every complete batch by moving an offset, then drop the consumed
        # prefix once; re-slicing the whole buffer after each batch was quadratic.
        consumed = 0
        while len(pending) - consumed >= batchSize:
            yield pending[consumed:consumed + batchSize]
            consumed += batchSize
        del pending[:consumed]

    if pending:
        yield pending

class Corpus:
    """Common ancestor of every data corpus handed to the model.

    The base class carries no behaviour of its own: each method is a stub that
    returns None, and concrete corpora are expected to override what they need.
    """

    def __init__(self):
        """Nothing to set up in the base class."""

    def __iter__(self):
        """Stub for iteration; the base implementation returns None."""
        return None

    def __getitem__(self, index : int):
        """Stub for indexed access; the base implementation returns None."""
        return None

class CacheCorpus(Corpus):
    """Corpus that streams projects from numbered cache files as TaggedDocument objects.

    Args:
        manager: object with a `cacheAdapter` attribute (whose `collectionName` is
            overwritten while iterating) and a `fromCache()` method returning a dict of
            user id -> list of projects.
        cacheFileNameTemplate: file name template with an "{index}" placeholder.
    """

    def __init__(self, manager, cacheFileNameTemplate = CACHE_FILE_NAME):
        self.cacheFileNameTemplate = cacheFileNameTemplate
        self.manager = manager

    def __iter__(self):
        """Yield one TaggedDocument per cached project, file by file.

        Every call starts again from file index 0, so the corpus can be iterated
        repeatedly. Each project is read through its "tokens" and "tags" keys.
        Iteration ends when reading a cache file raises EXP_END_OF_DATA.
        """
        nameTemplate = self.cacheFileNameTemplate
        fileIndex = 0

        while True:
            self.manager.cacheAdapter.collectionName = nameTemplate.format(index = fileIndex)
            try:
                projects = flatternData(self.manager.fromCache())
            except EXP_END_OF_DATA:
                # no data left
                break

            # Walk the loaded list directly: popping the head with `tempStorage[1:]`
            # copied the whole remainder for every document (quadratic per file).
            for doc in projects:
                yield TaggedDocument(words = doc["tokens"], tags = doc["tags"])

            fileIndex += 1




In [9]:
# Cache adapter handed to the dataset manager (default-constructed JSONAdapter).
adapter = JSONAdapter()
# The Mongo collections are assigned on the ProjectsDatasetManager class itself, not on an instance.
ProjectsDatasetManager.usersCollection = usersCollection
ProjectsDatasetManager.projectsCollection = projectsCollection
# NOTE(review): the meaning of the positional 25 is not visible in this notebook -
# presumably a batch/portion size; confirm against ProjectsDatasetManager.__init__.
manager = ProjectsDatasetManager(25, validate = projectDataIsSufficient, cacheAdapter = adapter)

In [10]:
"""
manager.fromDB()
textData = manager.getTextOnly()
for text in flatternData(textData):
    print(text)
    #print(flatternData(data))
    #print(text)
"""
def feedTextData(manager, batchSize = 1):
    """Generator feeding text data pulled through `manager`, in batches.

    Repeatedly calls `manager.fromDB()` followed by `manager.getTextOnly()` and flattens
    the result with `flatternData`. Stops when EXP_END_OF_DATA is raised.

    Args:
        manager: object with `fromDB()` and `getTextOnly()`; the latter is expected to
            return a dict of user id -> list of texts.
        batchSize: number of items per yield; must be >= 1.

    Yields:
        - batchSize == 1: each item on its own (not wrapped in a list), for one-by-one training.
        - batchSize > 1: lists of exactly `batchSize` items, followed by one shorter list
          with the leftovers if any remain.
        No empty list is ever yielded; previously the generator always ended with a bare
        `[]`, which for batchSize == 1 looked like an extra (and differently typed) item.

    Raises:
        ValueError: on first iteration, if `batchSize` is smaller than 1 (a
            non-positive size previously made the loop spin forever).
    """
    if batchSize < 1:
        raise ValueError("batchSize must be a positive integer")

    pending = [] # items loaded but not yet handed out

    while True:
        try:
            manager.fromDB()
            texts = flatternData(manager.getTextOnly())
        except EXP_END_OF_DATA:
            break

        pending.extend(texts)

        # Hand out every complete batch by moving an offset, then drop the consumed
        # prefix once; re-slicing the whole buffer after each batch was quadratic.
        consumed = 0
        while len(pending) - consumed >= batchSize:
            chunk = pending[consumed:consumed + batchSize]
            consumed += batchSize
            # if only one item is requested per step (training one by one) -> yield the item itself
            yield chunk if batchSize > 1 else chunk[0]
        del pending[:consumed]

    if pending:
        yield pending



In [11]:
"""
i = 2
corp = CacheCorpus(manager)
for item in corp:
    if i == 0: break
    print(item)
    i -= 1
"""

'\ni = 2\ncorp = CacheCorpus(manager)\nfor item in corp:\n    if i == 0: break\n    print(item)\n    i -= 1\n'

In [16]:
class EXP_FEEDER_IS_NONE(Exception):
    """Signals that 'Model.corpus' is not an instance of the 'Corpus' class."""

    def __init__(self):
        message = "'Model.corpus' object must be an iterable structure, inherited from 'Corpus' class!"
        Exception.__init__(self, message)

class EXP_MANAGER_IS_NONE(Exception):
    """Signals that 'Model.manager' is not a DatasetManager instance."""

    def __init__(self):
        message = "'Model.manager' object must be a DatasetManager instance!"
        Exception.__init__(self, message)


class Model(gensim.models.doc2vec.Doc2Vec):
    """Doc2Vec model that can train itself from an attached `Corpus` object.

    Attributes set in `__init__`:
        corpus: iterable `Corpus` instance that `train()` reads from; None until assigned.
        manager: placeholder for a dataset manager; not used by the code in this class.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.corpus = None # corpus is an iterable class object that the "train" method uses for data extraction
        self.manager = None

    def train(self, *args, **kwargs):
        """Build the vocabulary and train the model.

        Called without arguments (the notebook's usage): validates `self.corpus`, builds
        the vocabulary from it and trains on it for `self.epochs` epochs.

        Called with arguments: forwards them unchanged to `Doc2Vec.train` and returns its
        result. The base-class constructor calls `train(...)` with keyword arguments when
        it is given training documents; the previous zero-argument override rejected
        that call with a TypeError.

        Raises:
            EXP_FEEDER_IS_NONE: in the no-argument form, if `self.corpus` is not a
                `Corpus` instance.
        """
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        if args or kwargs:
            return super().train(*args, **kwargs)

        if not isinstance(self.corpus, Corpus): raise EXP_FEEDER_IS_NONE
        #if not isinstance(self.manager, ProjectsDatasetManager) : raise EXP_MANAGER_IS_NONE

        self.build_vocab(self.corpus)

        super().train(self.corpus, total_examples=self.corpus_count, epochs=self.epochs)

    def assess(self):
        """Placeholder for model evaluation; not implemented yet."""
        pass


In [13]:
"""
documentsCorpus = []

for project in manager.data:
    documentsCorpus.append(TaggedDocument(words=project["tokens"], tags=project["tags"]))

documentsCorpus
"""

'\ndocumentsCorpus = []\n\nfor project in manager.data:\n    documentsCorpus.append(TaggedDocument(words=project["tokens"], tags=project["tags"]))\n\ndocumentsCorpus\n'

In [17]:
# Creating and training the model.
# The constants below are passed to the Model constructor as the Doc2Vec keyword
# arguments vector_size, epochs, min_count and window respectively.
# NOTE(review): no random seed is set in the visible cells, so results are presumably
# not reproducible between runs - confirm and consider fixing a seed.

VECTOR_SIZE = 100
EPOCH_NUMBER = 10
WORD_MIN_AMOUNT = 3
WINDOW_SIZE = 7

model = Model(vector_size = VECTOR_SIZE, window = WINDOW_SIZE, min_count = WORD_MIN_AMOUNT, epochs = EPOCH_NUMBER)
# Training data is streamed from the cache files through CacheCorpus; Model.train() reads model.corpus.
model.corpus = CacheCorpus(manager)
model.train()
print(model.epochs)
# Earlier in-memory variant, kept for reference:
#model.build_vocab(documentsCorpus)
#model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

2025-04-02 08:24:48,050 : INFO : collecting all words and their counts
2025-04-02 08:24:48,055 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-04-02 08:24:48,275 : INFO : PROGRESS: at example #10000, processed 90782 words (414567 words/s), 12220 word types, 24596 tags
2025-04-02 08:24:48,991 : INFO : collected 20813 word types and 46575 unique tags from a corpus of 19852 examples and 178966 words
2025-04-02 08:24:48,993 : INFO : Creating a fresh vocabulary
2025-04-02 08:24:49,033 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 retains 6256 unique words (30.06% of original 20813, drops 14557)', 'datetime': '2025-04-02T08:24:49.033606', 'gensim': '4.3.3', 'python': '3.11.11 (main, Feb  4 2025, 07:29:35) [GCC 12.2.0]', 'platform': 'Linux-6.13.8-200.fc41.x86_64-x86_64-with-glibc2.36', 'event': 'prepare_vocab'}
2025-04-02 08:24:49,035 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 leaves 160718 word corpus (89.80% of or

10


In [15]:

# Smoke test: infer an embedding for a hand-written list of tokens and display it.
# NOTE(review): the tokens look like they are already preprocessed the same way as the
# training data ("tokens" of cached projects) - confirm the preprocessing matches.
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

array([ 0.00308863, -0.00735241, -0.02905755, -0.00087031, -0.01099693,
        0.02082823,  0.02023694, -0.00334258, -0.00799487, -0.00433645,
       -0.02051825,  0.00601316, -0.00980301,  0.01860038,  0.03266804,
       -0.0304384 ,  0.00677712, -0.03670615,  0.01083722, -0.02049837,
        0.00231234, -0.00871367,  0.00825047,  0.01820921,  0.00338148,
        0.00953344, -0.01728403, -0.0047023 , -0.04001134,  0.004596  ,
       -0.0296433 ,  0.01734056,  0.023515  ,  0.02000066, -0.02267226,
        0.01982907,  0.03472478, -0.0253478 ,  0.00523992, -0.02408287,
        0.02872182,  0.00134164, -0.02194417,  0.02691794, -0.03505132,
        0.00605899, -0.00352153, -0.00400082,  0.02695579,  0.0387282 ,
       -0.02189158, -0.00072087, -0.00812104,  0.00809368, -0.01303041,
       -0.01916484, -0.01470063, -0.00090048, -0.03214884,  0.00935926,
        0.00322564, -0.01434393, -0.00888153, -0.00861587, -0.02469372,
        0.02592297,  0.0189695 ,  0.0325394 ,  0.01361089,  0.01