In [1]:
import sys
sys.path.append('/home/trukhinmaksim/src')

In [2]:
import numpy as np

In [3]:
from src.utils.DatabaseConnect import DatabaseConnect

# single machine setup (mongo is running here localy)
# "ip a" for ip address
MY_DATABASE_LINK = 'mongodb://172.26.234.237:27020/' #'mongodb://192.168.100.57:27020/'
# multiple mechine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

DatabaseConnect.DB_LINK = MY_DATABASE_LINK

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['172.26.234.237:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
from src.utils.CacheAdapter import JSONAdapter, JSONMultiFileAdapter, EXP_END_OF_DATA
from src.utils.DatasetManager import ProjectsDatasetManager
from src.utils.validators import projectDataIsSufficient

In [5]:
def feedTextData(manager, batchSize = 1):
    # feeds text data by batches
    tempStorage = []

    i = 0
    while True:
        try:
            while len(tempStorage) >= batchSize:
                if batchSize > 1:
                    yield tempStorage[:batchSize]
                else:
                    yield tempStorage[:batchSize][0] # if I'm requesting only one item per time (training one by one) -> just yield it
                tempStorage = tempStorage[batchSize:]

            manager.fromDB()
            data = flatternData(manager.getTextOnly())
            tempStorage.extend(data)

            i += 1

        except EXP_END_OF_DATA:
            break

    yield tempStorage


In [6]:
def flatternData(data : dict[str, list]) -> np.array(dict):
    # takes in data in form of dict, where each key is a user id and each value is a list of that user's projects
    # returns just flat list of these projects 
    result = []

    for projectsArray in data.values():
        for project in projectsArray:
            result.append(project)

    return result

In [7]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [8]:
CACHE_FILE_NAME = "cache__31-03-2025__(sufficient)_{0}.json"

In [9]:
# using adapter to load data from the cache files

# TODO: place implementation of the 'Corpus' class into a separate file
class Corpus:
    # base class for every data corpus, that will be used by model
    def __init__(self):
        pass
    def __iter__(self):
        pass
    def __getitem__(self, index : int):
        pass

# TODO: change to use adapter, that reads data from cache continously without need for changing collection name maually
class CacheCorpus(Corpus):
    def __init__(self, manager, cacheFileNameTemplate = CACHE_FILE_NAME):
        self.cacheFileNameTemplate = cacheFileNameTemplate
        self.manager = manager # manager is needed not only for interaction with adapter, but also if I want to use unpreprocessed dataset and preprocess it on the way
        
    def __iter__(self):
        # will feed preprocessed projects data as TaggedDocument instances one by one
        cacheFileName = self.cacheFileNameTemplate
        tempStorage = [] # temporary storage for data, that was read from files

        i = 0
        while True:
            try:
                while len(tempStorage) >= 1:
                    doc = tempStorage[0]
                    yield TaggedDocument(words = doc["tokens"], tags = doc["tags"])
                    tempStorage = tempStorage[1:]

                #self.manager.cacheAdapter.collectionName = cacheFileName.format(i)
                data = flatternData(self.manager.fromCache())
                tempStorage.extend(data)

                i += 1

            except EXP_END_OF_DATA:
            # no data left
                break


In [10]:
class EXP_FEEDER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.corpus' object must be an iterable structure, inherited from 'Corpus' class!")

class EXP_MANAGER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.manager' object must be a DatasetManager instance!")


class Model(gensim.models.doc2vec.Doc2Vec):
    def __init__(self, dm_dbow_mode = "DM", pretrain_w2v = False, alpha_init = 0.05, alpha_final = 0.001, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.corpus = None # corpus is an iterator(iterable class object), that will be used in "train" method of Doc2Vec model for data extraction
        self.alphaInit = alpha_init
        self.alphaFinal = alpha_final
        self.dmDbowMode = dm_dbow_mode
        self.pretrainW2V = pretrain_w2v
    
    def train(self):
        # will build vocabulary and train the model on corpus (corpus will be fed by corpus)
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        if not isinstance(self.corpus, Corpus): raise EXP_FEEDER_IS_NONE

        self.build_vocab(self.corpus)

        if self.dmDbowMode != "DM+DBOW":
            super().train(
                self.corpus, 
                total_examples = self.corpus_count, 
                epochs = self.epochs,
                start_alpha = self.alphaInit,
                end_alpha = self.alphaFinal
            )
        else:
            # combine DM and DBOW
            pass

    def assess(self, sampleNum = 5):
        # simple test of model performance
        # take multiple documents from the training corpus and tries to find simillar in the dataset

        from random import sample
        from collections import defaultdict
        from numpy import mean

        performanceGrageScale = {50 : "Random", 60 : "Poor", 70 : "Bad", 80 : "Medium", 92 : "Optimal", 97 : "Perfect"}
        totalDocuments = self.corpus_count
        indexes = sample(range(totalDocuments), sampleNum)
        stats = {}

        i = 0
        docs = []
        avgPerformances = []

        for doc in self.corpus:
            if i >= totalDocuments: break
            if i in indexes:
                vector = self.infer_vector(doc.words)
                sims = defaultdict(lambda: 0, self.dv.most_similar([vector], topn = totalDocuments))

                print(f"Assessing document {i} ({doc.tags}). Similarities by tags:")
                stats[i] = {
                    "tags" : doc.tags,
                    "similarities by tags" : {},
                    "average" : 0
                }

                for tag in doc.tags:
                    stats[i]["similarities by tags"][tag] = sims[tag]
                    print(f"  {tag} : {sims[tag]}")

                avgPerformances.append(mean([sims[tag] for tag in doc.tags]))
                print(f"\n  Average similarity value: {avgPerformances[-1]}\n")
                stats[i]["average"] = avgPerformances[-1]
            i += 1

        stats["Average accuracy"] = mean(avgPerformances)
        print(f"Average accuracy: {stats['Average accuracy']}")
        
        return stats


In [11]:
adapter = JSONMultiFileAdapter(CACHE_FILE_NAME)
ProjectsDatasetManager.usersCollection = usersCollection
ProjectsDatasetManager.projectsCollection = projectsCollection
manager = ProjectsDatasetManager(25, validate = projectDataIsSufficient, cacheAdapter = adapter)

In [17]:
# creating model

VECTOR_SIZE = 5
EPOCHS_NUMBER = 5
WORD_MIN_COUNT = 3
WINDOW_SIZE = 3 # 7
NEGATIVE_SAMPLES_AMOUNT = 1
SUBSAMPLING_THRESHOLD = 1e-5
ALPHA_INIT = 0.05
ALTHA_FINAL = 0.00001
DM_DBOW_MODE = "DM" # "DBOW" "DM+DBOW"

# finetunning is done by twicking model parameters
model = Model(
    vector_size =  VECTOR_SIZE, 
    window =       WINDOW_SIZE, 
    min_count =    WORD_MIN_COUNT, 
    epochs =       EPOCHS_NUMBER, 
    dm_dbow_mode = DM_DBOW_MODE,
    negative =     NEGATIVE_SAMPLES_AMOUNT,
    sample =       SUBSAMPLING_THRESHOLD,
    alpha_init =   ALPHA_INIT,
    alpha_final =  ALTHA_FINAL
)
model.corpus = CacheCorpus(manager)
#model.assess()
model.train()
print(model.epochs)
#model.build_vocab(documentsCorpus)
#model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

2025-04-03 12:37:23,601 : INFO : Model lifecycle event {'params': 'Model<dm/m,d5,n1,w3,mc3,s1e-05,t3>', 'datetime': '2025-04-03T12:37:23.601203', 'gensim': '4.3.3', 'python': '3.11.11 (main, Feb  4 2025, 07:29:35) [GCC 12.2.0]', 'platform': 'Linux-6.13.8-200.fc41.x86_64-x86_64-with-glibc2.36', 'event': 'created'}
2025-04-03 12:37:23,611 : INFO : collecting all words and their counts
2025-04-03 12:37:23,615 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2025-04-03 12:37:23,890 : INFO : PROGRESS: at example #10000, processed 90782 words (331824 words/s), 12220 word types, 24596 tags
2025-04-03 12:37:24,197 : INFO : collected 20813 word types and 46575 unique tags from a corpus of 19852 examples and 178966 words
2025-04-03 12:37:24,198 : INFO : Creating a fresh vocabulary
2025-04-03 12:37:24,218 : INFO : Model lifecycle event {'msg': 'effective_min_count=3 retains 6256 unique words (30.06% of original 20813, drops 14557)', 'datetime': '2025-04-03T12:

5


In [13]:
model.assess(3)

Assessing document 6830 (['github:github/docs', 'docs', 'TypeScript', 'docs', 'works-with-codespaces']). Similarities by tags:
  github:github/docs : 0.4754059910774231
  docs : 0
  TypeScript : 0
  docs : 0
  works-with-codespaces : 0.13332197070121765

  Average similarity value: 0.12174559235572815

Assessing document 12077 (['github:DefinitelyTyped/DefinitelyTyped', 'DefinitelyTyped', 'TypeScript', 'definition', 'dts', 'hacktoberfest', 'types', 'typescript', 'typescript-definitions', 'typings']). Similarities by tags:
  github:DefinitelyTyped/DefinitelyTyped : 0
  DefinitelyTyped : 0.2090863138437271
  TypeScript : 0
  definition : 0
  dts : 0.1452978104352951
  hacktoberfest : 0
  types : 0.21957048773765564
  typescript : 0
  typescript-definitions : 0.4334803521633148
  typings : 0

  Average similarity value: 0.10074349641799926

Assessing document 19743 (['github:jin0yoon/databinding-practice', 'databinding-practice', 'Java']). Similarities by tags:
  github:jin0yoon/databindi

{6830: {'tags': ['github:github/docs',
   'docs',
   'TypeScript',
   'docs',
   'works-with-codespaces'],
  'similarities by tags': {'github:github/docs': 0.4754059910774231,
   'docs': 0,
   'TypeScript': 0,
   'works-with-codespaces': 0.13332197070121765},
  'average': 0.12174559235572815},
 12077: {'tags': ['github:DefinitelyTyped/DefinitelyTyped',
   'DefinitelyTyped',
   'TypeScript',
   'definition',
   'dts',
   'hacktoberfest',
   'types',
   'typescript',
   'typescript-definitions',
   'typings'],
  'similarities by tags': {'github:DefinitelyTyped/DefinitelyTyped': 0,
   'DefinitelyTyped': 0.2090863138437271,
   'TypeScript': 0,
   'definition': 0,
   'dts': 0.1452978104352951,
   'hacktoberfest': 0,
   'types': 0.21957048773765564,
   'typescript': 0,
   'typescript-definitions': 0.4334803521633148,
   'typings': 0},
  'average': 0.10074349641799926},
 19743: {'tags': ['github:jin0yoon/databinding-practice',
   'databinding-practice',
   'Java'],
  'similarities by tags': {

In [14]:
i = 1
words = []
tags = []
for proj in model.corpus:
    if i <= 0: break
    words = proj.words
    tags = proj.tags
    i -= 1

print(tags)
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']


array([0.08131837, 0.06282104, 0.01109814, 0.05954585, 0.05984264],
      dtype=float32)

In [15]:
model.dv.most_similar([vector], topn=len(model.dv))

[('github:orther/terraform-aws-ecs-fargate-service', 0.9973738193511963),
 ('Kotlin-UI-Automation-The-Internet', 0.9931414127349854),
 ('kotlin-symbol-processor', 0.9924178123474121),
 ('ComponentizationStudy', 0.9913938641548157),
 ('github:cybercoder-naj/JavaScriptONLY', 0.990767776966095),
 ('twitter-splitter', 0.9906508326530457),
 ('speech-to-text', 0.9905177354812622),
 ('github:MalditaEs/embed', 0.9904927015304565),
 ('github:liqiang372/forming', 0.990449070930481),
 ('github:Manikant25/fullstack-course4', 0.989212155342102),
 ('KotlinObjectOrientedProgramming', 0.987947404384613),
 ('github:gitim/react-native-sortable-list', 0.9876710772514343),
 ('github:octahedroid/gatsby-remark-twitch', 0.9872346520423889),
 ('github:LinZong/ThreeActivityTransition', 0.9869368076324463),
 ('gamedevelopment', 0.9864965677261353),
 ('udacity-project-4', 0.9864069819450378),
 ('swift-learning', 0.9855234622955322),
 ('cashback-vanilla-js', 0.9852601885795593),
 ('Muzicx', 0.9845173954963684),
 