In [1]:
import sys
sys.path.append('/home/trukhinmaksim/src')

In [2]:
import numpy as np
from time import time
from random import sample
from collections import defaultdict
from numpy import mean

In [3]:
from src.utils.DatabaseConnect import DatabaseConnect

# single machine setup (mongo is running here localy)
# "ip a" for ip address
MY_DATABASE_LINK = 'mongodb://172.26.234.237:27020/' #'mongodb://192.168.100.57:27020/'
# multiple mechine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

DatabaseConnect.DB_LINK = MY_DATABASE_LINK

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['172.26.234.237:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
from src.utils.CacheAdapter import JSONAdapter, JSONMultiFileAdapter, EXP_END_OF_DATA
from src.utils.DatasetManager import ProjectsDatasetManager
from src.utils.validators import projectDataIsSufficient

In [5]:
def feedTextData(manager, batchSize = 1):
    # feeds text data by batches
    tempStorage = []

    i = 0
    while True:
        try:
            while len(tempStorage) >= batchSize:
                if batchSize > 1:
                    yield tempStorage[:batchSize]
                else:
                    yield tempStorage[:batchSize][0] # if I'm requesting only one item per time (training one by one) -> just yield it
                tempStorage = tempStorage[batchSize:]

            manager.fromDB()
            data = flatternData(manager.getTextOnly())
            tempStorage.extend(data)

            i += 1

        except EXP_END_OF_DATA:
            break

    yield tempStorage


In [6]:
def flatternData(data : dict[str, list]) -> np.array(dict):
    # takes in data in form of dict, where each key is a user id and each value is a list of that user's projects
    # returns just flat list of these projects 
    result = []

    for projectsArray in data.values():
        for project in projectsArray:
            result.append(project)

    return result

In [7]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [8]:
CACHE_FILE_NAME = "cache__02-04-2025__(good)_{0}.json"

In [9]:
# using adapter to load data from the cache files

# TODO: place implementation of the 'Corpus' class into a separate file
class Corpus:
    # base class for every data corpus, that will be used by model
    def __init__(self):
        pass
    def __iter__(self):
        pass
    def __getitem__(self, index : int):
        pass

class CacheCorpus(Corpus):
    def __init__(self, manager, cacheFileNameTemplate = CACHE_FILE_NAME):
        self.cacheFileNameTemplate = cacheFileNameTemplate
        self.manager = manager # manager is needed not only for interaction with adapter, but also if I want to use unpreprocessed dataset and preprocess it on the way
        
    def __iter__(self):
        # will feed preprocessed projects data as TaggedDocument instances one by one
        cacheFileName = self.cacheFileNameTemplate
        tempStorage = [] # temporary storage for data, that was read from files

        i = 0
        while True:
            try:
                while len(tempStorage) >= 1:
                    doc = tempStorage[0]
                    yield TaggedDocument(words = doc["tokens"], tags = doc["tags"])
                    tempStorage = tempStorage[1:]

                #self.manager.cacheAdapter.collectionName = cacheFileName.format(i)
                data = flatternData(self.manager.fromCache())
                tempStorage.extend(data)

                i += 1

            except EXP_END_OF_DATA:
            # no data left
                break


In [10]:
class EXP_FEEDER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.corpus' object must be an iterable structure, inherited from 'Corpus' class!")

class EXP_MANAGER_IS_NONE(Exception):
    def __init__(self):
        super().__init__("'Model.manager' object must be a DatasetManager instance!")


class Model(gensim.models.doc2vec.Doc2Vec):
    @staticmethod
    def create(**kwargs):
        model = Model(
            dm_dbow_mode = "DM", 
            pretrain_w2v = False,
            **kwargs
        )
        model.corpus = CacheCorpus(manager)

        return model
    
    def __init__(self, dm_dbow_mode = "DM", pretrain_w2v = False, alpha_init = 0.05, alpha_final = 0.001, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.corpus = None # corpus is an iterator(iterable class object), that will be used in "train" method of Doc2Vec model for data extraction
        self.alphaInit = alpha_init
        self.alphaFinal = alpha_final
        self.dmDbowMode = dm_dbow_mode
        self.pretrainW2V = pretrain_w2v
    
    def train(self):
        # will build vocabulary and train the model on corpus (corpus will be fed by corpus)
        import logging
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
        
        if not isinstance(self.corpus, Corpus): raise EXP_FEEDER_IS_NONE

        start = time()
        self.build_vocab(self.corpus)
        print(f"Vocabulary built in {time() - start} s")

        if self.dmDbowMode != "DM+DBOW":
            start = time()
            super().train(
                self.corpus, 
                total_examples = self.corpus_count, 
                epochs = self.epochs,
                start_alpha = self.alphaInit,
                end_alpha = self.alphaFinal
            )
            print(f"Training is completed in {time() - start} s")
        else:
            # combine DM and DBOW
            pass

    def assess(self, samples = list(), sampleNum = 5, silent = False):
        # simple test of model performance
        # take multiple documents from the training corpus and tries to find simillar in the dataset

        report = lambda s: if not silent: print(s)
        #performanceGrageScale = {50 : "Random", 60 : "Poor", 70 : "Bad", 80 : "Medium", 92 : "Optimal", 97 : "Perfect"}
        totalDocuments = self.corpus_count
        indexes = samples if len(samples) else sample(range(totalDocuments), sampleNum)
        stats = {}

        i = 0
        docs = []
        avgPerformances = []

        for doc in self.corpus:
            if i >= totalDocuments: break
            if i in indexes:
                vector = self.infer_vector(doc.words)
                sims = defaultdict(lambda: 0, self.dv.most_similar([vector], topn = totalDocuments))

                report(f"Assessing document {i} ({doc.tags}). Similarities by tags:")
                stats[i] = {
                    "similarities by tags" : {},
                    "average" : 0
                }

                for tag in doc.tags:
                    stats[i]["similarities by tags"][tag] = sims[tag]
                    report(f"  {tag} : {sims[tag]}")

                avgPerformances.append(mean([sims[tag] for tag in doc.tags]))
                report(f"\n  Average similarity value: {avgPerformances[-1]}\n")
                stats[i]["average"] = avgPerformances[-1]
            i += 1

        stats["Average accuracy"] = mean(avgPerformances)
        report(f"Average accuracy: {stats['Average accuracy']}")

        return stats

    def evaluate(self):
        # will train the model on upon-selected set of parameters and test it's performance
        # TODO: complete this method to evaluate the model
        pass

In [11]:
adapter = JSONMultiFileAdapter(CACHE_FILE_NAME)
#ProjectsDatasetManager.usersCollection = usersCollection
#ProjectsDatasetManager.projectsCollection = projectsCollection
manager = ProjectsDatasetManager(50, cacheAdapter = adapter)

In [None]:
from skopt.space import Real, Integer
from src.utils.AutoTuner import AutoTuner, Param

In [None]:
# creating model

VECTOR_SIZE = 170
EPOCHS_NUMBER = 20
WORD_MIN_COUNT = 5
WINDOW_SIZE = 7
NEGATIVE_SAMPLES_AMOUNT = 6
SUBSAMPLING_THRESHOLD = 1e-5
ALPHA_INIT = 0.05
ALTHA_FINAL = 0.00001
DM_DBOW_MODE = "DM" # "DBOW" "DM+DBOW"

# finetunning is done by twicking model parameters
model = Model(
    vector_size =  VECTOR_SIZE, 
    window =       WINDOW_SIZE, 
    min_count =    WORD_MIN_COUNT, 
    epochs =       EPOCHS_NUMBER, 
    dm_dbow_mode = DM_DBOW_MODE,
    negative =     NEGATIVE_SAMPLES_AMOUNT,
    sample =       SUBSAMPLING_THRESHOLD,
    alpha_init =   ALPHA_INIT,
    alpha_final =  ALTHA_FINAL
)
model.corpus = CacheCorpus(manager)
#model.assess()
model.train()
print(model.epochs)
#model.build_vocab(documentsCorpus)
#model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

In [None]:
# autotunning model parameters

def createModel(**kwargs):
    model = Model(
                vector_size = VECTOR_SIZE,
                dm_dbow_mode = "DM", 
                alpha_init = ALPHA_INIT,
                alpha_final = ALPHA_FINAL,
                **kwargs
            )
    manager.adapter.reset()
    manager.clear()
    model.corpus = CacheCorpus(manager)

    return model

tuner = AutoTuner(createModel, [
    Param(_name = "window",    _type = Integer,  _range = (5, 9),       _initial = 7),
    Param(_name = "min_count", _type = Integer,  _range = (7, 12),      _initial = 7),
    Param(_name = "epochs",    _type = Integer,  _range = (25, 45),     _initial = 25),
    Param(_name = "negative",  _type = Integer,  _range = (5, 11),      _initial = 5),
    Param(_name = "sample",    _type = Real,     _range = (1e-5, 1e-6), _initial = 1e-5),
])

In [13]:
model.assess(5)

Assessing document 14764 (['github:rustdesk/rustdesk', 'rustdesk', 'Rust', 'anydesk', 'dart', 'flutter', 'flutter-apps', 'flutter-desktop', 'flutter-examples', 'flutter-mobile', 'flutter-ui', 'flutter-web', 'hacktoberfest', 'linux', 'p2p', 'rdp', 'remote-control', 'remote-desktop', 'rust', 'rust-lang', 'teamviewer', 'vnc', 'wayland']). Similarities by tags:
  github:rustdesk/rustdesk : 0.18297086656093597
  rustdesk : 0
  Rust : 0
  anydesk : 0
  dart : 0.17062386870384216
  flutter : 0.059013448655605316
  flutter-apps : 0
  flutter-desktop : 0.2565343677997589
  flutter-examples : 0.34016066789627075
  flutter-mobile : 0
  flutter-ui : 0.1642371267080307
  flutter-web : 0.05687282979488373
  hacktoberfest : 0.3369865417480469
  linux : 0.2085321545600891
  p2p : 0
  rdp : 0.28536391258239746
  remote-control : 0
  remote-desktop : 0
  rust : 0
  rust-lang : 0
  teamviewer : 0
  vnc : 0.2859153747558594
  wayland : 0.280897855758667

  Average similarity value: 0.11426560937062553

As

{14764: {'similarities by tags': {'github:rustdesk/rustdesk': 0.18297086656093597,
   'rustdesk': 0,
   'Rust': 0,
   'anydesk': 0,
   'dart': 0.17062386870384216,
   'flutter': 0.059013448655605316,
   'flutter-apps': 0,
   'flutter-desktop': 0.2565343677997589,
   'flutter-examples': 0.34016066789627075,
   'flutter-mobile': 0,
   'flutter-ui': 0.1642371267080307,
   'flutter-web': 0.05687282979488373,
   'hacktoberfest': 0.3369865417480469,
   'linux': 0.2085321545600891,
   'p2p': 0,
   'rdp': 0.28536391258239746,
   'remote-control': 0,
   'remote-desktop': 0,
   'rust': 0,
   'rust-lang': 0,
   'teamviewer': 0,
   'vnc': 0.2859153747558594,
   'wayland': 0.280897855758667},
  'average': 0.11426560937062553},
 32397: {'similarities by tags': {'github:Tran-Quyen/fe-practice-great-examples': 0.33979490399360657,
   'fe-practice-great-examples': 0.8404783010482788,
   'CSS': 0},
  'average': 0.3934244016806285},
 37909: {'similarities by tags': {'github:incubated-geek-cc/mindful-pred

In [14]:
i = 1
words = []
tags = []
for proj in model.corpus:
    if i <= 0: break
    words = proj.words
    tags = proj.tags
    i -= 1

print(tags)
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

['github:glensc/nagios-plugin-check_raid', 'nagios-plugin-check_raid', 'Perl', 'icinga', 'icinga-plugin', 'monitoring', 'nagios', 'nagios-plugin', 'raid', 'sensu', 'sensu-plugin']


array([ 0.0920861 , -0.05375765, -0.08989536, -0.01302033,  0.06842194],
      dtype=float32)

In [15]:
model.dv.most_similar([vector], topn=len(model.dv))

[('reverseproxy', 0.9986116290092468),
 ('github:joerick/cibuildwheel', 0.9964580535888672),
 ('pacman', 0.9963006377220154),
 ('jsxToString-repo', 0.9959427118301392),
 ('github:johnnyzhang1992/starimgcn-react', 0.995678722858429),
 ('autoit-lang', 0.9956233501434326),
 ('CP-Buddy', 0.9955820441246033),
 ('github:gitsoufiane/react-hooks', 0.9954379796981812),
 ('github:SiraKen/learn-clojure', 0.9953967332839966),
 ('github:BugiDev/react-native-calendar-strip', 0.9946708679199219),
 ('start-to-storybook', 0.9941594004631042),
 ('github:Harry-Hopkinson/wordle', 0.9937389492988586),
 ('ballerina', 0.9935576319694519),
 ('algorythms', 0.9934346675872803),
 ('bookkeeping', 0.992898166179657),
 ('pathfinding-algorithm', 0.9928481578826904),
 ('ec21', 0.9922788143157959),
 ('msbuild', 0.9919800162315369),
 ('database-dump', 0.9919605851173401),
 ('PartsApp', 0.9919584393501282),
 ('swr-poc', 0.9915450215339661),
 ('github:mkdecisiondev/moqui-graphql', 0.9907367825508118),
 ('github:opexdev/O