In [1]:
import sys
# NOTE(review): hardcoded, user-specific absolute path. Presumably needed so that the `src.*`
# imports used in later cells resolve — confirm the directory layout and consider a
# repository-relative path or an installed package instead.
sys.path.append('/home/trukhinmaksim/src')

In [3]:
import numpy as np

In [2]:
from src.utils.DatabaseConnect import DatabaseConnect

# Single-machine setup (MongoDB is running locally on this host).
# Run "ip a" to look up the current IP address.
MY_DATABASE_LINK = 'mongodb://10.22.112.39:27020/' # alternative address: 'mongodb://192.168.100.57:27020/'
# Multi-machine setup (MongoDB is running on another machine).
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

# The link is set on the DatabaseConnect class before any collection is requested from it.
DatabaseConnect.DB_LINK = MY_DATABASE_LINK

# Collection handles used by the rest of the notebook; the printed repr shows which
# server, database and collection `projectsCollection` is bound to.
projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['10.22.112.39:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [4]:
def findApproximateStarsThreshold(projectsNum : int, percentile=50):
    """Estimate a star-count threshold from a sample of the projects collection.

    Reads the "stars" field of at most `projectsNum` documents from
    `projectsCollection` (in the order the cursor yields them) and returns
    the requested percentile of those values.

    Args:
        projectsNum: maximum number of projects to scan.
        percentile: percentile (0-100) of the sampled star counts to return.

    Returns:
        The `percentile`-th percentile of the sampled star counts, as computed
        by `np.percentile`.

    Raises:
        ValueError: if no project was scanned (empty collection or
            `projectsNum` <= 0).
        KeyError: if a scanned document has no "stars" field.
    """
    stars = []

    # pymongo treats limit(0) as "no limit", so a non-positive sample size
    # must be short-circuited instead of being passed to the cursor.
    if projectsNum > 0:
        # Let the server cap the result set and send only the field that is used.
        cursor = projectsCollection.find({}, {"stars": 1, "_id": 0}).limit(projectsNum)
        stars = [proj["stars"] for proj in cursor]

    if not stars:
        # Fail with a clear message rather than numpy's "zero-size array" error.
        raise ValueError("No projects were scanned, cannot compute a stars threshold")

    stars = np.array(stars)
    print(f"Scanned {len(stars)} projects")
    print(f"Maximum stars amount: {stars.max()}")

    threshold = np.percentile(stars, percentile)
    # Strictly greater than the threshold, so ties at the threshold are not counted.
    aboveThreshold = int(np.count_nonzero(stars > threshold))
    print(f"Number of projects, that has more stars than threshold: {aboveThreshold}")

    return threshold

# Keep roughly the top 25% of projects: the threshold is the 75th percentile of the star
# counts of the first 2000 scanned projects (projects tied with the threshold are not above it).
# NOTE(review): the variable name is misspelled ("srars"); left unchanged in case later cells use it.
srarsThreshold = findApproximateStarsThreshold(2000, 75)
print(f"Stars threshold: {srarsThreshold}")

Scanned 2000 projects
Maximum stars amount: 284894
Number of projects, that has more stars than threshold: 476
Stars threshold: 2.0


In [5]:
from src.utils.CacheAdapter import JSONAdapter
from src.utils.DatasetManager import ProjectsDatasetManager

In [6]:
# Validators are used to filter data by quality;
# for example, one can keep only those projects that have a long description, a readme file and many stars.

def projectDataIsSufficient(projectData):
    """Validator: truthy when a project has a description plus topics and/or a language.

    The result is the short-circuit value of the checks rather than a strict
    bool: the first falsy operand met (the project data itself, then its
    description), otherwise the topics count when it is non-zero, otherwise
    the language value.
    """
    if not projectData:
        return projectData

    description = projectData["description"]
    if not description:
        return description

    # A non-empty topics list or a set language is enough on top of the description.
    topicsCount = len(projectData["topics"])
    return topicsCount if topicsCount else projectData["language"]

# Passed as the first argument when the ProjectsDatasetManager is created
# (presumably the number of users whose projects are scanned — confirm in DatasetManager).
USERS_NUMBER_TO_SCAN = 2

# Adapter instance handed to the manager through its `cacheAdapter` argument.
adapter = JSONAdapter()
# The MongoDB collections are injected as class-level attributes of the manager,
# so they must be assigned before the manager is used.
ProjectsDatasetManager.usersCollection = usersCollection
ProjectsDatasetManager.projectsCollection = projectsCollection

In [7]:
# Instantiate the dataset manager with the user limit, the project validator and the cache adapter.
manager = ProjectsDatasetManager(USERS_NUMBER_TO_SCAN, validate = projectDataIsSufficient, cacheAdapter = adapter)

In [8]:
# Fill the manager (presumably from the MongoDB collections — see the "Scanning user" log lines),
# then run its preprocessing step.
manager.fromDB()
manager.preprocess()
# Display the result: per the cell output, a dict keyed by user id (e.g. 'github:betiol') whose
# values are arrays of {'tokens': [...], 'tags': [...]} records.
manager.data

Scanning user: 0
Scanning user: 1


{'github:betiol': array([{'tokens': ['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'], 'tags': ['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']},
        {'tokens': ['smart', 'home', 'mqtt', 'experimental', 'wip', 'bridge', 'google', 'smart', 'home', 'mqtt'], 'tags': ['github:alvarowolfx/smart-home-mqtt', 'smart-home-mqtt', 'JavaScript']},
        {'tokens': ['typeorm', 'seeding', 'delightful', 'way', 'seed', 'test', 'data', 'database'], 'tags': ['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']},
        {'tokens': ['express', 'response', 'formatter', 'sparkle', 'better', 'way', 'format', 'express', 'response'], 'tags': ['github:betiol/express-response-formatter', 'express-response-formatter', 'TypeScript']},
        {'tokens': ['desafio', 'rn', 'desafio', 'react', 'native'], 'tags': ['github:betiol/desafio-rn', 'desafio-rn', 'T

In [9]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [10]:
def flatternData(data : dict[str, list]) -> np.ndarray:
    """Flatten a per-user mapping of projects into a single array of projects.

    Args:
        data: mapping whose keys are user ids and whose values are sequences
            of that user's projects.

    Returns:
        A numpy array built from the flat list of all projects, keeping the
        dict's user order and each user's project order.
    """
    return np.array([project for projectsArray in data.values() for project in projectsArray])

# Replace the per-user dict with the flat array of projects.
# NOTE(review): not idempotent — after this line manager.data is a numpy array, which has no
# `.values()`, so re-running the cell fails until manager.data is rebuilt.
manager.data = flatternData(manager.data)

In [11]:
# One TaggedDocument per project: its tokens become the words, its tags the document tags.
# NOTE(review): the displayed corpus shows tag lists that contain an empty string ('');
# confirm whether that tag is intended before training on it.
documentsCorpus = [
    TaggedDocument(words=entry["tokens"], tags=entry["tags"])
    for entry in manager.data
]

documentsCorpus

[TaggedDocument(words=['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'], tags=['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']),
 TaggedDocument(words=['smart', 'home', 'mqtt', 'experimental', 'wip', 'bridge', 'google', 'smart', 'home', 'mqtt'], tags=['github:alvarowolfx/smart-home-mqtt', 'smart-home-mqtt', 'JavaScript']),
 TaggedDocument(words=['typeorm', 'seeding', 'delightful', 'way', 'seed', 'test', 'data', 'database'], tags=['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']),
 TaggedDocument(words=['express', 'response', 'formatter', 'sparkle', 'better', 'way', 'format', 'express', 'response'], tags=['github:betiol/express-response-formatter', 'express-response-formatter', 'TypeScript']),
 TaggedDocument(words=['desafio', 'rn', 'desafio', 'react', 'native'], tags=['github:betiol/desafio-rn', 'desafio-rn', 'TypeScript']),
 Ta

In [12]:
# Create and train the Doc2Vec model.

VECTOR_SIZE = 100      # dimensionality of the learned vectors
EPOCH_NUMBER = 10      # number of training passes over the corpus
WORD_MIN_AMOUNT = 3    # words occurring fewer times than this are ignored
WINDOW_SIZE = 7        # context window size

model = gensim.models.doc2vec.Doc2Vec(
    vector_size=VECTOR_SIZE,
    window=WINDOW_SIZE,
    min_count=WORD_MIN_AMOUNT,
    epochs=EPOCH_NUMBER,
)

# The vocabulary has to be built from the corpus before training can start.
model.build_vocab(documentsCorpus)
model.train(
    documentsCorpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [13]:
# Infer a vector from a raw token list; these tokens are the same as those of the first
# document in the corpus displayed earlier.
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

array([ 0.00313717, -0.00293201,  0.0009998 ,  0.00167542,  0.00389882,
       -0.00132042, -0.00303681,  0.0015664 ,  0.00481785,  0.00271726,
        0.00135704,  0.00160926, -0.00209741,  0.00426282,  0.00355371,
        0.00260308, -0.00165984,  0.00481925,  0.00342337, -0.0015115 ,
        0.00202207, -0.00307562, -0.00406634, -0.00023533,  0.00318324,
       -0.0011409 ,  0.0043712 , -0.00073656,  0.00431361, -0.00225338,
       -0.00261571, -0.00113109,  0.00234244,  0.00061574,  0.00230329,
       -0.00201598, -0.00204223,  0.00124622, -0.0003735 ,  0.00372844,
        0.00493998, -0.00459943, -0.00336736,  0.00192828,  0.0025907 ,
       -0.00189576, -0.00282402, -0.0032457 , -0.00096452,  0.00221979,
        0.00365021, -0.0028365 , -0.00177467,  0.00439928, -0.00438924,
        0.0033102 , -0.0048881 , -0.00448874,  0.00417335,  0.00184906,
        0.00368874, -0.00208335, -0.00099384, -0.0003592 , -0.00178554,
       -0.0002647 , -0.00108016, -0.00140156,  0.00393741, -0.00

In [14]:
# Show the trained document vector stored at index 0 of model.dv
# (presumably the first tag registered while building the vocabulary — confirm).
model.dv[0]

array([-5.2308156e-03, -5.9791268e-03, -9.8807542e-03,  8.5528456e-03,
        3.5661161e-03,  2.6303172e-04, -9.8806275e-03, -5.1666484e-03,
       -9.7179627e-03,  2.0107795e-03,  2.8303110e-03,  4.6435557e-03,
       -4.2972756e-03, -3.1457066e-03, -3.0787874e-03, -8.7219151e-03,
        2.1724831e-03,  9.2256228e-03, -9.5018670e-03, -3.4580862e-03,
       -3.7699090e-03,  2.6073826e-03, -5.6915567e-03,  2.6206803e-03,
        5.8025215e-03, -8.1068603e-03, -8.3297910e-03, -9.9546695e-03,
        4.9330448e-03, -9.1223074e-03,  5.8419635e-03,  6.8002627e-03,
       -6.5064002e-03, -4.5198812e-03, -1.2548614e-03,  1.6463208e-03,
       -1.4813376e-03, -8.5425414e-03, -3.6026132e-03,  1.7316258e-03,
       -2.0569193e-03, -7.2300420e-03,  4.1846000e-03, -8.5743405e-03,
        2.7115368e-03, -4.6137203e-03,  6.4542773e-04, -2.0573472e-03,
        5.4132282e-03, -8.0025708e-03, -2.1198511e-03, -9.5815660e-05,
       -6.6387774e-03, -6.5261638e-03, -1.9329584e-03,  8.8034747e-03,
      