In [1]:
from pymongo import MongoClient

import os

# MongoDB connection string used by DatabaseConnect.
# It can be overridden without editing the notebook by setting the
# MY_DATABASE_LINK environment variable before the kernel starts.
#
# single machine setup (mongo is running here locally); run "ip a" to find the ip address.
#   previously used address: 'mongodb://192.168.100.57:27020/'
# multiple machine setup (mongo is running on another machine):
#   'mongodb://192.168.43.78:27020/'
MY_DATABASE_LINK = os.environ.get("MY_DATABASE_LINK", 'mongodb://10.22.112.39:27020/')

class DatabaseConnect:
    """Namespace that groups the MongoDB access helpers used by this notebook."""

    # Connection string handed to MongoClient by every database class below.
    DB_LINK = MY_DATABASE_LINK

    class Base:
        """Shared connect/close logic; subclasses expose one database each."""

        # Client shared by all calls made through the same class (None until connect()).
        client = None
        # Database handle selected by the most recent connect() call.
        database = None

        @classmethod
        def connect(cls, databaseName):
            """Return the database called `databaseName`, opening a client on first use.

            The client is created once and then reused; previously every call
            built a new MongoClient and dropped the old one without closing it,
            so close() could only ever release the most recent client.
            """
            if cls.client is None:
                cls.client = MongoClient(DatabaseConnect.DB_LINK)
            # Access the database
            cls.database = cls.client[databaseName]
            return cls.database

        @classmethod
        def close(cls):
            """Close the shared client (if any) and forget the selected database."""
            # Explicit None checks: pymongo database objects refuse truth-value testing.
            if cls.client is not None:
                cls.client.close()
                cls.client = None
                cls.database = None

        @classmethod
        def getCollection(cls, collectionName):
            """Return a collection of the database selected by the last connect().

            Raises:
                RuntimeError: if connect() has not been called yet.
            """
            if cls.database is None:
                raise RuntimeError("connect() must be called before getCollection()")
            # Index the database, not the client: client[name] would yield a Database.
            return cls.database[collectionName]


    class mini_database(Base):
        """Accessors for the collections of the 'mini_database' database."""

        @classmethod
        def projects(cls):
            """Return the 'projects' collection."""
            return cls.connect('mini_database')['projects']

        @classmethod
        def users(cls):
            """Return the 'users' collection."""
            return cls.connect('mini_database')['users']

# Module-level collection handles used by the rest of the notebook
# (projects first, then users - same order as before).
projectsCollection, usersCollection = (
    DatabaseConnect.mini_database.projects(),
    DatabaseConnect.mini_database.users(),
)
print(projectsCollection)

Collection(Database(MongoClient(host=['10.22.112.39:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [2]:
import numpy as np

In [3]:
def getProjectsStars(projectsNum : int) -> np.ndarray:
    """Return the star counts of the first `projectsNum` projects in the collection.

    Args:
        projectsNum: maximum number of projects to read; a value <= 0 yields
            an empty array.

    Returns:
        A numpy array with the "stars" value of each project read, in cursor order.

    Raises:
        KeyError: if a returned project document has no "stars" field.
    """
    if projectsNum <= 0:
        # Guard explicitly: a cursor limit of 0 means "no limit" in MongoDB.
        return np.array([])

    # Only the "stars" field is needed, so project it and let the server stop
    # after `projectsNum` documents instead of counting on the client side.
    cursor = projectsCollection.find({}, {"_id" : False, "stars" : True}).limit(projectsNum)

    return np.array([proj["stars"] for proj in cursor])

def getProjectsForPeriods(stars : list, periodSize = 50):
    """Bucket star counts into consecutive periods of width `periodSize`.

    Periods are half-open ranges [start, start + periodSize) starting at 0 and
    covering the largest value in `stars`, so every non-negative value falls
    into exactly one period.

    Args:
        stars: sequence (list or numpy array) of star counts.
        periodSize: positive integer width of each period.

    Returns:
        A dict with:
          "periods"         - list of Period objects ([start, end] lists that
                              also carry a `projectsAmount` counter);
          "projects_amount" - list of counts, aligned with "periods".

    Raises:
        ValueError: if `periodSize` is not positive.
    """
    class Period(list):
        """[start, end] pair that also counts the projects assigned to it."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.projectsAmount = 0

        def includes(self, n):
            # Half-open on the right so neighbouring periods never both claim n.
            return self[0] <= n < self[1]

    if periodSize <= 0:
        raise ValueError("periodSize must be a positive number")

    # len() instead of truthiness: `stars` may be a numpy array.
    if len(stars) == 0:
        return {"periods" : [], "projects_amount" : []}

    maxVal = int(max(stars))

    # construct periods: the last one is the period that contains maxVal
    periods = [Period((start, start + periodSize)) for start in range(0, maxVal + 1, periodSize)]
    projectsPerPeriod = [0] * len(periods)

    for projectStars in stars:
        if projectStars < 0:
            # No period covers negative values; a negative index would wrap around.
            continue
        # Periods have equal width, so the bucket index follows directly.
        index = int(projectStars // periodSize)
        periods[index].projectsAmount += 1
        projectsPerPeriod[index] += 1

    return {"periods" : periods, "projects_amount" : projectsPerPeriod}

In [4]:
# Star counts of (at most) the first 1000 projects returned by the collection.
starsArray = getProjectsStars(projectsNum = 1000)

# Largest star count in the sample; the bare name below displays it as the cell output.
maximum = max(starsArray)
maximum

284894

In [5]:
def percentile_threshold(likes, percentile=50):
    """Return the value below which `percentile` percent of `likes` fall.

    Thin wrapper around np.percentile: `likes` is the sample and `percentile`
    is the requested percentile (50 by default).
    """
    cutoff = np.percentile(likes, percentile)
    return cutoff

# The 70th percentile is the cut-off: values above it are the top 30% of the sample.
threshold = percentile_threshold(starsArray, percentile = 70)

print("Threshold:", threshold)
print("Original size: " + str(len(starsArray)))
# Number of projects whose star count is strictly above the threshold.
print(np.count_nonzero(starsArray > threshold))

Threshold: 2.0
Original size: 1000
272


In [6]:
import argostranslate.package
import argostranslate.translate

# Refresh the Argos package index and list everything that can be installed.
argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
print(available_packages)

# Pick the first Chinese -> English package; next() raises StopIteration if there is none.
cn_en_pkg = next(
    pkg
    for pkg in available_packages
    if pkg.from_code == "zh" and pkg.to_code == "en"
)
print(cn_en_pkg)

# Download the package and install it from the downloaded path.
argostranslate.package.install_from_path(cn_en_pkg.download())

[Albanian -> English, Arabic -> English, Azerbaijani -> English, Basque -> English, Bengali -> English, Bulgarian -> English, Catalan -> English, Chinese (traditional) -> English, Chinese -> English, Czech -> English, Danish -> English, Dutch -> English, English -> Albanian, English -> Arabic, English -> Azerbaijani, English -> Basque, English -> Bengali, English -> Bulgarian, English -> Catalan, English -> Chinese, English -> Chinese (traditional), English -> Czech, English -> Danish, English -> Dutch, English -> Esperanto, English -> Estonian, English -> Finnish, English -> French, English -> Galician, English -> German, English -> Greek, English -> Hebrew, English -> Hindi, English -> Hungarian, English -> Indonesian, English -> Irish, English -> Italian, English -> Japanese, English -> Korean, English -> Latvian, English -> Lithuanian, English -> Malay, English -> Norwegian, English -> Persian, English -> Polish, English -> Portuguese, English -> Romanian, English -> Russian, Engli

In [7]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from copy import deepcopy
import re
import os
import json



class CacheAdapter:
    """Base cache adapter: remembers a collection name and stores nothing.

    Subclasses override load() and save() to talk to a real storage.
    """

    def __init__(self, collectionName = ""):
        # Name of the cache entry the adapter works with; empty by default.
        self.collectionName = collectionName

    def load(self):
        """Return the cached data; the base adapter always returns an empty dict."""
        return dict()

    def save(self, data):
        """Ignore `data` and return an empty dict; the base adapter does not persist."""
        return dict()

class JSONAdapter(CacheAdapter):
    """Cache adapter that keeps the data in a JSON file inside a fixed directory.

    `collectionName` is used as the file name. When it is empty, the first file
    that os.walk reports for the cache directory is used instead (the order is
    whatever the filesystem returns).
    """

    # Directory that holds the cache files.
    PREPROCESSED_DATA_CACHE_PATH = "/home/trukhinmaksim/src/mycache"

    @classmethod
    def default(cls):
        """Return an adapter with no explicit file name."""
        return cls()

    def _resolveFileName(self):
        """Return the cache file name to use, falling back to the first file in the directory.

        Raises:
            FileNotFoundError: if no name was given and the cache directory is
                missing or contains no files.
        """
        if self.collectionName:
            return self.collectionName

        # take the first file from the directory (top level only):
        for _, _, fileNames in os.walk(self.PREPROCESSED_DATA_CACHE_PATH):
            if fileNames:
                return fileNames[0]
            break

        raise FileNotFoundError(
            f"No cache file found in {self.PREPROCESSED_DATA_CACHE_PATH}"
        )

    @staticmethod
    def _jsonDefault(value):
        """Convert objects exposing tolist() (e.g. numpy arrays/scalars) for JSON."""
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def load(self):
        """Read and return the content of the cache file.

        The resolved file name is printed before the file is opened.
        """
        # will load data from JSON file, argument 'collectionName' is a file name
        fileName = self._resolveFileName()

        print(fileName)

        with open(os.path.join(self.PREPROCESSED_DATA_CACHE_PATH, fileName), encoding = "utf-8") as file:
            return json.load(file)

    def save(self, data):
        """Write `data` to the cache file as JSON and return `data` unchanged."""
        # will write data into the predefined JSON file
        fileName = self._resolveFileName()

        # Serialize first: if it fails, the existing cache file is left untouched
        # instead of being truncated by open(..., "w").
        serialized = json.dumps(data, default = self._jsonDefault)

        with open(os.path.join(self.PREPROCESSED_DATA_CACHE_PATH, fileName), "w", encoding = "utf-8") as file:
            file.write(serialized)

        return data


class ProjectsDatasetManager:
    def __init__(self, userNumber = float("inf"), validate = lambda data: True, cacheAdapter = None):
        self.userNumber = userNumber
        self.validate = validate
        self.data = None
        self.preprocessed = False
        self.ignoredUsers = []
        
        if cacheAdapter == None: 
            self.cacheAdapter = JSONAdapter()
        else:
            self.cacheAdapter = cacheAdapter

    def ignoreUsers(self, users_ids : list[str]):
        self.ignoredUsers.extend(users_ids)
    
    def fromCache(self):
        self.data = self.cacheAdapter.load()

        # it is assumed, that cache only contains already preprocessed data
        self.preprocessed = True
        return self.data

    def fromDB(self):
        self.data = self.getProjectsDataForUsers()
        self.preprocessed = False # assume, that database contains unprocessed data
        return self.data

    def getProjectsDataForUsers(self) -> dict[str, list]:
        # will return a dictionary, where keys are users ids and values are lists of projects ids, each user has contributed to
        i = 0
        count = self.userNumber
        cursor = usersCollection.find()
        data = {}

        for user in cursor:
            if count <= 0: break
            if user["id"] in self.ignoredUsers: continue # if that user must be ignored, just skip to the next one
            print(f"Scanning user: {i}")
            projectsIDList = user["projects"]

            projects = []

            for proj_id in projectsIDList:
                projectData = projectsCollection.find_one({"id" : proj_id}, {"_id" : False})

                if self.validate(projectData):
                    projects.append(projectData)
        
            if len(projects):
                # if user has at least one project he contributed to
                data[user["id"]] = deepcopy(projects)
                count -= 1

            i += 1

        return data

    def translateText(self, text):
        # will try to use Google Translate, but if any error occures, will use Argos offline translator
        if text.isascii(): return text

        try:
            import asyncio
            import nest_asyncio

            async def inner():
                nonlocal text
                from googletrans import Translator

                async with Translator() as translator:
                    result = await translator.translate(text, dest = "en")

                return result

            nest_asyncio.apply()  # Patch the event loop    
            return asyncio.run(inner()).text

        except Exception as exp:
            # assume, that the text is in Chinese and translate it using argos translator
            print(f"Using Argos for {text[:10]}...")
            return argostranslate.translate.translate(text, "zh", "en")
            """
            if str(type(exp)) == "<class 'httpx.ConnectError'>":
                return text
            else:
                raise exp
            """

    def textPreprocessing(self, text):
        # Initialize tools
        stop_words = set(stopwords.words("english") + ["etc"])
        lemmatizer = WordNetLemmatizer()

        # Translate:
        #text = self.translateText(text)
        # Remove unicode:
        text = text.encode("ascii", "ignore").decode()
        # Process camel case:
        #text = processCamelCase(text)
        # Lower the text:
        text = text.lower()
        # Remove punctuation:
        text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
        # Remove stop-words:
        #text = re.sub("\s" + "|".join(stop_words) + "\s", " ", text)
        # Remove numbers:
        text = re.sub(r"\d", " ", text)
        # Remove new lines:
        text = re.sub(r"\n", " ", text)
        # Remove multiple spaces:
        text = re.sub("\s+", " ", text).strip()
    
        tokens = [word for word in word_tokenize(text) if word not in stop_words and len(word) > 1]  # Tokenize into words
        
        tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Remove stopwords & lemmatize

        return tokens

    def projectsDataPreprocessing(self, projects : np.array(dict), including_text : bool = False) -> np.array([{"tokens" : str, "tags" : list}]):
        # will take in an array of projects and prepare it to be consumed by the model
        # takes: array of projects (as dictionaries); returns: text data and tags for every project in array
        result = []

        for proj in projects:
            joinedText = " ".join([proj["name"], proj["description"]])

            tockens = self.textPreprocessing(joinedText)
            tags = [proj["id"], proj["name"], proj["language"]] + proj["topics"]# if proj["language"] else proj["topics"]
            if including_text:
                result.append({"text" : joinedText, "tokens" : tockens, "tags" : tags})
            else:
                result.append({"tokens" : tockens, "tags" : tags})

        return np.array(result)

    def preprocess(self, _data : dict | None = None, including_text : bool = False) -> dict[str, list]:
        if self.preprocessed: return self.data

        if _data:
            data = _data
        elif self.data:
            data = self.data
        else:
            return self.fromCache()

        for user_id, projs in data.items():
            #print(type(np.array(userProjs)))
            data[user_id] = self.projectsDataPreprocessing(projs, including_text)

        self.preprocessed = True
        return data

In [8]:
def projectDataIsSufficient(projectData):
    """Quality filter for project documents (result is meant to be used as truthy/falsy).

    A project passes when it exists, has a description and has at least one
    topic or a language. More criteria (long description, readme, many stars)
    could be added here.
    """
    # Missing/empty document: hand the falsy value back as is.
    if not projectData:
        return projectData

    description = projectData["description"]
    if not description:
        return description

    # Number of topics when there are any, otherwise the language value.
    return len(projectData["topics"]) or projectData["language"]

# How many users (with at least one accepted project) to pull from the database.
USERS_NUMBER_TO_SCAN = 2

adapter = JSONAdapter()
manager = ProjectsDatasetManager(
    userNumber = USERS_NUMBER_TO_SCAN,
    validate = projectDataIsSufficient,
    cacheAdapter = adapter,
)

# Load the raw projects, tokenize them, then display the resulting dataset.
manager.fromDB()
manager.preprocess()
manager.data

Scanning user: 0
Scanning user: 1


{'github:betiol': array([{'tokens': ['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'], 'tags': ['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']},
        {'tokens': ['smart', 'home', 'mqtt', 'experimental', 'wip', 'bridge', 'google', 'smart', 'home', 'mqtt'], 'tags': ['github:alvarowolfx/smart-home-mqtt', 'smart-home-mqtt', 'JavaScript']},
        {'tokens': ['typeorm', 'seeding', 'delightful', 'way', 'seed', 'test', 'data', 'database'], 'tags': ['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']},
        {'tokens': ['express', 'response', 'formatter', 'sparkle', 'better', 'way', 'format', 'express', 'response'], 'tags': ['github:betiol/express-response-formatter', 'express-response-formatter', 'TypeScript']},
        {'tokens': ['desafio', 'rn', 'desafio', 'react', 'native'], 'tags': ['github:betiol/desafio-rn', 'desafio-rn', 'T

In [9]:
import gensim
from gensim.models.doc2vec import TaggedDocument

In [10]:
def flatternData(data : dict[str, list]) -> np.ndarray:
    """Flatten the per-user dataset into a single array of projects.

    Args:
        data: dict where each key is a user id and each value is an iterable
            of that user's projects.

    Returns:
        A numpy array holding every project, in the dict's iteration order.
    """
    return np.array([
        project
        for projectsArray in data.values()
        for project in projectsArray
    ])

# Flatten only while the data is still the per-user dict, so that re-running
# this cell does not fail on the already flattened array.
if isinstance(manager.data, dict):
    manager.data = flatternData(manager.data)

In [11]:
# One TaggedDocument per project: the tokens are the words, the project tags are the document tags.
documentsCorpus = [
    TaggedDocument(words=project["tokens"], tags=project["tags"])
    for project in manager.data
]

documentsCorpus

[TaggedDocument(words=['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'], tags=['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']),
 TaggedDocument(words=['smart', 'home', 'mqtt', 'experimental', 'wip', 'bridge', 'google', 'smart', 'home', 'mqtt'], tags=['github:alvarowolfx/smart-home-mqtt', 'smart-home-mqtt', 'JavaScript']),
 TaggedDocument(words=['typeorm', 'seeding', 'delightful', 'way', 'seed', 'test', 'data', 'database'], tags=['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']),
 TaggedDocument(words=['express', 'response', 'formatter', 'sparkle', 'better', 'way', 'format', 'express', 'response'], tags=['github:betiol/express-response-formatter', 'express-response-formatter', 'TypeScript']),
 TaggedDocument(words=['desafio', 'rn', 'desafio', 'react', 'native'], tags=['github:betiol/desafio-rn', 'desafio-rn', 'TypeScript']),
 Ta

In [12]:
# creating model

# Doc2Vec settings, passed below as vector_size, epochs, min_count and window.
VECTOR_SIZE = 100
EPOCH_NUMBER = 10
WORD_MIN_AMOUNT = 3
WINDOW_SIZE = 7

model = gensim.models.doc2vec.Doc2Vec(
    vector_size = VECTOR_SIZE,
    window = WINDOW_SIZE,
    min_count = WORD_MIN_AMOUNT,
    epochs = EPOCH_NUMBER,
)

# Build the vocabulary from the corpus, then train on the same corpus.
model.build_vocab(documentsCorpus)
model.train(
    documentsCorpus,
    total_examples = model.corpus_count,
    epochs = model.epochs,
)

In [15]:
# Infer a vector for a token list (the same tokens as the first corpus document shown above).
vector = model.infer_vector(
    [
        'awesome', 'code', 'streamer', 'list', 'code', 'streamer',
        'multiple', 'plataforms', 'like', 'twitch', 'youtube',
    ]
)
vector

array([ 2.4907256e-03, -1.1972073e-03,  3.6697024e-03,  2.5265163e-03,
       -2.9415679e-03,  1.7280114e-03,  2.4052144e-03,  4.5089656e-03,
        2.1553964e-03, -1.2659323e-03,  7.9764368e-04,  4.1981335e-03,
       -1.5625963e-03, -5.7110458e-04,  4.6152840e-03, -1.3620424e-03,
        1.5896213e-03,  2.0621258e-03,  3.1796473e-03, -4.9835176e-04,
       -5.6981742e-05,  2.7644669e-03, -3.5679692e-03, -4.4137128e-03,
       -2.9404813e-03, -3.6065257e-03, -1.5626520e-04, -4.8925811e-03,
        3.3532686e-03,  2.0126282e-04,  4.9375761e-03, -4.3983068e-03,
        3.4958250e-03,  3.4702902e-03,  3.3470346e-03,  4.2208643e-03,
       -3.7907152e-03, -1.1489525e-03,  3.0601346e-03,  3.1064309e-03,
       -3.5949261e-04,  2.0132803e-03, -2.4196398e-03, -1.5073681e-03,
       -2.5729353e-03, -1.8021590e-03, -1.3357193e-03, -1.3665584e-03,
        1.0946267e-03,  9.6173107e-04, -7.9723896e-04, -1.9106090e-04,
        4.2299510e-04,  3.8330387e-03, -4.0994855e-03, -1.0056576e-03,
      

In [16]:
# Display the trained document vector stored at index 0 of model.dv.
model.dv[0]

array([-5.2308156e-03, -5.9791268e-03, -9.8807542e-03,  8.5528456e-03,
        3.5661161e-03,  2.6303172e-04, -9.8806275e-03, -5.1666484e-03,
       -9.7179627e-03,  2.0107795e-03,  2.8303110e-03,  4.6435557e-03,
       -4.2972756e-03, -3.1457066e-03, -3.0787874e-03, -8.7219151e-03,
        2.1724831e-03,  9.2256228e-03, -9.5018670e-03, -3.4580862e-03,
       -3.7699090e-03,  2.6073826e-03, -5.6915567e-03,  2.6206803e-03,
        5.8025215e-03, -8.1068603e-03, -8.3297910e-03, -9.9546695e-03,
        4.9330448e-03, -9.1223074e-03,  5.8419635e-03,  6.8002627e-03,
       -6.5064002e-03, -4.5198812e-03, -1.2548614e-03,  1.6463208e-03,
       -1.4813376e-03, -8.5425414e-03, -3.6026132e-03,  1.7316258e-03,
       -2.0569193e-03, -7.2300420e-03,  4.1846000e-03, -8.5743405e-03,
        2.7115368e-03, -4.6137203e-03,  6.4542773e-04, -2.0573472e-03,
        5.4132282e-03, -8.0025708e-03, -2.1198511e-03, -9.5815660e-05,
       -6.6387774e-03, -6.5261638e-03, -1.9329584e-03,  8.8034747e-03,
      