In [1]:
from pymongo import MongoClient

# single machine setup (mongo is running here locally)
MY_DATABASE_LINK = 'mongodb://192.168.1.191:27020/' #'mongodb://192.168.100.57:27020/'
# multiple machine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

class DatabaseConnect:
    """Namespace for the MongoDB connection helpers used by this notebook."""

    # Connection string passed to MongoClient (taken from the module-level setting).
    DB_LINK = MY_DATABASE_LINK

    class Base:
        """Keeps one MongoClient per class and hands out databases from it."""

        # The MongoClient currently in use, or None while disconnected.
        client = None

        @classmethod
        def connect(cls, databaseName):
            """Return the database called `databaseName`.

            A client is opened only when none is open yet. Call `close()` first
            to force a fresh connection (e.g. after changing `DB_LINK`).
            """
            # Reuse the open client: building a new MongoClient on every call
            # would overwrite the previous one without ever closing it.
            if cls.client is None:
                cls.client = MongoClient(DatabaseConnect.DB_LINK)
            # Access the database
            return cls.client[databaseName]

        @classmethod
        def close(cls):
            """Close the open client, if any, and forget it."""
            openClient = cls.client
            if openClient is not None:
                openClient.close()
                cls.client = None

        @classmethod
        def getCollection(cls, collectionName):
            """Return `client[collectionName]`; requires a prior `connect()` call.

            NOTE(review): indexing the client by name is the same operation
            `connect()` uses to obtain a *database*, so despite its name this
            method returns a database handle rather than a collection - confirm
            the intended behaviour with the callers before changing it.
            """
            return cls.client[collectionName]


    class mini_database(Base):
        """Accessors for the collections of the 'mini_database' database."""

        @classmethod
        def projects(cls):
            """Return the 'projects' collection."""
            return cls.connect('mini_database')['projects']

        @classmethod
        def users(cls):
            """Return the 'users' collection."""
            return cls.connect('mini_database')['users']

# Notebook-wide handles on the two collections every later cell works with.
projectsCollection, usersCollection = (
    DatabaseConnect.mini_database.projects(),
    DatabaseConnect.mini_database.users(),
)
print(projectsCollection)

Collection(Database(MongoClient(host=['192.168.1.191:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [2]:
import numpy as np

In [3]:
def getProjectsStars(projectsNum : int) -> np.ndarray:
    """Collect the `stars` value of up to `projectsNum` documents of `projectsCollection`.

    Parameters:
        projectsNum: maximum number of projects to read; zero or a negative
            number yields an empty array, `float("inf")` reads everything.

    Returns:
        A numpy array with one star count per project, in cursor order.

    Raises:
        KeyError: if a document has no "stars" field.
    """
    stars = []
    if projectsNum <= 0:
        return np.array(stars)

    # Only the "stars" field is needed, so ask the server for nothing else.
    cursor = projectsCollection.find({}, {"stars" : True, "_id" : False})
    try:
        for proj in cursor:
            stars.append(proj["stars"])
            # Stop as soon as the quota is met instead of pulling one more document first.
            if len(stars) >= projectsNum: break
    finally:
        # Release the server-side cursor even when we stop early or fail.
        cursor.close()

    return np.array(stars)

def getProjectsForPeriods(stars : list, periodSize = 50):
    """Bucket star counts into consecutive ranges ("periods") of width `periodSize`.

    Periods are half-open ranges `[start, start + periodSize)` that begin at 0
    and go on until the largest value of `stars` is covered, so every
    non-negative value belongs to exactly one period.

    Parameters:
        stars: sequence (list or numpy array) of star counts.
        periodSize: positive integer width of one period.

    Returns:
        A dict with
          "periods"         - list of `Period` objects (`[start, end]` lists whose
                              `projectsAmount` attribute holds the bucket count),
          "projects_amount" - list with the same counts, index-aligned with "periods".
        Both lists are empty when `stars` is empty.

    Raises:
        ValueError: if `periodSize` is not positive.
    """
    class Period(list):
        """A `[start, end)` star range that remembers how many projects fell into it."""

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.projectsAmount = 0

        def includes(self, n):
            # Half-open on purpose: a value on a boundary belongs to one period only.
            return self[0] <= n < self[1]

    if periodSize <= 0:
        raise ValueError("periodSize must be a positive number")
    if len(stars) == 0:
        return {"periods" : [], "projects_amount" : []}

    maxVal = int(max(stars))
    # construct periods: the last one starts at or below maxVal, so maxVal is always covered
    periods = [Period((start, start + periodSize)) for start in range(0, maxVal + 1, periodSize)]
    projectsPerPeriod = [0] * len(periods)

    for projectStars in stars:
        # Periods all have the same width, so the bucket index follows directly.
        index = int(projectStars // periodSize)
        # Values outside every period (e.g. negative ones) are not counted.
        if 0 <= index < len(periods):
            periods[index].projectsAmount += 1
            projectsPerPeriod[index] += 1

    return {"periods" : periods, "projects_amount" : projectsPerPeriod}

In [4]:
# Star counts of (at most) the first 1000 projects, and the largest value among them.
starsArray = getProjectsStars(projectsNum = 1000)

maximum = max(starsArray)
maximum

284894

In [5]:
def percentile_threshold(likes, percentile=50):
    """Return the `percentile`-th percentile of `likes`, as computed by `np.percentile`."""
    cutoff = np.percentile(likes, q = percentile)
    return cutoff

# 70th percentile of the star counts; values strictly above it make up at most the top 30%.
threshold = percentile_threshold(starsArray, percentile = 70)

print("Threshold:", threshold)
print(f"Original size: {len(starsArray)}")
# Number of projects whose star count is strictly greater than the threshold.
print(len(starsArray[starsArray > threshold]))

Threshold: 2.0
Original size: 1000
272


In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from copy import deepcopy
import re
import os
import json

class CacheAdapter:
    """Do-nothing cache backend; defines the `load` / `save` interface of an adapter."""

    def __init__(self, collectionName = ""):
        # Name of the stored collection; how it is interpreted is up to the subclass.
        self.collectionName = collectionName

    def load(self):
        """Return the cached data - the base adapter stores nothing, so an empty dict."""
        return {}

    def save(self, data):
        """Accept `data` without storing it and return an empty dict."""
        return {}

class JSONAdapter(CacheAdapter):
    """Cache backend that keeps the data in a JSON file inside the cache directory.

    `collectionName` is the name of the file inside `PREPROCESSED_DATA_CACHE_PATH`;
    when it is empty, the first file reported by `os.walk` for that directory is used.
    """

    # Directory that holds the cache files.
    PREPROCESSED_DATA_CACHE_PATH = "/home/trukhinmaksim/src/mycache"

    @classmethod
    def default(cls):
        """Return an adapter without an explicit file name."""
        return cls()

    def _resolveFileName(self):
        """Return the cache file name, falling back to the first file in the cache directory.

        Raises:
            FileNotFoundError: if no name was given and the directory is missing
                or contains no files.
        """
        if self.collectionName:
            return self.collectionName

        # take the first file from the directory (the default guards against a missing directory):
        _, _, fileNames = next(os.walk(self.PREPROCESSED_DATA_CACHE_PATH), (None, [], []))
        if not fileNames:
            raise FileNotFoundError(
                f"No cache file found in '{self.PREPROCESSED_DATA_CACHE_PATH}'"
            )
        return fileNames[0]

    @staticmethod
    def _toSerializable(value):
        """`json` fallback: turn objects offering `tolist()` (e.g. numpy arrays) into lists."""
        if hasattr(value, "tolist"):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def load(self):
        """Read and return the content of the cache file.

        Raises:
            FileNotFoundError: if the cache file cannot be located.
        """
        fileName = self._resolveFileName()

        print(fileName)

        with open(os.path.join(self.PREPROCESSED_DATA_CACHE_PATH, fileName), encoding = "utf-8") as file:
            return json.load(file)

    def save(self, data):
        """Write `data` into the cache file as JSON and return `data`.

        Raises:
            FileNotFoundError: if no file name was given and there is no existing
                file to overwrite.
            TypeError: if `data` contains values that cannot be serialized.
        """
        fileName = self._resolveFileName()

        # Serialize first: if this fails, the existing cache file is left untouched
        # instead of being truncated by the open(..., "w") below.
        serialized = json.dumps(data, default = self._toSerializable)

        with open(os.path.join(self.PREPROCESSED_DATA_CACHE_PATH, fileName), "w", encoding = "utf-8") as file:
            file.write(serialized)

        return data


class ProjectsDatasetManager:
    def __init__(self, userNumber = float("inf"), validate = lambda data: True, cacheAdapter = None):
        self.userNumber = userNumber
        self.validate = validate
        self.data = None
        self.preprocessed = False

        if cacheAdapter == None: 
            self.cacheAdapter = JSONAdapter()
        else:
            self.cacheAdapter = cacheAdapter

    def fromCache(self):
        self.data = self.cacheAdapter.load()

        # it is assumed, that cache only contains already preprocessed data
        self.preprocessed = True
        return self.data

    def fromDB(self):
        self.data = self.getProjectsDataForUsers()
        self.preprocessed = False # assume, that database contains unprocessed data
        return self.data

    def getProjectsDataForUsers(self) -> dict[str, list]:
        # will return a dictionary, where keys are users ids and values are lists of projects ids, each user has contributed to
        i = 0
        count = self.userNumber
        cursor = usersCollection.find()
        data = {}

        for user in cursor:
            if count <= 0: break
            print(f"Scanning user: {i}")
            projectsIDList = user["projects"]

            projects = []

            for proj_id in projectsIDList:
                projectData = projectsCollection.find_one({"id" : proj_id}, {"_id" : False})

                if self.validate(projectData):
                    projects.append(projectData)
        
            if len(projects):
                data[user["id"]] = deepcopy(projects)
                count -= 1

            i += 1

        return data

    def translateText(self, text):
        if text.isascii(): return text
    
        try:
            import asyncio
            import nest_asyncio

            async def inner():
                nonlocal text
                from googletrans import Translator

                async with Translator() as translator:
                    result = await translator.translate(text, dest = "en")

                return result

            nest_asyncio.apply()  # Patch the event loop    
            return asyncio.run(inner()).text

        except Exception as exp:
            if str(type(exp)) == "<class 'httpx.ConnectError'>":
                return text
            else:
                raise exp

    def textPreprocessing(self, text):
        # Initialize tools
        stop_words = set(stopwords.words("english") + ["etc"])
        lemmatizer = WordNetLemmatizer()

        # Translate:
        text = self.translateText(text)
        # Remove unicode:
        text = text.encode("ascii", "ignore").decode()
        # Process camel case:
        #text = processCamelCase(text)
        # Lower the text:
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
        # Remove stop-words:
        #text = re.sub("\s" + "|".join(stop_words) + "\s", " ", text)
        # Remove new lines
        text = re.sub(r"\n", " ", text)
        # Remove multiple spaces:
        text = re.sub("\s+", " ", text).strip()
    
        tokens = [word for word in word_tokenize(text) if not word in stop_words]  # Tokenize into words
        tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords & lemmatize

        return tokens
    
    def projectsDataPreprocessing(self, projects : np.array(dict)) -> np.array([{"tokens" : str, "tags" : list}]):
        # will take in an array of projects and prepare it to be consumed by the model
        # takes: array of projects (as dictionaries); returns: text data and tags for every project in array
        result = []

        for proj in projects:
            tockens = self.textPreprocessing(" ".join([proj["name"], proj["description"]]))
            tags = [proj["id"], proj["name"], proj["language"]] + proj["topics"]# if proj["language"] else proj["topics"]
            result.append({"tokens" : tockens, "tags" : tags})

        return np.array(result)

    def preprocess(self, _data : dict | None = None) -> dict:
        if self.preprocessed: return self.data

        if _data:
            data = _data
        elif self.data:
            data = self.data
        else:
            return self.fromCache()

        for user_id, projs in data.items():
            #print(type(np.array(userProjs)))
            data[user_id] = self.projectsDataPreprocessing(projs)

        self.preprocessed = True
        return data

In [16]:
def projectDataIsSufficient(projectData):
    """Tell (as a truthy/falsy value) whether a project has enough data to be used.

    A project qualifies when it exists, has a description, and has at least
    one topic or a language. The first falsy operand found is returned as is;
    otherwise the number of topics or, when there are none, the language.
    """
    if not projectData:
        return projectData

    description = projectData["description"]
    if not description:
        return description

    topicsCount = len(projectData["topics"])
    return topicsCount if topicsCount else projectData["language"]

# Build a manager limited to 2 users whose projects pass projectDataIsSufficient,
# pull the raw data from the database, preprocess it and show the result.
adapter = JSONAdapter()
manager = ProjectsDatasetManager(
    userNumber = 2,
    validate = projectDataIsSufficient,
    cacheAdapter = adapter,
)
manager.fromDB()
manager.preprocess()
manager.data

Scanning user: 0


{'github:betiol': array([{'tokens': ['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'], 'tags': ['github:lucasfloriani/awesome-code-streamers', 'awesome-code-streamers', '', 'awesome', 'awesome-list', 'lists', 'resources']},
        {'tokens': ['smart', 'home', 'mqtt', 'experimental', 'wip', 'bridge', 'google', 'smart', 'home', 'mqtt'], 'tags': ['github:alvarowolfx/smart-home-mqtt', 'smart-home-mqtt', 'JavaScript']},
        {'tokens': ['typeorm', 'seeding', 'delightful', 'way', 'seed', 'test', 'data', 'database'], 'tags': ['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']},
        {'tokens': ['express', 'response', 'formatter', 'sparkle', 'better', 'way', 'format', 'express', 'response'], 'tags': ['github:betiol/express-response-formatter', 'express-response-formatter', 'TypeScript']},
        {'tokens': ['desafio', 'rn', 'desafio', 'react', 'native'], 'tags': ['github:betiol/desafio-rn', 'desafio-rn', 'T

In [None]:
#import gensim
#from gensim.models.doc2vec import TaggedDocument
#from nltk.tokenize import word_tokenize