In [1]:
from pymongo import MongoClient

# single machine setup (mongo is running here localy)
MY_DATABASE_LINK = 'mongodb://192.168.1.191:27020/' #'mongodb://192.168.100.57:27020/'
# multiple mechine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

class DatabaseConnect:
    DB_LINK = MY_DATABASE_LINK

    class Base:
        client = None
        @classmethod
        def connect(cls, databaseName):
            cls.client = MongoClient(DatabaseConnect.DB_LINK)
            # Access the database
            return cls.client[databaseName]

        @classmethod
        def close(cls):
            if cls.client:
                cls.client.close()
                cls.client = None

        @classmethod
        def getCollection(cls, collectionName):
            return cls.client[collectionName]


    class mini_database(Base):
        @classmethod
        def projects(cls):
            #print(cls.connect)
            
            return cls.connect('mini_database')['projects']
        @classmethod
        def users(cls):
            return cls.connect('mini_database')['users']

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['192.168.1.191:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [2]:
import numpy as np

In [3]:
def getProjectsStars(projectsNum : int) -> list:
    count = projectsNum
    cursor = projectsCollection.find()
    stars = []

    for proj in cursor:
        if count <= 0: break
        stars.append(proj["stars"])
        count -= 1

    return np.array(stars)

def getProjectsForPeriods(stars : list, periodSize = 50):
    class Period(list):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.projectsAmount = 0

        def includes(self, n):
            return (self[0] <= n) and (self[1] >= n)


    maxVal = max(stars)
    print(maxVal)
    # construct periods:
    periods = [Period((i, i + periodSize)) for i in range(0, maxVal + periodSize, periodSize)]
    if maxVal % periodSize: periods += [Period((maxVal - (maxVal % periodSize), maxVal))]

    print(len(periods))
    projectsPerPeriod = [0] * len(periods)
    projectsCursor = projectsCollection.find()

    for projectStars in stars:
        for index, period in enumerate(periods):
            if period.includes(projectStars):
                period.projectsAmount += 1
                #projectsPerPeriod[index] += 1

    return {"periods" : periods, "projects_amount" : projectsPerPeriod}

In [4]:
starsArray = getProjectsStars(1000)

maximum = max(starsArray)
maximum

284894

In [5]:
def percentile_threshold(likes, percentile=50):
    return np.percentile(likes, percentile)

threshold = percentile_threshold(starsArray, 70)  # Keep top 30% posts

print("Threshold:", threshold)
print(f"Original size: {len(starsArray)}")
print(len(starsArray[starsArray > threshold]))

Threshold: 2.0
Original size: 1000
272


In [7]:
from copy import deepcopy

def projectDataIsValid(projectData : dict):
    # will check if project data can actually be used in the training (can specify other conditions)
    return projectData and len(projectData["description"]) > 0

def getProjectsDataForUsers(usersNumber = float("inf")) -> dict[str, list]:
    # will return a dictionary, where keys are users ids and values are lists of projects ids, each user has contributed to
    i = 0
    count = usersNumber
    cursor = usersCollection.find()
    data = {}

    for user in cursor:
        if count <= 0: break
        print(f"Scanning user: {i}")
        projectsIDList = user["projects"]

        projects = []

        for proj_id in projectsIDList:
            projectData = projectsCollection.find_one({"id" : proj_id}, {"_id" : False})

            if projectDataIsValid(projectData):
                projects.append(projectData)
        
        if len(projects):
            data[user["id"]] = deepcopy(projects)
            count -= 1

        i += 1

    return data

usersProjects = getProjectsDataForUsers(2)

Scanning user: 0
Scanning user: 1


In [40]:
def translateText(text):
    if text.isascii(): return text

    try:
        async def inner():
            nonlocal text
            from googletrans import Translator

            async with Translator() as translator:
                result = await translator.translate(text, dest = "en")

            return result
            
        import asyncio
        import nest_asyncio

        nest_asyncio.apply()  # Patch the event loop    
        return asyncio.run(inner()).text

    except Exception as exp:
        if str(type(exp)) == "<class 'httpx.ConnectError'>":
            return text
        else:
            raise exp

In [41]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

def textPreprocessing(text):
    # Initialize tools
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # Translate:
    text = translateText(text)
    # Remove unicode:
    text = text.encode("ascii", "ignore").decode()
    # Process camel case:
    #text = processCamelCase(text)
    # Lower the text:
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
    # Remove stop-words:
    #text = re.sub("\s" + "|".join(stop_words) + "\s", " ", text)
    # Remove new lines
    text = re.sub(r"\n", " ", text)
    # Remove multiple spaces:
    text = re.sub("\s+", " ", text).strip()

    tokens = [word for word in word_tokenize(text) if not word in stop_words]  # Tokenize into words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Remove stopwords & lemmatize

    return tokens


def projectsDataPreprocessing(projects : np.array(dict)) -> np.array([{"tokens" : str, "tags" : list}]):
    # will take in an array of projects and prepare it to be consumed by the model
    # takes: array of projects (as dictionaries); returns: text data and tags for every project in array
    result = []

    for proj in projects:
        text = textPreprocessing(" ".join([proj["name"], proj["description"]]))
        tags = [proj["id"], proj["name"], proj["language"]] + proj["topics"] if proj["language"] else proj["topics"]
        result.append({"tokens" : text, "tags" : tags})

    return result

processedProjects = {}
for user_id, userProjs in usersProjects.items():
    #print(type(np.array(userProjs)))
    processedProjects[user_id] = projectsDataPreprocessing(userProjs)

#textPreprocessing("JPush's officially supported React Native plugin (Android & iOS). 你好")
#re.sub("\s(" + "|".join(stop_words) + ")\s", " ", " on sdf of sdfx a kjn the")

In [37]:
usersProjects

{'github:betiol': [{'id': 'github:lucasfloriani/awesome-code-streamers',
   'name': 'awesome-code-streamers',
   'description': 'List of code streamers from multiples plataforms like Twitch, Youtube, etc',
   'language': '',
   'topics': ['awesome', 'awesome-list', 'lists', 'resources'],
   'stars': 57},
  {'id': 'github:alvarowolfx/smart-home-mqtt',
   'name': 'smart-home-mqtt',
   'description': 'Experimental and WIP - Bridge between Google Smart Home and MQTT ',
   'language': 'JavaScript',
   'topics': [],
   'stars': 10},
  {'id': 'github:betiol/typeorm-seeding',
   'name': 'typeorm-seeding',
   'description': 'A delightful way to seed test data into your database.',
   'language': 'TypeScript',
   'topics': [],
   'stars': 1},
  {'id': 'github:betiol/express-response-formatter',
   'name': 'express-response-formatter',
   'description': ':sparkles: Better way to format Express response',
   'language': 'TypeScript',
   'topics': [],
   'stars': 1},
  {'id': 'github:betiol/desafio

In [38]:
processedProjects

{'github:betiol': [{'tokens': ['awesome',
    'code',
    'streamer',
    'list',
    'code',
    'streamer',
    'multiple',
    'plataforms',
    'like',
    'twitch',
    'youtube',
    'etc'],
   'tags': ['awesome', 'awesome-list', 'lists', 'resources']},
  {'tokens': ['smart',
    'home',
    'mqtt',
    'experimental',
    'wip',
    'bridge',
    'google',
    'smart',
    'home',
    'mqtt'],
   'tags': ['github:alvarowolfx/smart-home-mqtt',
    'smart-home-mqtt',
    'JavaScript']},
  {'tokens': ['typeorm',
    'seeding',
    'delightful',
    'way',
    'seed',
    'test',
    'data',
    'database'],
   'tags': ['github:betiol/typeorm-seeding', 'typeorm-seeding', 'TypeScript']},
  {'tokens': ['express',
    'response',
    'formatter',
    'sparkle',
    'better',
    'way',
    'format',
    'express',
    'response'],
   'tags': ['github:betiol/express-response-formatter',
    'express-response-formatter',
    'TypeScript']},
  {'tokens': ['desafio', 'rn', 'desafio', 'reac

'qwer sdfg sdfg sdfg sdf sdfg '