In [1]:
import sys
sys.path.append('/home/trukhinmaksim/src')

In [2]:
from pymongo import MongoClient

# single machine setup (mongo is running here localy)
# "ip a" for ip address
MY_DATABASE_LINK = 'mongodb://10.22.112.39:27020/' #'mongodb://192.168.100.57:27020/'
# multiple mechine setup (mongo is running on another machine)
#MY_DATABASE_LINK = 'mongodb://192.168.43.78:27020/'

class DatabaseConnect:
    DB_LINK = MY_DATABASE_LINK

    class Base:
        client = None
        @classmethod
        def connect(cls, databaseName):
            cls.client = MongoClient(DatabaseConnect.DB_LINK)
            # Access the database
            return cls.client[databaseName]

        @classmethod
        def close(cls):
            if cls.client:
                cls.client.close()
                cls.client = None

        @classmethod
        def getCollection(cls, collectionName):
            return cls.client[collectionName]


    class mini_database(Base):
        @classmethod
        def projects(cls):
            #print(cls.connect)
            
            return cls.connect('mini_database')['projects']
        @classmethod
        def users(cls):
            return cls.connect('mini_database')['users']

projectsCollection = DatabaseConnect.mini_database.projects()
usersCollection = DatabaseConnect.mini_database.users()
print(projectsCollection)

Collection(Database(MongoClient(host=['10.22.112.39:27020'], document_class=dict, tz_aware=False, connect=True), 'mini_database'), 'projects')


In [3]:
import numpy as np

In [4]:
def findApproximateStarsThreshold(projectsNum : int, percentile=50):
    count = projectsNum
    cursor = projectsCollection.find()
    stars = []

    for proj in cursor:
        if count <= 0: break
        stars.append(proj["stars"])
        count -= 1

    stars = np.array(stars)
    print(f"Scanned {projectsNum - count} projects")
    print(f"Maximum stars amount: {stars.max()}")

    threshold = np.percentile(stars, percentile)
    print(f"Number of projects, that has more stars than threshold: {len(stars[stars > threshold])}")

    return threshold

# Keep top 30% of projects
srarsThreshold = findApproximateStarsThreshold(2000, 75)
print(f"Stars threshold: {srarsThreshold}")

Scanned 2000 projects
Maximum stars amount: 284894
Number of projects, that has more stars than threshold: 476
Stars threshold: 2.0


In [5]:
import argostranslate.package
import argostranslate.translate

argostranslate.package.update_package_index()
available_packages = argostranslate.package.get_available_packages()
print(available_packages)
cn_en_pkg = next(filter(lambda pkg: pkg.from_code == "zh" and pkg.to_code == "en", available_packages))
print(cn_en_pkg)
argostranslate.package.install_from_path(cn_en_pkg.download())

[Albanian -> English, Arabic -> English, Azerbaijani -> English, Basque -> English, Bengali -> English, Bulgarian -> English, Catalan -> English, Chinese (traditional) -> English, Chinese -> English, Czech -> English, Danish -> English, Dutch -> English, English -> Albanian, English -> Arabic, English -> Azerbaijani, English -> Basque, English -> Bengali, English -> Bulgarian, English -> Catalan, English -> Chinese, English -> Chinese (traditional), English -> Czech, English -> Danish, English -> Dutch, English -> Esperanto, English -> Estonian, English -> Finnish, English -> French, English -> Galician, English -> German, English -> Greek, English -> Hebrew, English -> Hindi, English -> Hungarian, English -> Indonesian, English -> Irish, English -> Italian, English -> Japanese, English -> Korean, English -> Latvian, English -> Lithuanian, English -> Malay, English -> Norwegian, English -> Persian, English -> Polish, English -> Portuguese, English -> Romanian, English -> Russian, Engli

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from copy import deepcopy
import re

from src.utils.CacheAdapter import JSONAdapter

class ProjectsDatasetManager:
    def __init__(self, userNumber = float("inf"), validate = lambda data: True, cacheAdapter = None):
        self.userNumber = userNumber
        self.validate = validate
        self.data = None
        self.preprocessed = False
        self.ignoredUsers = []
        
        if cacheAdapter == None: 
            self.cacheAdapter = JSONAdapter()
        else:
            self.cacheAdapter = cacheAdapter

    def ignoreUsers(self, users_ids : list[str]):
        self.ignoredUsers.extend(users_ids)
    
    def fromCache(self):
        self.data = self.cacheAdapter.load()

        # it is assumed, that cache only contains already preprocessed data
        self.preprocessed = True
        return self.data

    def fromDB(self):
        self.data = self.getProjectsDataForUsers()
        self.preprocessed = False # assume, that database contains unprocessed data
        return self.data

    def getProjectsDataForUsers(self) -> dict[str, list]:
        # will return a dictionary, where keys are users ids and values are lists of projects ids, each user has contributed to
        i = 0
        count = self.userNumber
        cursor = usersCollection.find()
        data = {}

        for user in cursor:
            if count <= 0: break
            if user["id"] in self.ignoredUsers: continue # if that user must be ignored, just skip to the next one
            print(f"Scanning user: {i}")
            projectsIDList = user["projects"]

            projects = []

            for proj_id in projectsIDList:
                projectData = projectsCollection.find_one({"id" : proj_id}, {"_id" : False})

                if self.validate(projectData):
                    projects.append(projectData)
        
            if len(projects):
                # if user has at least one project he contributed to
                data[user["id"]] = deepcopy(projects)
                count -= 1

            i += 1

        return data

    def translateText(self, text):
        # will try to use Google Translate, but if any error occures, will use Argos offline translator
        if text.isascii(): return text

        try:
            import asyncio
            import nest_asyncio

            async def inner():
                nonlocal text
                from googletrans import Translator

                async with Translator() as translator:
                    result = await translator.translate(text, dest = "en")

                return result

            nest_asyncio.apply()  # Patch the event loop    
            return asyncio.run(inner()).text

        except Exception as exp:
            # assume, that the text is in Chinese and translate it using argos translator
            print(f"Using Argos for {text[:10]}...")
            return argostranslate.translate.translate(text, "zh", "en")
            """
            if str(type(exp)) == "<class 'httpx.ConnectError'>":
                return text
            else:
                raise exp
            """

    def textPreprocessing(self, text):
        # Initialize tools
        stop_words = set(stopwords.words("english") + ["etc"])
        lemmatizer = WordNetLemmatizer()

        # Translate:
        #text = self.translateText(text)
        # Remove unicode:
        text = text.encode("ascii", "ignore").decode()
        # Process camel case:
        #text = processCamelCase(text)
        # Lower the text:
        text = text.lower()
        # Remove punctuation:
        text = text.translate(str.maketrans(string.punctuation, " " * len(string.punctuation)))
        # Remove stop-words:
        #text = re.sub("\s" + "|".join(stop_words) + "\s", " ", text)
        # Remove numbers:
        text = re.sub(r"\d", " ", text)
        # Remove new lines:
        text = re.sub(r"\n", " ", text)
        # Remove multiple spaces:
        text = re.sub("\s+", " ", text).strip()
    
        tokens = [word for word in word_tokenize(text) if word not in stop_words and len(word) > 1]  # Tokenize into words
        
        tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Remove stopwords & lemmatize

        return tokens

    def projectsDataPreprocessing(self, projects : np.array(dict), including_text : bool = False) -> np.array([{"tokens" : str, "tags" : list}]):
        # will take in an array of projects and prepare it to be consumed by the model
        # takes: array of projects (as dictionaries); returns: text data and tags for every project in array
        result = []

        for proj in projects:
            joinedText = " ".join([proj["name"], proj["description"]])

            tockens = self.textPreprocessing(joinedText)
            tags = [proj["id"], proj["name"], proj["language"]] + proj["topics"]# if proj["language"] else proj["topics"]
            if including_text:
                result.append({"text" : joinedText, "tokens" : tockens, "tags" : tags})
            else:
                result.append({"tokens" : tockens, "tags" : tags})

        return np.array(result)

    def preprocess(self, _data : dict | None = None, including_text : bool = False) -> dict[str, list]:
        if self.preprocessed: return self.data

        if _data:
            data = _data
        elif self.data:
            data = self.data
        else:
            return self.fromCache()

        for user_id, projs in data.items():
            #print(type(np.array(userProjs)))
            data[user_id] = self.projectsDataPreprocessing(projs, including_text)

        self.preprocessed = True
        return data

In [7]:
def projectDataIsSufficient(projectData):
    # used to filter data by quality, for example, I can take only those project, that has long description, readme file and many stars
    return (projectData and projectData["description"] and (len(projectData["topics"]) or projectData["language"]))

USERS_NUMBER_TO_SCAN = 5

adapter = JSONAdapter()
manager = ProjectsDatasetManager(USERS_NUMBER_TO_SCAN, projectDataIsSufficient, cacheAdapter = adapter)
manager.fromDB()
manager.preprocess()
manager.data

Scanning user: 0


NameError: name 'deepcopy' is not defined

In [None]:
2/0
import gensim
from gensim.models.doc2vec import TaggedDocument

In [None]:
def flatternData(data : dict[str, list]) -> np.array(dict):
    # takes in data in form of dict, where each key is a user id and each value is a list of that user's projects
    # returns just flat list of these projects 
    result = []

    for projectsArray in data.values():
        for project in projectsArray:
            result.append(project)

    return np.array(result)

manager.data = flatternData(manager.data)

In [None]:
documentsCorpus = []

for project in manager.data:
    documentsCorpus.append(TaggedDocument(words=project["tokens"], tags=project["tags"]))

documentsCorpus

In [None]:
# creating model

VECTOR_SIZE = 100
EPOCH_NUMBER = 10
WORD_MIN_AMOUNT = 3
WINDOW_SIZE = 7

model = gensim.models.doc2vec.Doc2Vec(vector_size = VECTOR_SIZE, window = WINDOW_SIZE, min_count = WORD_MIN_AMOUNT, epochs = EPOCH_NUMBER)
model.build_vocab(documentsCorpus)
model.train(documentsCorpus, total_examples = model.corpus_count, epochs = model.epochs)

In [None]:
vector = model.infer_vector(['awesome', 'code', 'streamer', 'list', 'code', 'streamer', 'multiple', 'plataforms', 'like', 'twitch', 'youtube'])
vector

In [None]:
model.dv[0]