In [1]:
import pandas as pd
import numpy as np

from langdetect import detect, LangDetectException

import nltk
import matplotlib.pyplot as plt
# nltk.download('stopwords')
# nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from gensim import corpora
import string
import random
import time 

import torch
from bertopic import BERTopic
# from transformers import BertModel, BertTokenizer
from langdetect import detect
from deep_translator import GoogleTranslator
from sklearn.feature_extraction.text import CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
input_repo = pd.read_pickle('input_bert_repo.pkl')
input_repo

0        Primary Kite repo — private bits replaced with...
1                    A simple and private bitcoin exchange
2        Codam's own fixed, functioning and open source...
3        Modern, simple and fresh looking glass based o...
4        🥘 Hassle-free Hardhat plugin to compare gas co...
                               ...                        
67799                                                 None
67800    SOL Shotty is a Solana RPC proxy with a little...
67801                     Imitate OpenAI with Local Models
67802                                                 None
67803        Web interface for everything HackRF/Portapack
Name: description, Length: 67804, dtype: object

In [3]:
all_descriptions_repo = input_repo.to_list()
all_descriptions_repo

['Primary Kite repo — private bits replaced with XXXXXXX',
 'A simple and private bitcoin exchange',
 "Codam's own fixed, functioning and open source alternative of the miniLibX. MLX42 is a simple cross-platform graphics library running on GLFW and OpenGL.",
 'Modern, simple and fresh looking glass based on Bootstrap 5 and PHP 8.',
 '🥘 Hassle-free Hardhat plugin to compare gas cost among different Solidity code snippets.',
 'Official repository for the AKS Landing Zone Accelerator program',
 'Official respository for "Band-limited Coordinate Networks for Multiscale Scene Representation" | CVPR 2022',
 'A collection of React components, transcribed from https://vercel.com/design.',
 'Docker image for MISP',
 None,
 'Collection of offline utilities for developers',
 '🦚 Soothing pastel theme for Konsole',
 'SigLib Digital Signal Processing and Machine Learning Library',
 'insject is a tool for poking at containers. It enables you to run an arbitrary command in a container or any mix of Li

In [7]:
match_text = pd.DataFrame(columns=['old_desription','processed_description'])

def preprocess(texts, n):
    processed_texts = []

    for description in texts:
        new_line = {'old_desription': description, 'processed_description': None}
        match_text.loc[len(match_text)] = new_line
        if description is None:
            continue
        try:
            lang = detect(description)
        except LangDetectException:
            continue
        if lang != 'en':
            if len(description) >= 5000:
                continue
            else: 
                try:
                    description = GoogleTranslator(source='auto', target='en').translate(description)
                except (ConnectionResetError, ConnectionError):   
                    time.sleep(120) 
                if description is None:
                    continue
        
        #Strip leading and trailing whitespaces
        text = description.strip()

        #Tokenize and lower-case words
        words = word_tokenize(text.lower())

        #Keep alpha-only words
        # stop_words = set(stopwords.words('english'))
        alpha_words = [word for word in words if word.isalpha()] #and word not in stop_words]
        processed_texts.append(alpha_words)
        new_line['processed_description'] = [alpha_words]
        match_text.loc[len(match_text)-1] = new_line
    if n > 0:                  
        word_freq = Counter([word for sentence in processed_texts for word in sentence])
        common_words = set([word for word, _ in word_freq.most_common(int(n/100*len(word_freq)))])
        rare_words = set([word for word, _ in word_freq.most_common()[:-int(n/100*len(word_freq))-1:-1]])
        processed_texts = [[word for word in sentence if word not in common_words and word not in rare_words] for sentence in processed_texts]
    elif n == 0:
        processed_texts = [[word for word in sentence] for sentence in processed_texts]
    return processed_texts

In [8]:
corpus = preprocess(all_descriptions_repo,0)

In [9]:
with open("corpus_bertopic_repo_0.txt", "w", encoding="utf-8") as file:
    # Iterate over each sublist in the corpus
    for sentence in corpus:
        # Join the words in the sublist into a single string
        line = " ".join(sentence)
        # Write the string to the file followed by a newline character
        file.write(line + "\n")

In [10]:
match_text.to_pickle("descriptions_repo_0.pkl")

In [4]:
corpus = []

with open("corpus_bertopic_repo_0.txt", "r", encoding="utf-8") as file:
    # Iterate over each line in the file
    for line in file:
        # Remove leading and trailing whitespace, then split the line into words
        words = line.strip().split()
        # Append the list of words to the loaded corpus
        corpus.append(words)

dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(text) for text in corpus]
docs = [' '.join(doc) for doc in corpus]

In [44]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=0.01, ngram_range=(1, 2))
topic_model = BERTopic(vectorizer_model=vectorizer_model,min_topic_size=100)
topics, _ = topic_model.fit_transform(docs)
topic_matrix = topic_model.get_topic_info()

In [45]:
topic_model.visualize_topics()

In [46]:
pd.set_option("display.max_rows", None)
topics_results = topic_model.get_topics()

for topic_id, words_with_prob in topics_results.items():
    # Extract only the words from the (word, probability) tuples
    words_only = [word for word, _ in words_with_prob]
    print(f"Topic {topic_id}: {words_only}")

Topic -1: ['data', 'https', 'code', 'using', 'library', 'project', 'repository', 'framework', 'learning', 'tool']
Topic 0: ['segmentation', 'cvpr', 'transformer', 'object', 'vision', 'official', 'object detection', 'eccv', 'detection', 'transformers']
Topic 1: ['language', 'language models', 'large language', 'large', 'models', 'language model', 'paper', 'model', 'emnlp', 'acl']
Topic 2: ['rust', 'rust rust', 'written rust', 'written', 'rust implementation', 'rust library', 'implementation', 'library', 'crate', 'rust bindings']
Topic 3: ['database', 'sql', 'postgresql', 'storage', 'sqlite', 'data', 'postgres', 'query', 'file', 'mysql']
Topic 4: ['github', 'github action', 'action', 'github actions', 'actions', 'git', 'pull', 'gitlab', 'workflow', 'workflows']
Topic 5: ['reinforcement', 'reinforcement learning', 'robot', 'learning', 'planning', 'control', 'autonomous', 'ros', 'robotics', 'robots']
Topic 6: ['python', 'python client', 'python library', 'python python', 'python sdk', 'pyt

In [47]:
# Reviewing the topics show that after topic number 13 they are too general and should not be included anymore
topic_model.reduce_topics(docs, nr_topics=15)
topics = topic_model.topics_

In [48]:
pd.set_option("display.max_rows", 20)
topics_results = topic_model.get_topics()

for topic_id, words_with_prob in topics_results.items():
    # Extract only the words from the (word, probability) tuples
    words_only = [word for word, _ in words_with_prob]
    print(f"Topic {topic_id}: {words_only}")

Topic -1: ['data', 'code', 'https', 'using', 'library', 'repository', 'project', 'framework', 'learning', 'tool']
Topic 0: ['rust', 'github', 'python', 'language', 'repository', 'code', 'source', 'library', 'models', 'data']
Topic 1: ['ai', 'learning', 'implementation', 'segmentation', 'pytorch', 'official', 'deep', 'paper', 'image', 'code']
Topic 2: ['react', 'typescript', 'theme', 'template', 'javascript', 'ui', 'library', 'components', 'pastel', 'soothing pastel']
Topic 3: ['kubernetes', 'aws', 'docker', 'azure', 'cloud', 'terraform', 'container', 'kafka', 'cluster', 'using']
Topic 4: ['https', 'contracts', 'protocol', 'smart', 'nft', 'blockchain', 'wallet', 'smart contracts', 'proxy', 'authentication']
Topic 5: ['chatgpt', 'discord', 'bot', 'audio', 'chat', 'telegram', 'music', 'speech', 'video', 'discord bot']
Topic 6: ['android', 'flutter', 'swift', 'app', 'kotlin', 'ios', 'sdk', 'swiftui', 'apps', 'dart']
Topic 7: ['laravel', 'api', 'php', 'json', 'openapi', 'apis', 'rest', 'pac

In [None]:
description_topics = pd.DataFrame(columns=['Description','Topic'])

for i, doc in enumerate(docs):
  topic_id = topics[i]
  new_row = {'Description': doc, "Topic":topic_id}
  description_topics.loc[len(description_topics)] = new_row

In [50]:
pd.set_option("display.max_rows", 20)
description_topics

Unnamed: 0,Description,Topic
0,primary kite repo private bits replaced with x...,0
1,a simple and private bitcoin exchange,4
2,codam own fixed functioning and open source al...,1
3,modern simple and fresh looking glass based on...,-1
4,hardhat plugin to compare gas cost among diffe...,-1
...,...,...
55837,create more robust laravel apis by adding idem...,7
55838,advanced privacy tool for windows,-1
55839,sol shotty is a solana rpc proxy with a little...,-1
55840,imitate openai with local models,1


In [5]:
descriptions = pd.read_pickle("descriptions_repo.pkl")
descriptions = descriptions.dropna(how="all")
descriptions

Unnamed: 0,old_desription,processed_description
0,Primary Kite repo — private bits replaced with...,"[[primary, kite, repo, private, bits, replaced..."
1,A simple and private bitcoin exchange,"[[simple, private, bitcoin, exchange]]"
2,"Codam's own fixed, functioning and open source...","[[codam, fixed, functioning, open, source, alt..."
3,"Modern, simple and fresh looking glass based o...","[[modern, simple, fresh, looking, glass, based..."
4,🥘 Hassle-free Hardhat plugin to compare gas co...,"[[hardhat, plugin, compare, gas, cost, among, ..."
...,...,...
67797,Create more robust Laravel APIs by adding Idem...,"[[create, robust, laravel, apis, adding, idemp..."
67798,Advanced Privacy Tool for Windows,"[[advanced, privacy, tool, windows]]"
67800,SOL Shotty is a Solana RPC proxy with a little...,"[[sol, shotty, solana, rpc, proxy, little, som..."
67801,Imitate OpenAI with Local Models,"[[imitate, openai, local, models]]"


In [52]:
rows_with_nan = descriptions[descriptions.isnull().any(axis=1)]

In [53]:
test = pd.DataFrame()
test["Original Description"] = rows_with_nan['old_desription']
test['Topic'] = None

In [54]:
description_topics = description_topics.dropna()
descriptions = descriptions.dropna()

In [55]:
descriptions = descriptions.reset_index()
descriptions = descriptions.drop(['index'], axis=1)

In [56]:
description_topics["Original Description"] = descriptions["old_desription"]

In [57]:
description_topics = description_topics.drop(['Description'],axis=1)

In [58]:
extended_df = pd.concat([description_topics, test], ignore_index=True)
extended_df

Unnamed: 0,Topic,Original Description
0,0,Primary Kite repo — private bits replaced with...
1,4,A simple and private bitcoin exchange
2,1,"Codam's own fixed, functioning and open source..."
3,-1,"Modern, simple and fresh looking glass based o..."
4,-1,🥘 Hassle-free Hardhat plugin to compare gas co...
...,...,...
55972,,https://openjdk.org/projects/babylon
55973,,https://ui.hoppscotch.io
55974,,https://dl.acm.org/doi/10.1145/3576915.3623209
55975,,https://www.codingame.com/contests/fall-challe...


In [59]:
extended_df.to_pickle("topics_description_repo.pkl")

In [10]:
final_topics = pd.read_pickle("topics_description_repo.pkl")
final_topics['Topic'].value_counts()

Topic
-1    27922
0      9401
1      4480
2      3728
3      2186
4      2099
5      1769
6      1561
7      1040
8       540
9       360
10      337
11      158
12      138
13      123
Name: count, dtype: int64

In [9]:
final_topics['Topic'].value_counts(normalize=True)

Topic
-1    0.500018
0     0.168350
1     0.080226
2     0.066760
3     0.039146
4     0.037588
5     0.031679
6     0.027954
7     0.018624
8     0.009670
9     0.006447
10    0.006035
11    0.002829
12    0.002471
13    0.002203
Name: proportion, dtype: float64

# BERTopic organization level

In [11]:
input_org = pd.read_pickle('input_bert_org.pkl')
input_org

0                                                         
1                    A simple and private bitcoin exchange
2        We prepare the next generation for the jobs of...
3                                                     None
4                                                         
                               ...                        
38727    Supercharge your product distribution and updates
38728                                                     
38729                              Don't trust the robots!
38730                                                     
38731                                                     
Name: description, Length: 38732, dtype: object

In [12]:
all_descriptions_org = input_org.to_list()

In [None]:
match_text = pd.DataFrame(columns=['old_desription','processed_description'])

def preprocess(texts, n):
    processed_texts = []

    for description in texts:
        new_line = {'old_desription': description, 'processed_description': None}
        match_text.loc[len(match_text)] = new_line
        if description is None:
            continue
        try:
            lang = detect(description)
        except LangDetectException:
            continue
        if lang != 'en':
            if len(description) >= 5000:
                continue
            else: 
                try:
                    description = GoogleTranslator(source='auto', target='en').translate(description)
                except (ConnectionResetError, ConnectionError):   
                    time.sleep(120) 
                if description is None:
                    continue
        
        #Strip leading and trailing whitespaces
        text = description.strip()

        #Tokenize and lower-case words
        words = word_tokenize(text.lower())

        #Keep alpha-only words
        # stop_words = set(stopwords.words('english'))
        alpha_words = [word for word in words if word.isalpha()] #and word not in stop_words]
        processed_texts.append(alpha_words)
        new_line['processed_description'] = [alpha_words]
        match_text.loc[len(match_text)-1] = new_line
    if n > 0:                  
        word_freq = Counter([word for sentence in processed_texts for word in sentence])
        common_words = set([word for word, _ in word_freq.most_common(int(n/100*len(word_freq)))])
        rare_words = set([word for word, _ in word_freq.most_common()[:-int(n/100*len(word_freq))-1:-1]])
        processed_texts = [[word for word in sentence if word not in common_words and word not in rare_words] for sentence in processed_texts]
    elif n == 0:
        processed_texts = [[word for word in sentence] for sentence in processed_texts]
    return processed_texts

In [62]:
corpus_org = preprocess(input_org,0)

In [63]:
with open("corpus_bertopic_org_0.txt", "w", encoding="utf-8") as file:
    # Iterate over each sublist in the corpus
    for sentence in corpus_org:
        # Join the words in the sublist into a single string
        line = " ".join(sentence)
        # Write the string to the file followed by a newline character
        file.write(line + "\n")

In [13]:
corpus = []

with open("corpus_bertopic_org.txt", "r", encoding="utf-8") as file:
    # Iterate over each line in the file
    for line in file:
        # Remove leading and trailing whitespace, then split the line into words
        words = line.strip().split()
        # Append the list of words to the loaded corpus
        corpus.append(words)

dictionary = corpora.Dictionary(corpus)
doc_term_matrix = [dictionary.doc2bow(text) for text in corpus]
docs = [' '.join(doc) for doc in corpus]

In [64]:
match_text.to_pickle("descriptions_org_0.pkl")

In [14]:
docs = [doc for doc in docs if doc.strip() != ""]

In [15]:
vectorizer_model = CountVectorizer(stop_words="english", min_df=0.01, ngram_range=(1, 2))
topic_model = BERTopic(vectorizer_model=vectorizer_model,min_topic_size=10)
topics, _ = topic_model.fit_transform(docs)
topic_matrix = topic_model.get_topic_info()

In [16]:
topic_model.visualize_topics()

In [114]:
pd.set_option("display.max_rows", None)
topics_results = topic_model.get_topics()

for topic_id, words_with_prob in topics_results.items():
    # Extract only the words from the (word, probability) tuples
    words_only = [word for word, _ in words_with_prob]
    print(f"Topic {topic_id}: {words_only}")

Topic -1: ['campus', 'strategy', 'version', 'driven', 'members', 'led', 'supporting', 'panel', 'form', 'college']
Topic 0: ['mod', 'mods', 'modding', 'modular', 'geometry', 'versions', 'early', 'redefining', 'directory', 'load']
Topic 1: ['cosmos', 'universe', 'planet', 'sdk', 'soon', 'era', 'star', 'validator', 'registry', 'coming']
Topic 2: ['robots', 'robot', 'robotic', 'motion', 'operate', 'clear', 'union', 'adaptive', 'navigation', 'environments']
Topic 3: ['streaming', 'live', 'apache', 'unreal', 'localization', 'neutral', 'prompt', 'millions', 'matters', 'integrating']
Topic 4: ['repo', 'additional', 'module', 'questions', 'programs', 'maintenance', 'list', 'area', 'officially', 'supported']
Topic 5: ['hpc', 'unit', 'algorithm', 'interests', 'biology', 'alliance', 'single', 'informatics', 'shared', 'precision']
Topic 6: ['visualization', 'centered', 'graphics', 'challenge', 'biomedical', 'document', 'analyze', 'apache', 'comes', 'interactive']
Topic 7: ['asynchronous', 'trading'

In [115]:
# Reviewing the topics show that after topic number 7 they are too general and should not be included anymore
topic_model.reduce_topics(docs, nr_topics=7)
topics = topic_model.topics_

In [116]:
topics_results = topic_model.get_topics()

for topic_id, words_with_prob in topics_results.items():
    # Extract only the words from the (word, probability) tuples
    words_only = [word for word, _ in words_with_prob]
    print(f"Topic {topic_id}: {words_only}")

Topic -1: ['led', 'supporting', 'course', 'health', 'driven', 'campus', 'members', 'college', 'medical', 'runtime']
Topic 0: ['networks', 'storage', 'identity', 'observability', 'hub', 'collective', 'nlp', 'assets', 'iot', 'defi']
Topic 1: ['laravel', 'typescript', 'css', 'serverless', 'templates', 'ruby', 'webassembly', 'cms', 'websites', 'tailwind']
Topic 2: ['bioinformatics', 'biology', 'healthcare', 'health', 'cancer', 'medical', 'genomics', 'molecular', 'medicine', 'protein']
Topic 3: ['julia', 'quantum', 'jupyter', 'qq', 'photonic', 'quantum computers', 'julia julia', 'array types', 'quarkus', 'extensions']
Topic 4: ['edge', 'proxy', 'layer', 'vpn', 'infinite', 'bridge', 'limits', 'pushing', 'pushing boundaries', 'boundaries']
Topic 5: ['supercharge', 'bounty', 'bounties', 'bug', 'bounties visit', 'cluster argo', 'argo', 'xcode workflows', 'argo delivery', 'supercharge xcode']


In [117]:
topic_matrix = topic_model.get_topic_info()
topic_matrix

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4028,-1_led_supporting_course_health,"[led, supporting, course, health, driven, camp...","[bnb chain bnb chain bnb beacon chain, scala a..."
1,0,13583,0_networks_storage_identity_observability,"[networks, storage, identity, observability, h...","[identity, tracking graph neural networks, chi..."
2,1,831,1_laravel_typescript_css_serverless,"[laravel, typescript, css, serverless, templat...","[laravel, laravel, tailwind css]"
3,2,236,2_bioinformatics_biology_healthcare_health,"[bioinformatics, biology, healthcare, health, ...","[bioinformatics, bioinformatics, bioinformatic..."
4,3,162,3_julia_quantum_jupyter_qq,"[julia, quantum, jupyter, qq, photonic, quantu...","[julia, julia, julia]"
5,4,152,4_edge_proxy_layer_vpn,"[edge, proxy, layer, vpn, infinite, bridge, li...","[edge, edge, edge]"
6,5,24,5_supercharge_bounty_bounties_bug,"[supercharge, bounty, bounties, bug, bounties ...","[supercharge cluster argo delivery, supercharg..."


In [None]:
description_topics = pd.DataFrame(columns=['Description','Topic'])

for i, doc in enumerate(docs):
  topic_id = topics[i]
  new_row = {'Description': doc, "Topic":topic_id}
  description_topics.loc[len(description_topics)] = new_row
description_topics

In [119]:
descriptions = pd.read_pickle("descriptions_org.pkl")
descriptions = descriptions.dropna(how="all")

In [120]:
rows_with_nan = descriptions[descriptions.isnull().any(axis=1)]
len(rows_with_nan)

13149

In [121]:
test = pd.DataFrame()
test["Original Description"] = rows_with_nan['old_desription']
test['Topic'] = None

In [122]:
description_topics = description_topics.dropna()
descriptions = descriptions.dropna()

In [123]:
descriptions = descriptions.reset_index()
descriptions = descriptions.drop(['index'], axis=1)

In [None]:
description_topics["Original Description"] = descriptions["old_desription"]
description_topics = description_topics.drop(['Description'],axis=1)
description_topics

In [126]:
extended_df = pd.concat([description_topics, test], ignore_index=True)
extended_df

Unnamed: 0,Topic,Original Description
0,0,A simple and private bitcoin exchange
1,0,We prepare the next generation for the jobs of...
2,0,"APIs, SDKs and open source projects from Micro..."
3,0,Next-generation computational imaging and disp...
4,0,Národní úřad pro kybernetickou a informační be...
...,...,...
32160,,
32161,,
32162,,
32163,,


In [127]:
extended_df = extended_df.dropna()
extended_df.to_pickle("topics_description_org.pkl")

In [17]:
final_topics = pd.read_pickle("topics_description_org.pkl")

In [19]:
final_topics['Topic'].value_counts()

Topic
0     13583
-1     4028
1       831
2       236
3       162
4       152
5        24
Name: count, dtype: int64

In [18]:
final_topics['Topic'].value_counts(normalize=True)

Topic
0     0.714293
-1    0.211822
1     0.043700
2     0.012411
3     0.008519
4     0.007993
5     0.001262
Name: proportion, dtype: float64

# BERTopic all repos level

In [2]:
import pickle
with open("all_input","rb") as fp:
    input = pickle.load(fp)

In [None]:
flattened_input = [item for sublist in input for item in sublist]

In [7]:
match_text = pd.DataFrame(columns=['old_desription','processed_description'])

def preprocess(texts, n):
    processed_texts = []

    for description in texts:
        new_line = {'old_desription': description, 'processed_description': None}
        match_text.loc[len(match_text)] = new_line
        if description is None:
            continue
        try:
            lang = detect(description)
        except LangDetectException:
            continue
        if lang != 'en':
            if len(description) >= 5000:
                continue
            else: 
                try:
                    description = GoogleTranslator(source='auto', target='en').translate(description)
                except (ConnectionResetError, ConnectionError):   
                    time.sleep(120) 
                if description is None:
                    continue
        
        #Strip leading and trailing whitespaces
        text = description.strip()

        #Tokenize and lower-case words
        words = word_tokenize(text.lower())

        #Keep alpha-only words
        # stop_words = set(stopwords.words('english'))
        alpha_words = [word for word in words if word.isalpha()] #and word not in stop_words]
        processed_texts.append(alpha_words)
        new_line['processed_description'] = [alpha_words]
        match_text.loc[len(match_text)-1] = new_line
    if n > 0:                  
        word_freq = Counter([word for sentence in processed_texts for word in sentence])
        common_words = set([word for word, _ in word_freq.most_common(int(n/100*len(word_freq)))])
        rare_words = set([word for word, _ in word_freq.most_common()[:-int(n/100*len(word_freq))-1:-1]])
        processed_texts = [[word for word in sentence if word not in common_words and word not in rare_words] for sentence in processed_texts]
    elif n == 0:
        processed_texts = [[word for word in sentence] for sentence in processed_texts]
    return processed_texts

In [8]:
corpus = preprocess(flattened_input,0)

RequestError: Request exception can happen due to an api connection error. Please check your connection and try again

In [None]:
with open("corpus_bertopic_all_repos.txt", "w", encoding="utf-8") as file:
    # Iterate over each sublist in the corpus
    for sentence in corpus:
        # Join the words in the sublist into a single string
        line = " ".join(sentence)
        # Write the string to the file followed by a newline character
        file.write(line + "\n")

In [None]:
match_text.to_pickle("descriptions_all_repos.pkl")