In [None]:
import pandas as pd
import os
import markdown
from bs4 import BeautifulSoup
import re
import csv
import nltk
import nltk.data
nltk.download("popular")
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag_sents
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.util import mark_negation
import emoji
import contractions
import json

def get_lemmatizer_pos(pos):
    """Maps an NLTK POS-tag onto the POS constant expected by the WordNet lemmatizer.

    Only the first character of the tag is inspected: "J" selects adjective, "V" selects
    verb and "R" selects adverb. Every other tag falls back to noun.

    Args:
        pos: NLTK POS-tag (must be non-empty, since its first character is read).

    Returns:
        A WordNet-compatible POS-tag string.
    """
    initial = pos[0]  # The leading letter is enough to tell the coarse word class
    wordnet_pos_by_initial = {
        "J": wn.ADJ,
        "V": wn.VERB,
        "R": wn.ADV,
    }
    return wordnet_pos_by_initial.get(initial, wn.NOUN)

# Suffix that marks a token as negated; it is stripped before a token is compared
# against the stop-word, punctuation and number filters.
_NEGATION_SUFFIX = "_NEG"

# Characters treated as punctuation: ASCII punctuation, curly quotes and the zero-width
# joiner (written as an escape so the invisible character stays visible in the source).
_PUNCTUATION_CHARS = "!\"“”#$%&'‘’()*+,-./:;<=>?@[\\]^_`{|}~\u200d"
_PUNCTUATION_TABLE = str.maketrans("", "", _PUNCTUATION_CHARS)

# Variation selectors (U+FE00-U+FE0F), the male/female signs and the zero-width joiner.
_EMOJI_MODIFIER_RE = re.compile(r"[\uFE00-\uFE0F\u2642\u2640\u200D]+")

# Matches tokens that contain "https://", "http://" or "www.".
_URL_RE = re.compile(r"(https?://)|(www\.)")


def tokenize_normalize(
    context,
    sentence_tokenizer=nltk.data.load("tokenizers/punkt/english.pickle"),
    lemmatizer=WordNetLemmatizer(),
    tokenizer=TweetTokenizer(preserve_case=False), # This tokenizer can handle URLs better
    tokenize_numbers=False,
    tokenize_urls=False,
    deemojize=True,
    remove_punct=True,
    handle_negation=True
):
    """Splits a text into sentences and tokens, lemmatizes the tokens and filters them.

    The tokens are POS-tagged, lemmatized with the WordNet-compatible POS-tag and optionally
    negation-marked. Afterwards emoji modifiers, stop words, URLs, punctuation and numbers
    are filtered out (or replaced by a placeholder, depending on the flags).

    Args:
        context: Text to process.
        sentence_tokenizer: Object whose tokenize() splits the text into sentences.
        lemmatizer: Object whose lemmatize(token, pos=...) returns the lemma of a token.
        tokenizer: Object whose tokenize() splits a sentence into tokens.
        tokenize_numbers: If True, numbers are replaced by a placeholder instead of dropped.
        tokenize_urls: If True, URLs are replaced by a placeholder instead of dropped.
        deemojize: If True, emojis are converted to text with emoji.demojize.
        remove_punct: If True, tokens consisting only of punctuation are dropped.
        handle_negation: If True, the lemmas are passed through mark_negation.

    Returns:
        A list with the filtered lemmas.
    """
    # A set gives constant-time membership checks inside the loop.
    stop_words = set(stopwords.words("english"))

    sentences = sentence_tokenizer.tokenize(context)

    sentences_tokens = [tokenizer.tokenize(sentence) for sentence in sentences]

    # Flattens the per-sentence (token, tag) pairs into a single list.
    tokens_pos = [
        tagged_token
        for tagged_sentence in pos_tag_sents(sentences_tokens)
        for tagged_token in tagged_sentence
    ]

    lemmas = [
        lemmatizer.lemmatize(token, pos=get_lemmatizer_pos(tag))
        for token, tag in tokens_pos
    ]

    if handle_negation:
        lemmas = mark_negation(lemmas)

    filtered_lemmas = []
    for lemma in lemmas:

        # Removing variation selectors such as hair/skin color and gender for
        # emojis since they cause noise and tokenization problems:
        if _EMOJI_MODIFIER_RE.sub("", lemma) == "":
            continue

        # Filters stop words (considers negations):
        if lemma.replace(_NEGATION_SUFFIX, "") in stop_words:
            continue

        # Filters the lemma by searching for "https://," "http://," or "www." using a regular
        # expression. Matching lemmas are dropped unless tokenize_urls is set.
        if _URL_RE.search(lemma):
            if not tokenize_urls:
                continue
            # NOTE(review): the placeholder is an empty string, which the punctuation filter
            # below drops again when remove_punct is True. Presumably a visible marker token
            # was intended here - confirm before relying on tokenize_urls.
            lemma = ""

        # Deemojizes emojis using the emoji package (considers negations):
        if deemojize:
            lemma = emoji.demojize(lemma)

        # Filters punctuation (considers negations):
        if (
            remove_punct
            and lemma.replace(_NEGATION_SUFFIX, "").translate(_PUNCTUATION_TABLE) == ""
        ):
            continue

        # Tries to convert the lemma to float while ignoring commas, percentage signs and the
        # negation suffix (so that negated numbers are filtered like plain ones). A lemma that
        # is not a number raises ValueError and is kept unchanged.
        try:
            float(
                lemma.replace(_NEGATION_SUFFIX, "").replace(",", "").replace("%", "")
            )
        except ValueError:
            pass
        else:
            if not tokenize_numbers:
                continue
            # NOTE(review): empty-string placeholder, see the URL placeholder note above.
            lemma = ""

        filtered_lemmas.append(lemma)

    return filtered_lemmas

Processes the paragraphs that come before the first H2 title in the readme file for each repository:

In [37]:
languages = ["Java", "Python"]
readme_folder = "readmes"

for language in languages:

    print(f"Processing {language} repositories...")

    # The candidates are read before the output file is opened, so that a missing input
    # file does not truncate an already existing output file.
    candidate_repos = pd.read_csv(f"candidate_repos_{language.lower()}.csv", delimiter=";")

    with open(f"processed_readmes_{language.lower()}.csv", "w", newline="", encoding="UTF-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter=";", quotechar="'", quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(["ID", "OWNER", "NAME", "CONTEXTS"])

        for _, row in candidate_repos.iterrows():
            readme_path = f"{readme_folder}/{row.OWNER}_{row.NAME}_readme.md"
            # Repositories without a downloaded readme file are skipped.
            if not os.path.isfile(readme_path):
                continue

            with open(readme_path, encoding="UTF-8") as readme_file:
                readme = readme_file.read()

            # Not all readme files are perfectly parseable. Some extra steps were needed both before and after parsing the file.
            if "```" in readme:
                fence_parts = readme.split("```")
                # NOTE(review): when the file starts with a fence, the odd-indexed parts (the
                # text inside the fences) are kept and everything outside is discarded.
                # Presumably this targets readmes wrapped entirely in a fence - confirm.
                if fence_parts[0] == "":
                    readme = "\n".join(fence_parts[1::2])
                else:
                    readme = "\n".join(fence_parts[0::2])

            readme_soup = BeautifulSoup(markdown.markdown(readme), "html.parser")
            for img in readme_soup.select("img"):
                img.extract()

            extracted_text = []
            for element in readme_soup:
                # Only the content before the first H2 title is considered.
                if element.name == "h2":
                    break

                # Clear-text paragraphs are only retrieved if they satisfy certain conditions
                if element.name != "p":
                    continue

                # Removes leftover tags and HTML entities, then expands contractions.
                cleaned = re.sub("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});", "", element.text)
                cleaned = contractions.fix(cleaned)
                # Paragraphs with at most 3 characters are ignored.
                if len(cleaned.strip()) <= 3:
                    continue

                # Drops lines that start with one of the image-directive prefixes.
                extracted = " ".join(
                    line
                    for line in cleaned.split("\n")
                    if not line.startswith((".. image::", "image:", "  :"))
                )
                # The remaining text must contain more than 3 distinct characters.
                if len(set(extracted.strip())) > 3:
                    extracted_text.append(tokenize_normalize(extracted))

            # Only repositories with at least one extracted paragraph are written.
            if extracted_text:
                csv_writer.writerow([row.ID, row.OWNER, row.NAME, json.dumps(extracted_text, ensure_ascii=False)])

Processing Java repositories...
Processing Python repositories...
