In [13]:
from bs4 import BeautifulSoup as bs
import bs4
import mistune
from timeout_decorator import TimeoutError, timeout
from IPython.display import display, Markdown
from fastai.core import compose, listify, partial
from fastai.text.transform import fix_html, SpacyTokenizer
from typing import List, Union, Callable
from urllib3.util import parse_url 
from tqdm import tqdm_notebook
import re
import ast
from textacy.preprocess import preprocess_text, normalize_whitespace
from textacy.text_utils import detect_language

## Test Markdown

In [3]:
txt_generalize_symbols = partial(preprocess_text, 
                                 fix_unicode=True, 
                                 no_urls=True, 
                                 no_emails=True, 
                                 no_phone_numbers=True)



detect_language('Hello world!')

'en'

In [8]:
mdtext = """# This is a test markdown that has different types of formatting
## Header 2
 Hello **world** this is going to be a long sentence that. also has a newline:
 lala lala
 
continuing the sentence here.
more text.
 
 - bullet 1
 - bullet 2
 
"quoted text"

#### a small header

```python
def something(x):
    \"""docstring\"""
    #some comment
    somevar = 123
    anothervar = somevar * 3
    for i in range(33):
        anothervar *= i
    
    if i % 2 == 0:
        return True
    else:
        return False
```

@mention somebody

> blockquote text is here!
 
 [hobbit-hole][1]
 
 1. first
 
 2. second
 

 > what this?
 
 ![Tux, the Linux mascot](https://d33wubrfki0l68.cloudfront.net/e7ed9fe4bafe46e275c807d63591f85f9ab246ba/e2d28/assets/images/tux.png)
 
 
 [1]: <https://en.wikipedia.org/wiki/Hobbit#Lifestyle> "Hobbit lifestyles"
 
| First Header  | Second Header |
| ------------- | ------------- |
| Content Cell  | Content Cell  |
| Content Cell  | Content Cell  |
 
 ----
 
 Hello `something` here and __stuff__ is _there_.  [google](www.google.com). random text
    
~~The world is flat.~~ We now know that the world is round.

- [x] Write the press release
- [ ] Update the website
- [ ] Contact the media


`http://www.example.com`

 """

display(Markdown(mdtext))

# This is a test markdown that has different types of formatting
## Header 2
 Hello **world** this is going to be a long sentence that. also has a newline:
 lala lala
 
continuing the sentence here.
more text.
 
 - bullet 1
 - bullet 2
 
"quoted text"

#### a small header

```python
def something(x):
    """docstring"""
    #some comment
    somevar = 123
    anothervar = somevar * 3
    for i in range(33):
        anothervar *= i
    
    if i % 2 == 0:
        return True
    else:
        return False
```

@mention somebody

> blockquote text is here!
 
 [hobbit-hole][1]
 
 1. first
 
 2. second
 

 > what this?
 
 ![Tux, the Linux mascot](https://d33wubrfki0l68.cloudfront.net/e7ed9fe4bafe46e275c807d63591f85f9ab246ba/e2d28/assets/images/tux.png)
 
 
 [1]: <https://en.wikipedia.org/wiki/Hobbit#Lifestyle> "Hobbit lifestyles"
 
| First Header  | Second Header |
| ------------- | ------------- |
| Content Cell  | Content Cell  |
| Content Cell  | Content Cell  |
 
 ----
 
 Hello `something` here and __stuff__ is _there_.  [google](www.google.com). random text
    
~~The world is flat.~~ We now know that the world is round.

- [x] Write the press release
- [ ] Update the website
- [ ] Contact the media


`http://www.example.com`

 

In [9]:
markdown = mistune.Markdown()
parsed = markdown(mdtext)

## The Markdown File Rendered as HTML

In [10]:
soup = bs(parsed)
soup

<html><head></head><body><h1>This is a test markdown that has different types of formatting</h1>
<h2>Header 2</h2>
<p>Hello <strong>world</strong> this is going to be a long sentence that. also has a newline:
 lala lala</p>
<p>continuing the sentence here.
more text.</p>
<ul>
<li>bullet 1</li>
<li>bullet 2</li>
</ul>
<p>"quoted text"</p>
<h4>a small header</h4>
<pre><code class="lang-python">def something(x):
    """docstring"""
    #some comment
    somevar = 123
    anothervar = somevar * 3
    for i in range(33):
        anothervar *= i

    if i % 2 == 0:
        return True
    else:
        return False
</code></pre>
<p>@mention somebody</p>
<blockquote><p>blockquote text is here!</p>
</blockquote>
<p><a href="https://en.wikipedia.org/wiki/Hobbit#Lifestyle" title="Hobbit lifestyles">hobbit-hole</a></p>
<ol>
<li><p>first</p>
</li>
<li><p>second</p>
</li>
</ol>
<blockquote><p>what this?</p>
</blockquote>
<p><img alt="Tux, the Linux mascot" src="https://d33wubrfki0l68.cloudfront.net

### Lang detection

```python
textacy.text_utils.detect_language(text) == "en"
```

# Rendered as Plain Text With Annotations

In [106]:
#export
class md:
    @staticmethod
    def parse(x:str) -> bs4.BeautifulSoup:
        
        # find & replace html, which can break things (non-greedy)
        x = re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', x, re.DOTALL)
        
        #because former html replacement was non-greedy dedupe html marker
        x = re.sub('(xxxhtml(xxxlnbrk)?(\s)?)+', 'xxxhtml', x)
        
        # fix the linebreak issue from BigQuery
        x = re.sub(r'xxxlnbrk', '\n', x)
       
        @timeout(1)
        def timed_parse(x):
            try:
                return bs(markdown(x))
            
            except TimeoutError:
                return bs(markdown('xxxunabletoparse'))
            
        return timed_parse(x)
    
    @staticmethod
    def prepend(fldname:str, tag:Union[List[str], str], soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        for tag in soup.find_all(listify(tag)):
            if tag.text.strip() or tag.name == 'hr':
                tag.insert(0, fldname+' ')
        return soup
    
    @staticmethod
    def enclose(bfldname:str, efldname:str, tag:Union[List[str], str], nlines:int, soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        """Helper function for when you want to add a beginning and ending marker to text."""
        for tag in soup.find_all(listify(tag)):
            
            # preview the text inside an enclosure show nlines of beginning and nlines of the end.
            text_lines = tag.text.split('\n')
            if len(text_lines) <= nlines * 2:
                newstr = tag.text
            else:
                newstr = '\n'.join(text_lines[:nlines] + text_lines[-nlines:])
                
            tag.string = newstr
            
            # add the values of the class attributes, if exist
            tag.insert(0, bfldname + ' ' + (' '.join(tag['class']) if 'class' in tag.attrs else '') + ' ')
            
            # insert ending tag with/without space depending if last char is \n
            if tag.text[-1] == '\n':
                tag.append(efldname)
            else:
                tag.append(' ' + efldname)
        return soup
    
    @staticmethod
    def lst(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate list elements <ul> and <ol>"
        for tag in soup.find_all(['ul', 'ol']):
            # clear all the artifacts that are in lists and replace with text.
            text = 'xxxlistB ' + tag.getText() + 'xxxlistE'
            tag.string = text.strip()
        return soup
    
    @staticmethod
    def tbl(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        "annotate table elements <table> only keeping information from header rows"
        for tag in soup.find_all('table'):
            # empty string if there are no table headers.
            text = ''
            if tag.thead:
                text = 'xxtbl ' + '|'.join([x.getText() for x in tag.thead.find_all('th')])
            tag.string = text
        return soup
    
    @staticmethod
    def img(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        for tag in soup.find_all('img'):
            tag.insert(0, 'xxximg ')
            if 'alt' in tag.attrs:
                tag.insert(1, tag['alt'])
            if 'src' in tag.attrs:
                tag.append(' xxximgf ' + tag['src'].split('.')[-1])
        return soup
    
    @staticmethod
    def lnk(soup:bs4.BeautifulSoup) -> bs4.BeautifulSoup:
        for tag in soup.find_all('a'):
            if 'href' in tag.attrs:
                try:
                    tag.append(' xxxlnkhb ' + parse_url(tag['href']).host + ' xxxlnkhe')
                except:
                    pass
            if 'title' in tag.attrs:
                tag.append(' xxxlnktb ' + tag['title'] + 'xxxlnkte')
        return soup
    
    @staticmethod
    def get_text(soup:bs4.BeautifulSoup) -> str:
        "get the raw text"
        text = soup.getText()
        #translate newlines back from BigQuery
        text = re.sub(r'\n\n+', '\n', text)
        #translate double quotes back from BigQuery
        text = re.sub(r'xxxdblqte', '\"', text)
        return normalize_whitespace(text)
    
    @staticmethod
    def sym(text:str) -> str:
        """generalize symbols such as urls, emails, phone numbers and filepaths to generic tokens."""
        text = preprocess_text(text, 
                               fix_unicode=True, 
                               no_urls=True, 
                               no_emails=True, 
                               no_phone_numbers=True,
                               no_accents=True)
        
        file_path_regex = r'C:(\\\\\S+){2,}|(/\S+){2,}'
        return re.sub(file_path_regex, ' xxxfilepath ', text)
            
    ### transformations that are the same from factory functions
    # large headers: h1
    hL =   partial(prepend.__func__, 'xxxhl', 'h1')
    # medium headers: h2, h3
    hM =   partial(prepend.__func__, 'xxxhm', ['h2', 'h3'])
    # small headers: h4, h5, h6
    hS =   partial(prepend.__func__, 'xxxhs', ['h4', 'h5', 'h6'])
    # code blocks
    code = partial(enclose.__func__, 'xxxcdb', 'xxxcde', 'code', 2)
    # paragraph blocks (plain text)
    txt =  partial(prepend.__func__, 'xxxtxt', 'p')
    # block quotes
    bqt =  partial(enclose.__func__, 'xxxqb', 'xxxqe', 'blockquote', 3)
    # strikethrough
    st =   partial(enclose.__func__, 'xxxdelb', 'xxxdele', 'del', 1)
    # horizontal rule
    hr =   partial(prepend.__func__, 'xxxhr', 'hr')
    

transform_funcs = [md.parse, md.hL, md.hM, md.hS, md.lst, md.bqt, 
                   md.code, md.tbl, md.st, md.txt, md.lnk, md.img, 
                   md.hr, md.get_text, md.sym]

apply all of the cleanup functions in `transform_funcs` in order.

## Display Cleaned Text

In [120]:
res = compose(transform_funcs)(mdtext)
print(res)

xxxhl This is a test markdown that has different types of formatting
xxxhm Header 2
xxxtxt Hello world this is going to be a long sentence that. also has a newline:
 lala lala
xxxtxt continuing the sentence here.
more text.
xxxlistB bullet 1
bullet 2
xxxlistE
xxxtxt "quoted text"
xxxhs a small header
xxxcdb lang-python def something(x):
 """docstring"""
 return False
xxxcde
xxxtxt @mention somebody
xxxqb blockquote text is here!
xxxqe
xxxtxt [hobbit-hole][1]
xxxlistB first
second
xxxlistE
xxxqb what this?
xxxqe
xxximg Tux, the Linux mascot xxximgf png
xxxtxt [1]: xxxhtml"Hobbit lifestyles"
xxtbl First Header|Second Header
xxxhr xxxtxt Hello xxxcdb something xxxcde here and stuff is there. google xxxlnkhb *URL* xxxlnkhe. random text
xxxtxt xxxdelb The world is flat. xxxdele We now know that the world is round.
xxxlistB [x] Write the press release
[ ] Update the website
[ ] Contact the media
xxxlistE
xxxtxt xxxcdb *URL* xxxcde


# Test on Real Issues

#### Download Sample File

In [125]:
import pandas as pd
import codecs
pd.set_option('max_colwidth', 3000)

df = pd.concat([pd.read_csv(f'https://storage.googleapis.com/issue_label_bot/language_model_data/00000000000{i}.csv.gz' 
               )
                for i in range(1)])

df['clean_body'] = ''

#### Display Dataframe With Cleaned Issue Bodies see `cleaned_body` field

In [123]:
sdf = df.sample(20)

for i, b in tqdm_notebook(enumerate(sdf.body), total=len(sdf)):
    try:
        sdf['clean_body'].iloc[i] = compose(transform_funcs)(b)
    except:
        print(f'error at: {i}')
        break

sdf[['url', 'body', 'clean_body']]

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




Unnamed: 0,url,body,clean_body
121568,https://github.com/flutter/flutter/issues/7626,"Tickers return Futures.xxxlnbrkIt's possible for the following sequence of events to happen:xxxlnbrk - a transient callback for a ticker results in that ticker's future completingxxxlnbrk - during build, we tear down the worldxxxlnbrk - after the frame, the ticker's future's callbacks get called, expecting the world to still be aroundxxxlnbrkThis leads to people writing pretty-looking code tha...","xxxtxt Tickers return Futures.\nIt's possible for the following sequence of events to happen:\nxxxlistB a transient callback for a ticker results in that ticker's future completing\nduring build, we tear down the world\nafter the frame, the ticker's future's callbacks get called, expecting the world to still be around\nThis leads to people writing pretty-looking code that just doesn't work in ..."
15410,https://github.com/spring-cloud/spring-cloud-contract/issues/1045,"## Summary of problemxxxlnbrkIn consumer, contracts placed on a classpath are not found if `StubsMode.CLASSPATH` is used together with JUnit 4's `StubRunnerRule`. Everything works fine with `AutoConfigureStubRunner` when the same parameters are used.xxxlnbrkThe immediate reason of that behavior is provided later on in this report. xxxlnbrk## Steps to reproducexxxlnbrk1. Put a generated xxxdblq...","xxxhm Summary of problem\nxxxtxt In consumer, contracts placed on a classpath are not found if xxxcdb StubsMode.CLASSPATH xxxcde is used together with JUnit 4's xxxcdb StubRunnerRule xxxcde. Everything works fine with xxxcdb AutoConfigureStubRunner xxxcde when the same parameters are used.\nThe immediate reason of that behavior is provided later on in this report.\nxxxhm Steps to reproduce\nxx..."
175946,https://github.com/jpsim/SourceKitten/issues/559,"We have a server-side project (using SPM) and have a script to generate documentation for it. However, when trying to migrate to Swift 4.2 (from 4.1.2), we noticed that in our docker container on CI, the documentation couldn't be generated anymore for one of the modules (the other modules in our project worked fine). Basically, what happened was that the documentation generation started becomi...","xxxtxt We have a server-side project (using SPM) and have a script to generate documentation for it. However, when trying to migrate to Swift 4.2 (from 4.1.2), we noticed that in our docker container on CI, the documentation couldn't be generated anymore for one of the modules (the other modules in our project worked fine). Basically, what happened was that the documentation generation started..."
163408,https://github.com/nuxsmin/docker-syspass/issues/3,"have installed syspass via `docker compose`, but there is not 2fa available in the preferences panel for a user. The live demo shows this, and I have reinstalled several times to ensure I was not missing a configuration step.","xxxtxt have installed syspass via xxxcdb docker compose xxxcde, but there is not 2fa available in the preferences panel for a user. The live demo shows this, and I have reinstalled several times to ensure I was not missing a configuration step."
82431,https://github.com/reek/anti-adblock-killer/issues/1739,"@reek xxxlnbrkCan you add the following to block ads on respective sitexxxlnbrkndtv.com `@@||ndtv.com^*/taboola*/*/*`xxxlnbrkvodlocker `@@||mgid.com^$script,domain=vodlocker.com`","xxxtxt @reek Can you add the following to block ads on respective site\nndtv.com xxxcdb @@||ndtv.com^* xxxfilepath xxxcde\nvodlocker xxxcdb @@||mgid.com^$script,domain=vodlocker.com xxxcde"
173079,https://github.com/sensebox/openSenseMap/issues/179,Add a save as .PNG function to the plots. At the moment the online possibility is to take a screenshot.,xxxtxt Add a save as .PNG function to the plots. At the moment the online possibility is to take a screenshot.
179829,https://github.com/osxfuse/osxfuse/issues/281,"Hi, xxxlnbrkI connect to a linux fileserver with approx. 65 TB of free space. I am connecting to the fileserver using sshfs and osxfuse, Instead of seeing the 65 TB, I see only 10GB of free space. Finder uses the wrong amount to calculate if I can move files to the fileserver or not. xxxlnbrkThe problem is not the Mac or finder because using scp I can transfer the files. xxxlnbrkCheers Alexander","xxxtxt Hi, I connect to a linux fileserver with approx. 65 TB of free space. I am connecting to the fileserver using sshfs and osxfuse, Instead of seeing the 65 TB, I see only 10GB of free space. Finder uses the wrong amount to calculate if I can move files to the fileserver or not. The problem is not the Mac or finder because using scp I can transfer the files. Cheers Alexander"
93290,https://github.com/keras-team/keras/issues/10190,"Please make sure that the boxes below are checked before you submit your issue. If your issue is an implementation question, please ask your question on [StackOverflow](http://stackoverflow.com/questions/tagged/keras) or [join the Keras Slack channel](https://keras-slack-autojoin.herokuapp.com/) and ask there instead of filing a GitHub issue.xxxlnbrkThank you!xxxlnbrk- [ ] Check that you are u...","xxxtxt Please make sure that the boxes below are checked before you submit your issue. If your issue is an implementation question, please ask your question on StackOverflow xxxlnkhb stackoverflow.com xxxlnkhe or join the Keras Slack channel xxxlnkhb keras-slack-autojoin.herokuapp.com xxxlnkhe and ask there instead of filing a GitHub issue.\nThank you!\nxxxlistB [ ] Check that you are up-to-da..."
148781,https://github.com/beautifularea/tOpenSSL/issues/8,"HTTP/1.1 426 Upgrade Requiredxxxlnbrk Upgrade: TLS/1.0, HTTP/1.1xxxlnbrk Connection: Upgradexxxlnbrk","xxxcdb HTTP/1.1 426 Upgrade Required\n Upgrade: TLS/1.0, HTTP/1.1\n Connection: Upgrade\nxxxcde"
161180,https://github.com/IFSight/d4m-nfs/issues/11,"I started using d4m-nfs, but my scripts changing file ownership cannot run. I get:xxxlnbrkchown: changing ownership of 'filename_here': Operation not permittedxxxlnbrkHow can I solve that?","xxxtxt I started using d4m-nfs, but my scripts changing file ownership cannot run. I get:\nchown: changing ownership of 'filename_here': Operation not permitted\nHow can I solve that?"


### Playground To Inspect And Troubleshoot

Change `idx` value corresponding to dataframe above

In [None]:
idx = 29950

In [119]:
tst = sdf.ix[idx].clean_body
print(tst)

22:12:18 ==622==ERROR: AddressSanitizer: alloc-dealloc-mismatch (operator new [] vs operator delete) on 0x602000034030
22:12:19 ==622==ABORTING
xxxcde


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


In [102]:
re.sub(r'<.+?>.+?</.+?>|<[a-zA-Z]{1,}.*?>', 'xxxhtml', sdf.ix[3227].body, re.DOTALL)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


'xxxhtmlxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtmlxxxlnbrkxxxhtml xxxhtml'

In [103]:
tst = _

In [105]:
re.sub('(xxxhtml(xxxlnbrk)?(\s)?)+', 'xxxhtml', tst)

'xxxhtml'

# Notes

- handle @mentions
- would be nice to detect code that isnt inside \`\`\` code blocks
- handle ip addresses
- consider getting rid of `xxxtext`