# Easily export jupyter cells to python module
https://github.com/fastai/course-v3/blob/master/nbs/dl2/notebook2script.py

In [1]:
! python /tf/main/src/scripts/notebook2script.py prep.ipynb

Converted prep.ipynb to exp/nb_prep.py


In [None]:
! pip install sklearn

In [None]:
#export
import json
import re
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
pd.set_option('max_colwidth',300)
from pprint import pprint

In [None]:
# Root directory from which every dataset path in this notebook is derived.
data_path = Path('/tf/data/datasets')
# Recursively collect every .gz file under mthds_cmts/java, sorted by path.
java_files = sorted(Path(data_path/'mthds_cmts/java/').glob('**/*.gz'))
java_files

In [None]:
#export
def jsonl_df(file_list, columns, compression=None):
    """Read several JSON-lines files and stack the selected columns.

    Each file in ``file_list`` is parsed with ``pd.read_json(..., lines=True)``
    using the given ``compression`` (pass ``'gzip'`` for ``.jsonl.gz`` files),
    reduced to ``columns``, and the per-file frames are concatenated in order.
    Row indices are kept as read, so they restart at 0 for every file.
    """
    frames = []
    for source in file_list:
        parsed = pd.read_json(
            source,
            orient='records',
            compression=compression,
            lines=True,
        )
        frames.append(parsed[columns])
    return pd.concat(frames, sort=False)

In [None]:
java_df = jsonl_df(java_files, ['code', 'docstring'], 'gzip').rename(columns={"code": "query", "docstring": "res"})

In [None]:
java_df.head(3)

In [None]:
len(java_df)

In [None]:
java_df.to_csv(data_path/"mthds_cmts/mthds_cmts.csv")

In [None]:
# Concatenate the two so_posts CSV files into a single DataFrame.
csvs = [data_path/'so_posts/1-50000_10-3-19.csv', data_path/'so_posts/50000-56942_10-3-19.csv']
df = pd.concat([pd.read_csv(f)
           for f in csvs], sort=False)

In [None]:
df.head()

In [None]:
len(df)

In [None]:
df.to_csv(data_path/'so_posts/56942_10-3-19.csv', index=False)

In [None]:
df = pd.read_csv(data_path/"so_posts/56942_10-3-19_formated.csv")
df.head()

In [None]:
#beautifulsoup version
white_list = ["p"]

In [None]:
#export
# Tag names stripped from posts. Order is irrelevant: the lookahead in
# _SO_TAG_RE forces the *whole* tag name to match, so "s" can no longer
# swallow "<string>" and "i" can no longer swallow "<int>".
_SO_TAG_NAMES = (
    "span", "code", "p", "hr", "h[1-3]", "a", "b", "blockquote", "del", "dd",
    "dl", "dt", "em", "i", "img", "kbd", "li", "ol", "pre", "s", "sup", "sub",
    "strong", "strike", "ul", "br",
)
# "<", optional "/", one listed tag name, then whitespace, "/" or ">" must
# follow the name; any attributes up to the closing ">" are consumed too.
_SO_TAG_RE = re.compile(
    r"</?(?:" + "|".join(_SO_TAG_NAMES) + r")(?=[\s/>])[^>]*>"
)


def clean_html(post):
    """Remove the HTML tags that can occur inside a SO post.

    Only the listed tags (opening, closing, or self-closing, with or without
    attributes) are deleted; their inner text is kept. Matching is
    case-sensitive and requires the complete tag name, so angle-bracket
    expressions in code such as ``vector<int>``, ``List<String>`` or
    ``a <=p`` are left untouched.

    Args:
        post: The post body as a string.

    Returns:
        ``post`` with every matching tag removed.
    """
    return _SO_TAG_RE.sub("", post)

In [None]:
#export
def post_df(df):
    """Build a ``query``/``res`` DataFrame from scraped SO posts.

    For every row, the query is the question body followed by the title
    (joined with a blank line) and the response is the answer body; both are
    passed through ``clean_html``.
    """
    questions = [
        clean_html('\n\n'.join(body_and_title))
        for body_and_title in zip(df["q_body"], df["title"])
    ]
    answers = [clean_html(answer) for answer in df["a_body"]]
    return pd.DataFrame({"query": questions, "res": answers})

In [None]:
formated_df = post_df(df)
len(formated_df)

In [None]:
formated_df.to_csv(data_path/'so_posts/56942_10-3-19_formated.csv', index=False)

In [None]:
post = """<p>I have the following code:<hr>
<code>
/**
 * Generic version of the Box class.
 * @param <T> the type of the value being boxed
 */
public class Box<T> {
    // T stands for "Type"
    private T t;

    public void set(T t) { this.t = t; }
    public T get() { return t; }
}
</code>
</p>"""

In [None]:
post = """<p>I have the following code:<hr>
<code>
/**
 * Generic version of the Box class.
 * @param <T> the type of the value being boxed
 */
public class Box<T> {
    // T stands for "Type"
    private T t;

    public void set(T t) { this.t = t; }
    public T get() { return t; }
}
</code>
<a>              - hyperlink.
<b>              - bold, use as last resort <h1>-<h3>, <em>, and <strong> are 
                   preferred.
<blockquote>     - specifies a section that is quoted from another source.
<code>           - defines a piece of computer code.
<del>            - delete, used to indicate modifications.
<dd>             - describes the item in a <dl> description list.
<dl>             - description list.
<dt>             - title of an item in a <dl> description list.
<em>             - emphasized.
<h1>, <h2>, <h3> - headings.
<i>              - italic.
<img>            - specifies an image tag.
<kbd>            - represents user input (usually keyboard input).
<li>             - list item in an ordered list <ol> or an unordered list <ul>.
<ol>             - ordered list.
<p>              - paragraph.
<pre>            - pre-element displayed in a fixed width font and and 
                   unchanged line breaks.
<s>              - strikethrough.
<sup>            - superscript text appears 1/2 character above the baseline 
                   used for footnotes and other formatting.
<sub>            - subscript appears 1/2 character below the baseline.
<strong>         - defines important text.
<strike>         - strikethrough is deprecated, use <del> instead.
<ul>             - unordered list.
<br>             - line break.
<hr>             - defines a thematic change in the content, usually via a 
                   horizontal line.
</p>"""

In [None]:
# Try the tag-stripping regex on the sample `post` defined above and print
# what is left after the substitution.
import re
result = re.sub(r"<.?p[^>]*>|<.?hr[^>]*>|<.?h[1-3][^>]*>|<.?a[^>]*>|<.?b[^>]*>|<.?blockquote[^>]*>|<.?code[^>]*>|<.?del[^>]*>|<.?dd[^>]*>|<.?dl[^>]*>|<.?dt[^>]*>|<.?em[^>]*>|<.?i[^>]*>|<.?img[^>]*>|<.?kbd[^>]*>|<.?li[^>]*>|<.?ol[^>]*>|<.?pre[^>]*>|<.?s[^>]*>|<.?sup[^>]*>|<.?sub[^>]*>|<.?strong[^>]*>|<.?strike[^>]*>|<.?ul[^>]*>|<.?br[^>]*>", "", post)
print(result)

In [None]:
#export
def split_data(df, random_state=None):
    """Split a DataFrame into training, validation, and testing sets.

    20% of the rows are held out first, and that held-out part is then cut
    in half, giving roughly an 80/10/10 split.

    Args:
        df: The DataFrame to split.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls. Leave as ``None`` for a different shuffle on every call;
            pass an int to make the split reproducible.

    Returns:
        A ``(df_trn, df_val, df_tst)`` tuple.
    """
    df_trn, held_out = train_test_split(
        df, test_size=0.2, random_state=random_state
    )
    df_val, df_tst = train_test_split(
        held_out, test_size=0.5, random_state=random_state
    )

    return df_trn, df_val, df_tst

In [None]:
df = pd.read_csv(data_path/"so_posts/56942_10-3-19_formated.csv")
df.head()

In [None]:
df_trn, df_val, df_tst = split_data(df)

In [None]:
assert len(df) == (len(df_trn) + len(df_val) + len(df_tst))

In [None]:
# Save the three splits as trn/val/tst CSV files in the so_posts directory,
# without writing the row index.
df_trn.to_csv(data_path/'so_posts/trn.csv', index=False)
df_val.to_csv(data_path/'so_posts/val.csv', index=False)
df_tst.to_csv(data_path/'so_posts/tst.csv', index=False)

In [None]:
df_trn.head()

In [None]:
df_val.head()

In [None]:
df_tst.head()

In [None]:
#export
def save_splits(dfs, output_path):
    """Write the train/validation/test DataFrames to csv files.

    ``dfs[0]``, ``dfs[1]`` and ``dfs[2]`` are written to ``trn.csv``,
    ``val.csv`` and ``tst.csv`` inside ``output_path`` (a ``Path``), without
    the row index.
    """
    for position, filename in enumerate(('trn.csv', 'val.csv', 'tst.csv')):
        dfs[position].to_csv(output_path/filename, index=False)

In [None]:
#export
def save_dfs(df, output_path):
    """Split a DataFrame and save the pieces as csv files in output_path."""
    train_part, valid_part, test_part = split_data(df)

    # Sanity check: the three parts together must account for every row.
    total_rows = len(train_part) + len(valid_part) + len(test_part)
    assert len(df) == total_rows

    save_splits((train_part, valid_part, test_part), output_path)

In [None]:
save_dfs(java_df, data_path/"mthds_cmts")

In [None]:
type(java_df["query"])

In [None]:
#export
# Marker appended to each query so the task a row belongs to stays visible
# after the tasks are merged. Keys are the task (directory) names.
tags = {"mthds_cmts": "<$comment$>", "so_posts": "<$qa$>", "code_smell": "<$dirty$>"}
def tag_task(df, task_name):
    """Add the task's special tag to the end of every query.

    The ``query`` column of ``df`` is overwritten in place and the same
    DataFrame is returned.

    Args:
        df: DataFrame with a ``query`` column of strings.
        task_name: Key into ``tags`` selecting which marker to append.

    Returns:
        ``df`` with the tagged ``query`` column.

    Raises:
        KeyError: If ``task_name`` is not a key of ``tags``.
    """
    suffix = tags[task_name]
    # Assign a plain list, not a new Series: a Series would be aligned on a
    # fresh 0..n-1 index and yield NaN for frames with any other index.
    df["query"] = [query + suffix for query in df["query"]]

    return df

In [None]:
#export
def read_data(path):
    """Load the trn/val/tst csv files found directly under ``path``.

    Returns a ``(train, validation, test)`` tuple of DataFrames read from
    ``trn.csv``, ``val.csv`` and ``tst.csv`` in that order.
    """
    split_files = ("trn.csv", "val.csv", "tst.csv")
    return tuple(pd.read_csv(path/filename) for filename in split_files)

In [None]:
#export
def tag_tasks(path):
    """Tag all tasks that exist in some path.

    Every sub-directory of ``path`` except ``merged`` is treated as one task:
    its trn/val/tst splits are read with ``read_data`` and each split is
    tagged with ``tag_task`` using the directory's stem as the task name.

    Args:
        path: ``Path`` of the directory holding one sub-directory per task.

    Returns:
        A list with one ``[trn, val, tst]`` list of tagged DataFrames per
        task, ordered by task path.
    """
    dfs = []
    # Sort for a stable task order: glob order depends on the filesystem.
    for task_path in sorted(path.glob("*")):
        # Plain files (stray csvs, OS metadata, ...) are not tasks, and the
        # merged output directory must not be fed back in as an input.
        if not task_path.is_dir() or task_path.stem == "merged":
            continue
        task_name = task_path.stem
        dfs.append([tag_task(split, task_name) for split in read_data(task_path)])

    return dfs

In [None]:
# Tag every task under data_path, then print the first training query of the
# first two tasks as a spot check that the tags were appended.
dfs = tag_tasks(data_path)
print(dfs[0][0]["query"][0], dfs[1][0]["query"][0])

In [None]:
trn_dfs = next(zip(*dfs))
print(len(trn_dfs[0]), len(trn_dfs[1]))

In [None]:
#export
def merge_dfs(path, output):
    """Tag every task under ``path`` and merge them split by split.

    The training frames of all tasks are concatenated into one frame, and
    likewise for validation and testing, each with a fresh index. The merged
    frames are written to ``output`` via ``save_splits`` and also returned as
    a list.
    """
    per_task = tag_tasks(path)
    merged_dfs = [
        pd.concat(same_split, ignore_index=True)
        for same_split in zip(*per_task)
    ]
    save_splits(merged_dfs, output)

    return merged_dfs

In [None]:
merged_dfs = merge_dfs(data_path, data_path/"merged")
# merged_dfs[0]["query"][0]

In [None]:
tst = [("trn1", "tst1"), ("trn2", "tst2"), ("trn3", "tst3")]

list(zip(*tst))

In [None]:
merged_dfs[0].head()

In [None]:
print(merged_dfs[0]["query"][50000])

In [None]:
len(merged_dfs[0])