# -1: Environment Setup

### Install Dependencies

In [1]:
!python -VVV

Python 3.9.13 | packaged by conda-forge | (main, May 27 2022, 16:58:50) 
[GCC 10.3.0]


In [2]:
!pip install pandas~=1.4.3 numpy~=1.23.1 nltk~=3.7 gensim~=4.2.0 textblob~=0.17.1 scikit-learn~=1.1.1 hypy_utils~=1.0.10 jsonpickle~=2.2.0 requests~=2.28.1



In [3]:
from __future__ import annotations

import json
import os
import shlex
import tempfile
import warnings
import xml.etree.ElementTree as ET
from datetime import datetime
from pathlib import Path
from subprocess import check_output, check_call
from typing import NamedTuple

import dateutil.parser
import nltk
import pandas as pd
from hypy_utils import json_stringify
from nltk.corpus import stopwords

warnings.filterwarnings('ignore')

In [4]:
apps: dict[str, str] = json.loads(Path('selected_apps.json').read_text())
print('Number of selected apps/repos:', len(apps))

Number of selected apps/repos: 612


In [5]:
DIR_GITHUB = Path('github')
DIR_REVIEWS = Path('reviews')

DIR_REPO_ISSUES =         lambda repo: DIR_GITHUB / 'repos' / repo / 'issues.json'
DIR_REPO_ISSUE_COMMENTS = lambda repo, issue: DIR_GITHUB / 'repos' / repo / 'issues' / str(issue) / 'comments.json'
DIR_REPO_COMMITS =        lambda repo: DIR_GITHUB / 'commits' / repo / 'commits.json'
DIR_REPO_STRINGS_XML =    lambda repo: DIR_GITHUB / 'source' / 'strings.xml' / f"{repo.replace('/', '-')}.xml"
DIR_REPO_LOC =            lambda repo: DIR_GITHUB / 'source' / 'loc' / f"{repo.replace('/', '-')}.txt"

DIR_APP_REVIEWS =         lambda pkg: DIR_REVIEWS / 'play_store' / f'{pkg}.json'

# 2. Data Collection, Cleaning, Preprocessing

### 2.1. Issue Extractor (Cleaning Existing Data)

In [6]:
import shutil


class ExtractedIssue(NamedTuple):
    id: int
    title: str
    desc: str
    author: str

    # Dates
    opened: str
    modified: str
    closed: str | None

    comments: list[ExtractedComment]


class ExtractedComment(NamedTuple):
    issue: int
    author: str
    date: str
    text: str


def issue_extract(repo: str) -> list[ExtractedIssue]:
    """
    From raw GitHub data, collect title, description, author, open/close time, and comments of issues

    :param repo: GitHub repo in the format of "owner/repo"
    """
    issues = json.loads(DIR_REPO_ISSUES(repo).read_text('utf-8'))
    comments = {i['number']: DIR_REPO_ISSUE_COMMENTS(repo, i['number']) for i in issues}
    comments = {id: json.loads(path.read_text('utf-8')) if path.is_file() else [] for id, path in comments.items()}
    comments = {id: [ExtractedComment(id, c['user']['login'], c['created_at'], c['body']) for c in cs] for id, cs in comments.items()}
    issues = [ExtractedIssue(int(i['number']), i['title'], i['body'], i['user']['login'], i['created_at'], i['updated_at'], i['closed_at'], comments[i['number']])
              for i in issues]
    return issues


# Test issue extractor
pd.DataFrame(issue_extract('zxing/zxing'))[:5]

Unnamed: 0,id,title,desc,author,opened,modified,closed,comments
0,1513,A new release soon?,"Hello,\r\nFirst of all, thanks for this great ...",Altonss,2022-04-14T18:40:50Z,2022-04-16T07:19:26Z,,"[(1513, srowen, 2022-04-14T18:52:01Z, @AlexGel..."
1,1508,Share the ECI string builder of the data matri...,"Besides the benefit of code reuse, the shared ...",AlexGeller1,2022-03-11T08:28:59Z,2022-03-13T14:40:09Z,2022-03-13T14:40:09Z,[]
2,1507,Add multi-eci decoding for PDF417,Also fixed issue that some multi-eci encoded P...,AlexGeller1,2022-03-07T21:23:59Z,2022-03-09T14:24:37Z,2022-03-09T14:24:37Z,[]
3,1506,Add support for multi-eci encoding for PDF417,By default multi-ECI encoding is disabled so t...,AlexGeller1,2022-03-04T14:35:02Z,2022-03-05T14:07:47Z,2022-03-05T14:07:37Z,[]
4,1505,Fatal Exception: java.lang.ArrayIndexOutOfBoun...,Fatal Exception: java.lang.ArrayIndexOutOfBoun...,RajnishGowreesunker,2022-03-03T10:18:37Z,2022-04-12T10:40:53Z,2022-03-03T13:07:23Z,"[(1505, thestinger, 2022-04-12T10:40:53Z, I th..."


### 2.2. Commit Extractor (Gathering New Data)

Also collects strings.xml

In [7]:
class ExtractedCommit(NamedTuple):
    sha: str
    author: str
    email: str
    time: str
    message: str
    file_names: list[str]

    def get_time(self) -> datetime:
        return dateutil.parser.isoparse(self.time)


def git_log(path: Path) -> list[ExtractedCommit]:
    """
    Call and parse git log. This function requires that git>=2.37.1 is installed on your system.

    :param path: Path of git repository
    :return: List of commits
    """
    # check_call(shlex.split('git config diff.renames 0'))
    cmd = f"git -c 'diff.renamelimit=0' -c 'diff.renames=0' -C '{path.absolute()}' log --name-status --diff-filter=AMD --pretty=format:'START_COMMIT_QwQ %H%n%aN%n%aE%n%aI%n%s%n'"
    log = check_output(shlex.split(cmd)).decode('utf-8', 'ignore')

    def extract_commit(block: str) -> ExtractedCommit:
        try:
            lines = block.split('\n')
            sha, author, email, date, message = lines + [""] if len(lines) == 4 else lines[:5]
            files = [f.replace('\t', '/') for f in lines[6:]]
            return ExtractedCommit(sha, author, email, date, message, files)
        except Exception as e:
            print(f'========== Commit Extract Error {e} ==========\n{block}\n==========')

    return [extract_commit(c.strip()) for c in log.split('START_COMMIT_QwQ') if c]


def find(name: str, path: Path) -> Path:
    for root, dirs, files in os.walk(path):
        if name in files:
            return Path(root, name)


def commit_collect(repo: str, verbose: bool = False) -> list[ExtractedCommit]:
    """
    Extract commits from GitHub repo, collect sha, time, file names, author, message

    Also extracts strings.xml for GUI terms exclusion.
    Also counts the number of lines of java code.

    :param repo:
    :param verbose:
    :return:
    """
    out_path = DIR_REPO_COMMITS(repo)
    strings_out = DIR_REPO_STRINGS_XML(repo)
    strings_out.parent.mkdir(parents=True, exist_ok=True)
    loc_out = DIR_REPO_LOC(repo)
    loc_out.parent.mkdir(parents=True, exist_ok=True)

    if out_path.is_file():
        return [ExtractedCommit(*d) for d in json.loads(out_path.read_text('utf-8'))]

    with tempfile.TemporaryDirectory() as tmp:
        tmp = Path(tmp)

        # tmp = Path(f'/tmp/github/{repo}')
        # tmp.mkdir(parents=True, exist_ok=True)

        # Clone repository
        if verbose:
            print(f'Cloning {repo} into {tmp}')
        if not (tmp / '.git').is_dir():
            check_call(shlex.split(f'git clone --quiet "https://github.com/{repo}" "{tmp}"'))

        # Extract strings.xml
        strings = find('strings.xml', tmp)
        if strings is not None:
            shutil.copyfile(strings, strings_out)

        # Count lines of java code
        java_files = [Path(root, f) for root, dirs, files in os.walk(tmp) for f in files if f.endswith('java')]
        loc = sum(f.read_bytes().decode('utf-8', 'ignore').count('\n') for f in java_files)
        loc_out.write_text(str(loc))

        # Get git log to json
        commits = git_log(tmp)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(json_stringify(commits), 'utf-8')
        return commits


# Test commit extract
pd.DataFrame(commit_collect('zxing/zxing'))[:5]

Unnamed: 0,sha,author,email,time,message,file_names
0,45df47022794eedcffd7e4409200100be5b8e631,Sean Owen,srowen@gmail.com,2022-07-20T13:29:55-05:00,"Add basic support for thread interruption, and...",[M/core/src/main/java/com/google/zxing/MultiFo...
1,2369986a0d97d626c2e453b7e3e8a2d2a6a04cff,Daniel Gredler,daniel.gredler@gmail.com,2022-07-19T19:36:01-04:00,Add description for OkapiBarcode project (#1545),[M/README.md]
2,8ecde0088f521579616b7ec6ebad5aeb3eaa03b2,Daniel Gredler,daniel.gredler@gmail.com,2022-07-19T19:12:52-04:00,Fix Maxicode decoding of CR (carriage return);...,[M/core/src/main/java/com/google/zxing/maxicod...
3,5231442ebc14ae7f1eb6d916983705b682d1bf96,Sean Owen,srowen@gmail.com,2022-07-06T13:16:34-05:00,Misc dependency updates,"[M/core/pom.xml, M/core/src/main/java/com/goog..."
4,78c2a9c4401e166be22ba1483cc1c7d090bdf8b8,Sean Owen,srowen@gmail.com,2022-07-05T18:34:45-05:00,"In multi QR code finder, only consider multipl...",[M/core/src/main/java/com/google/zxing/multi/q...


### 2.3. Download Commits for All Apps

In [8]:
from hypy_utils.tqdm_utils import pmap

_ = pmap(commit_collect, apps.values(), desc='Caching commits...')

Caching commits...: 100%|██████████| 612/612 [00:11<00:00, 51.42it/s]


### 2.4. Preprocessing Commits & Issues

In [9]:
import string
from hypy_utils.nlp_utils import camel_split
from nltk.stem import PorterStemmer


STOPWORDS = {w.lower() for w in set(Path('data/stopwords.txt').read_text('utf-8').splitlines())\
    .union(stopwords.words('english'))}
STEMMER = PorterStemmer()


def gui_excluded_stopwords(repo: str) -> set[str]:
    """
    Get the set of stop words excluding GUI stop words

    :param repo:
    :return:
    """
    strings_xml = DIR_REPO_STRINGS_XML(repo)

    if not strings_xml.is_file():
        return STOPWORDS

    gui_words = {w.lower() for el in ET.parse(strings_xml).getroot().iter('string') for w in el.attrib['name'].split('_')}
    return STOPWORDS - gui_words


def process_text(text: str, stop: set[str]) -> str:
    """
    Camel case splitting, underscore splitting, stopword removal, stem

    :param text: Original text
    :param stop: Stopword set to use
    :return: Processed text
    """
    # Remove links
    text = ' '.join(t for t in text.split(' ') if not (t.startswith('http://') or t.startswith('https://')))

    # Remove punctuations (replace punctuations with spaces)
    text = ''.join(c if c not in string.punctuation else ' ' for c in text)
    while '  ' in text:
        text = text.replace('  ', ' ')

    # Tokenize
    text = nltk.word_tokenize(text)

    # Camel split
    text = [s for w in text for s in camel_split(w)]

    # Remove stopwords and stem
    text = [STEMMER.stem(w) for w in text if w not in stop]

    # Join
    return ' '.join(text)


def preprocess_issues(repo: str) -> dict[int, str]:
    """
    Preprocess issues for repo

    :param repo:
    :return: dict[issue_id] = preprocessed issue text
    """
    issues: list[ExtractedIssue] = issue_extract(repo)

    # Combine title and body
    issues: dict[int, str] = {i.id: i.title + ' ' + i.desc for i in issues}

    # Process text
    issues = {id: process_text(text, gui_excluded_stopwords(repo)) for id, text in issues.items()}

    return issues


def preprocess_commits(repo: str) -> dict[str, str]:
    """
    Preprocess commits for repo

    :param repo:
    :return: dict[commit_hash] = preprocessed commit text
    """
    commits: list[ExtractedCommit] = commit_collect(repo)

    # Combine commit message with names modified (without extensions)
    commits: dict[str, str] = {c.sha: c.message + ' ' + '  '.join(Path(f).stem for f in c.file_names) for c in commits}

    # Process text
    commits = {sha: process_text(text, gui_excluded_stopwords(repo)) for sha, text in commits.items()}

    return commits


pd.DataFrame(preprocess_commits('zxing/zxing').items(), columns=['sha', 'processed'])[:10]

Unnamed: 0,sha,processed
0,45df47022794eedcffd7e4409200100be5b8e631,add basic interrupt add protect run request we...
1,2369986a0d97d626c2e453b7e3e8a2d2a6a04cff,add descript okapi barcod project 1545 readm
2,8ecde0088f521579616b7ec6ebad5aeb3eaa03b2,fix maxicod decod cr carriag fix 1543 1544 dec...
3,5231442ebc14ae7f1eb6d916983705b682d1bf96,misc depend updat pom bit matrix eci string bu...
4,78c2a9c4401e166be22ba1483cc1c7d090bdf8b8,in multi qr code finder consid multipli center...
5,75dbbb00dd10a5f3683553e1d56b8209b9db48e2,micro optim to help high level encod
6,78faea80583e6fc903bdb8c146e8ff499b60c307,prevent encod empti pdf417 1523 pd f417 high l...
7,c7a7b30f0497e01bed97fd465e4645594ee35fda,add orient pdf417 barcod to result metadata 15...
8,83cdc82aa15feb819f7ee4906d473874fbf4c93a,updat queri mobil web page add integr to sure ...
9,3aa3832840af45083a10ee6a9247c4f82988d120,readm md descript nu book 1521 readm


### 2.5. Load Reviews Data

In [10]:
class ExtractedReview(NamedTuple):
    id: str
    score: int
    text: str
    version: str
    date: str

    # TODO: Run AR-Miner on new data
    # is_informative: bool


def load_reviews(app: str) -> list[ExtractedReview]:
    """
    Load reviews of an app

    :param app: App package name
    :return: Reviews
    """
    return [ExtractedReview(*[r[k] for k in ['id', 'score', 'text', 'version', 'date']])
            for r in json.loads(DIR_APP_REVIEWS(app).read_text())]


pd.DataFrame(load_reviews('a2dp.Vol'))[:10]

Unnamed: 0,id,score,text,version,date
0,gp:AOqpTOFZ0qo77Q1UDtsJTsz1Ews-2yeQ5HPYwya9BQJ...,5,I Love this app! Love it!,2.13.0.4,2021-11-24T01:22:28.905Z
1,gp:AOqpTOEVnCNHAHXf66-iUzhXvkZXCClV-cUtEOyc535...,5,LsabethDixon Save,2.13.0.4,2021-10-21T16:47:37.307Z
2,gp:AOqpTOGuq2gEmLXVMViXdws7HyYVSh3VgCnSutnggup...,1,Garbage,,2021-08-09T07:29:27.390Z
3,gp:AOqpTOEGmWHw6FP1w4z4u1m9NiEaHhIgrh2LRxTJP6U...,5,Best,2.12.9.3,2021-04-15T10:57:46.540Z
4,gp:AOqpTOHUpUu7Df5TGUTbos5RIKIGoUciBOsMd9tu-44...,5,It works on my smartphone for improving latency.,2.13.0.4,2021-04-07T12:41:52.371Z
5,gp:AOqpTOHIL64lE1_pMzn0WtaJ0AMJawMbB-5Cc7foR09...,1,Idk but its not working on my realme 7 pro,2.13.0.4,2021-02-19T12:38:57.502Z
6,gp:AOqpTOGYFN3_JsrKBYPLfE3sc9mZVyviuMu_TKi-W8N...,5,Great for car / headphone pairing. Also reads ...,2.13.0.4,2021-02-16T12:20:19.290Z
7,gp:AOqpTOHo2avDn8q8wnCEeIFZyUPO8uxVxC9QcAQhbPm...,5,App is good for my stupid MIUI12 not memorizin...,,2020-12-20T21:23:43.183Z
8,gp:AOqpTOEh_a5zHOJYkBLW0DJBimNsWqR7diSEPoj1eUE...,1,Notthing worked,,2020-12-01T07:43:25.731Z
9,gp:AOqpTOGb1oR5k0eALLU2Ortql_ZN9umHIapu0IvnMfb...,1,"Does nothing. Unfriendly UI.No info, no help, ...",2.13.0.4,2020-11-09T20:30:29.165Z


# 3. Link Identifier

## 3.1. Link Issue/Commit to Reviews

Using Asymmetric Dice similarity

In [None]:
SIMILARITY_THRESHOLD = 0.3


def link_issues_to_reviews(app: str):
    """
    Link issues to reviews

    :param app: App package name
    :return:
    """
    repo = apps[app]

    reviews, issues = load_reviews(app), issue_extract(repo)

    # Loop through all reviews
    for review in reviews:
        # Filter relevant issues
        candidates = [i for i in issues if i.opened > ]



## 3.2. Linking Commits to Issues

Using the ReLink approach by Wu et al. (2011)

For this, we need to setup Java

In [11]:
!java --version

openjdk 11.0.16 2022-07-19
OpenJDK Runtime Environment (build 11.0.16+8)
OpenJDK 64-Bit Server VM (build 11.0.16+8, mixed mode)


In [None]:
RELINK_BIN = Path('./bin/relink')


def timestamp(iso_date: str) -> int:
    """
    Convert iso date string to java milliseconds timestamp for ReLink
    """
    return int(dateutil.parser.isoparse(iso_date).timestamp() * 1000)


def to_tsv(data: list[list[str]]) -> str:
    """
    Convert list of tuples to ReLink readable tab-separated values text format
    """
    return '\n'.join(['\t'.join([str(v).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ') for v in tup]) for tup in data])


def relink_write_bug_data(repo: str, bug_data_path: Path, bug_comments_path: Path):
    """
    Generate ReLink's BugData.txt (issues) input file and BugCommentData.txt (issue comments) input file.

    As stated in ReLink's readme file:

    > Bug-data file provides the basic bug information.
    > The format of the file is: bugID, Type Status, Owner, Reporter, ReportDate, ModifiedDate, LastDate, Summary, Comments
    > Bug-comment file provides the comments from the developers and the users.
    > The format of the file is: bugID, developer, date, comment
    """
    issues = issue_extract(repo)
    fields = [(i.id, 'Defect', 'Fixed', i.author, i.author, timestamp(i.opened), timestamp(i.modified), timestamp(i.closed), i.title, i.desc, '',
               *(c.author for c in i.comments))
              for i in issues if i.closed]
    bug_data_path.write_text(to_tsv(fields))
    comments = [(i.id, c.author, timestamp(c.date), c.text) for i in issues for c in i.comments if i.closed]
    bug_comments_path.write_text(to_tsv(comments))


def relink_write_changelogs(repo: str, changelog_path: Path):
    """
    Generate ReLink's changeLogs.txt (commits) input file.

    As stated in ReLink's readme file:

    > Change-log file provides the basic change log information.
    > The format of the file is: revisionNo, date, Author, Message

    However, in the example file, the format is: revisionNo, date, author, message + files, message
    """
    commits = commit_collect(repo)

    def transform_file(file: str):
        for short, full in [('A/', 'Added : '), ('M/', 'Modified : '), ('D/', 'Deleted : ')]:
            if file.startswith(short):
                return f'{full}{file[len(short):]}'
        print(f'Error parsing {file}')
        return file

    fields = [(i, timestamp(c.time), c.author, c.message + ' ' + ' '.join(transform_file(f) for f in c.file_names), c.message)
              for i, c in enumerate(commits)]
    changelog_path.write_text(to_tsv(fields))


def relink_run(repo: str):
    with tempfile.TemporaryDirectory() as tmp:
        tmp = Path(tmp)

        tmp = Path(f'/tmp/relink/{repo}')
        tmp.mkdir(parents=True, exist_ok=True)

        out_path = Path(f'process/relink/{repo}').absolute()
        out_path.mkdir(parents=True, exist_ok=True)
        (out_path / "links.txt").write_text('')

        # Create data files
        bug_data = tmp / 'BugData.txt'
        bug_comment = tmp / 'BugCommentData.txt'
        relink_write_bug_data(repo, bug_data, bug_comment)

        changelog = tmp / 'changeLogs.txt'
        relink_write_changelogs(repo, changelog)

        # Run relink
        cmd = f'java -jar "{tmp}/relink/ReLink.jar" {bug_data} {bug_comment} {changelog} {out_path / "index"} -g {out_path / "links.txt"}'
        print(cmd)
        check_call(shlex.split(f'ln -sf {RELINK_BIN.absolute()} {tmp}/relink'), cwd=tmp)
        check_call(shlex.split(cmd), cwd=tmp / 'relink')


relink_run('zxing/zxing')

java -jar "/tmp/relink/zxing/zxing/relink/ReLink.jar" /tmp/relink/zxing/zxing/BugData.txt /tmp/relink/zxing/zxing/BugCommentData.txt /tmp/relink/zxing/zxing/changeLogs.txt /workspace/P4/process/relink/zxing/zxing/index -g /workspace/P4/process/relink/zxing/zxing/links.txt
ReLink 1.0 is Copyright  2010-2012 - Tsinghua University.
The process has begun. It may cost a few hours to finish.
Please wait...
Indexing...
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_1 has been parsed.
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_2 has been parsed.
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_3 has been parsed.
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_4 has been parsed.
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_5 has been parsed.
The file /workspace/P4/process/relink/zxing/zxing/index/directory_vector/bug_6 has been parsed.
The file

# RQ1 - CRISTAL Linking Accuracy

### Table Generation

#### Table 1: List of apps used for manual evaluation

In [92]:
from hypy_utils.tqdm_utils import tq
import io
import requests
import zipfile


def get_f_droid_index() -> dict[str, dict]:
    """
    Get F-Droid Index
    """
    buf = requests.get('https://f-droid.org/repo/index-v1.jar').content
    j = zipfile.ZipFile(io.BytesIO(buf)).read('index-v1.json')
    lst = json.loads(j)['apps']
    return {a['packageName']: a for a in lst}


def table_1(apps: list[tuple[str, str]]) -> pd.DataFrame:
    """
    Table 1: List of apps used for manual evaluation

    Includes name, version, kloc, number of reviews, number of commits, number of issues

    :return: Table
    """
    fd = get_f_droid_index()
    apps = [(pkg, fd[pkg], repo) for pkg, repo in apps if pkg in fd]
    table = [{
        'name': next(iter(l['name'] for l in a['localized'].values() if 'name' in l), pkg),
        'version': a['suggestedVersionName'],
        'loc': DIR_REPO_LOC(repo).read_text(),
        'reviews': len(json.loads(DIR_APP_REVIEWS(pkg).read_text())),
        'commits': len(commit_collect(repo)),
        'issues': len(json.loads(DIR_REPO_ISSUES(repo).read_text()))
    } for pkg, a, repo in tq(apps, 'Generating table 1...')]
    return pd.DataFrame(table)


table_1(list(apps.items())[:20])

Generating table 1...: 100%|██████████| 20/20 [00:00<00:00, 61.34it/s] 


Unnamed: 0,name,version,loc,reviews,commits,issues
0,1010! Klooni,0.8.6,5420,12,204,79
1,/system/app mover,1.7.2,865,930,69,20
2,"10,000 sentences",0.3.4,6843,67,494,24
3,"1List - Simple Lists for TODO, Shopping, Movie...",1.3.1,0,11,28,42
4,2048,2.2,177,844,92,94
5,info.staticfree.android.twentyfourhour,0.4.2,1606,146,93,13
6,36C3 Schedule,1.54.0,2790,45,2725,472
7,3D Model Viewer,3.2.0,25044,61,158,204
8,A2DP Volume,2.13.0.4,9356,294,357,301
9,Aard 2,0.53,5508,71,429,140


In [115]:
all_candidate_links = 0
issues_total = 0
commits_total = 0
reviews_total = 0
links_total = 0

# iterate though our 10 app dirs
for dir_cnt, dir in enumerate(os.listdir('.')):
    if dir == 'test' or dir == 'stopwords.txt': continue
    # if dir_cnt == 1: break
    # dir = 'ics-openconnect'
    print('\n\nProject:', dir)

    # open strings.xml file and get gui keywords
    gui_keywords = []
    tree = None
    try:
        tree = ET.parse(dir+'/strings.xml')
    except:
        print('Cannot open strings.xml!')
    root = tree.getroot()
    for el in root.iter('string'):
        # print(el.attrib['name'])
        for gui in el.attrib['name'].split('_'):
            if gui not in gui_keywords:
                gui_keywords.append(gui)

    # discarding gui keywords in stopwords
    stop = []
    for sw in STOPWORDS:
        if sw not in gui_keywords:
            stop.append(sw)

    # open each files [review, issue, commit, commit_files]
    try:
        issue_df = pd.read_csv(os.path.join(dir, dir + '_issue.csv'), encoding = 'ISO-8859-1')
    except:
        print('Cannot open', dir+'_issue', '!!!')
    try:
        commit_df = pd.read_csv(os.path.join(dir, dir + '_commit.csv'), encoding = 'utf-8-sig', sep='@@@', engine='python')
    except:
        print('Cannot open', dir+'_commit', '!!!')
    try:
        review_df = pd.read_csv(os.path.join(dir, dir + '_review.csv'), encoding = 'ISO-8859-1')
    except:
        print('Cannot open', dir+'_review', '!!!')
    try:
        commit_files_df = pd.read_csv(os.path.join(dir, dir + '_commit_files.csv'), encoding = 'ISO-8859-1')
    except:
        print('Cannot open', dir+'commit_files !!!')

    review_df = review_df.loc[review_df['Class'] == "[{'term':'Informative'}]"]

    # remove stop words
    issue_df['Issue Name'] = issue_df['Issue Name'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    issue_df['Issue Description'] = issue_df['Issue Description'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))
    commit_df['Commit Message'] = commit_df['Commit Message'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
    review_df['User Review'] = review_df['User Review'].apply(lambda x: ' '.join([word for word in str(x).split() if word not in (stop)]))

    # normalize date format
    issue_df['Date Opened'] = pd.to_datetime(issue_df['Date Opened'], errors='coerce')
    issue_df['Date Closed'] = pd.to_datetime(issue_df['Date Closed'], errors='coerce')
    review_df['Date'] = pd.to_datetime(review_df['Date'], errors='coerce', utc=True)
    commit_df['Date'] = pd.to_datetime(commit_df['Date'], errors='coerce', utc=True)
    review_df['Date'] = review_df['Date'].dt.date
    commit_df['Date'] = commit_df['Date'].dt.date

    # Add file names to commit message
    commit_files_df['CommitID'] = commit_files_df['CommitID'].str[:7]
    for i, commit_file in commit_files_df.iterrows():
        for j, commit in commit_df.iterrows():
            if commit_file['CommitID'] == commit['Commit ID']:
                file_list = []
                for each_file in commit_file[' List of files'].split():
                    file_name = each_file.split('/')[-1]
                    try:
                        file_name = file_name.split('.')[-2].casefold()
                    except:
                        file_name = file_name.casefold()
                    file_list.append(file_name)
                commit['Commit Message'] += ' '.join(file_list)


    reviews_total += review_df.shape[0]
    issues_total += issue_df.shape[0]
    commits_total += commit_df.shape[0]
    print('informative reviews:', review_df.shape)
    print('issues:', issue_df.shape)
    print('commits:', commit_df.shape)


    # ISSUE and COMMIT EXTRACTOR
    # link candidate issues and commit to reviews (posted after review)
    review_and_issue = []
    review_and_commit = []
    for i, review in review_df.iterrows():
        for j, issue in issue_df.iterrows():
            if review['Date'] < issue['Date Opened']:
                review_and_issue.append([review, issue])
        for j, commit in commit_df.iterrows():
            if review['Date'] < commit['Date']:
                review_and_commit.append([review, commit])

    print('ISSUE EXTRACTOR:', len(review_and_issue))
    print('COMMIT EXTRACTOR:', len(review_and_commit))
    print()

    # LINK IDENTIFIER

    # links issues to reviews wrt similarity score
    threshold = 0.3
    r_i_header = review_and_issue[0][0].to_frame().T.columns.to_list() + review_and_issue[0][1].to_frame().T.columns.to_list()
    temp = []
    for i, r_i in enumerate(review_and_issue):
        a_review = r_i[0]['User Review'].split()
        a_review = [x.casefold() for x in a_review]
        a_issue = r_i[1]['Issue Name'].split() + str(r_i[1]['Issue Description']).split()
        a_issue = [x.casefold() for x in a_issue]

        if len(a_review) == 0 or len(a_issue) == 0:
            continue
        text_sim = len(set(a_review).intersection(set(a_issue))) / min(len(a_review), len(a_issue))

        # compute GUI bonus
        gui_ir = []
        for i_r in a_review:
            if i_r in gui_keywords:
                gui_ir.append(i_r)
        gui_is = []
        for i_s in a_issue:
            if i_s in gui_keywords:
                gui_is.append(i_s)
        union = len(set(a_review).union(set(a_issue)))
        gui_bonus = len(set(gui_ir).intersection(set(gui_is))) / union

        final_sim = text_sim * 0.5 + gui_bonus * 0.5
        if final_sim > threshold and len(a_review) > 3:
            link = list(pd.concat([r_i[0], r_i[1]]))
            temp.append(link)
            print(link)
            links_total += 1
        all_candidate_links += 1
    review_issue_linked = pd.DataFrame(temp, columns=r_i_header)


    # links commits to reviews
    r_c_header = review_and_commit[0][0].to_frame().T.columns.to_list() + review_and_commit[0][1].to_frame().T.columns.to_list()
    temp = []
    for i, r_c in enumerate(review_and_commit):
        a_review = r_c[0]['User Review'].split()
        a_review = [x.casefold() for x in a_review]
        a_commit = r_c[1]['Commit Message'].split()
        a_commit = [x.casefold() for x in a_commit]

        if len(a_review) == 0 or len(a_issue) == 0:
            continue
        text_sim = len(set(a_review).intersection(set(a_commit))) / min(len(a_review), len(a_commit))

        # compute GUI bonus
        gui_ir = []
        for i_r in a_review:
            if i_r in gui_keywords:
                gui_ir.append(i_r)
        gui_ic = []
        for i_c in a_commit:
            if i_c in gui_keywords:
                gui_ic.append(i_c)
        union = len(set(a_review).union(set(a_commit)))
        gui_bonus = len(set(gui_ir).intersection(set(gui_ic))) / union

        # compute the final similarity score
        final_sim = text_sim * 0.5 + gui_bonus  * 0.5
        if final_sim > threshold and len(a_review) > 3:
            link = list(pd.concat([r_c[0], r_c[1]]))
            temp.append(link)
            print(link)
            links_total += 1
        all_candidate_links += 1
    review_commit_linked = pd.DataFrame(temp, columns=r_c_header)
    review_commit_linked.to_csv('../results/'+dir+'_review2commit.csv')
    review_issue_linked.to_csv('../results/'+dir+'_review2issue.csv')


print('\n\nall_candidate_links', all_candidate_links)
print('total reviews:', reviews_total)
print('total issues:', issues_total)
print('total commits:', commits_total)
print('total links:', links_total)




Project: ca.rmen.android.frenchcalendar
informative reviews: (23, 5)
issues: (30, 8)
commits: (345, 5)
ISSUE EXTRACTOR: 392
COMMIT EXTRACTOR: 3532



Project: ics-openconnect
informative reviews: (216, 5)
issues: (64, 8)
commits: (935, 5)
ISSUE EXTRACTOR: 12686
COMMIT EXTRACTOR: 6505

['Connect with office VPN', "[{'term':'Informative'}]", 0.5, 5, datetime.date(2014, 2, 21), 23, 'Cannot connect to non-standard HTTPS', Timestamp('2017-09-07 00:00:00'), nan, 'peloy', Timestamp('2019-06-05 00:00:00'), 'dlenski', 'title: Cannot connect to non-standard HTTPS state: CLOSED author: peloy labels: comments: 7 assignees: projects: milestone: number: 23 -- My Cisco ASA configured to terminate SSL VPN connection 4443. Android OpenConnect (latest version from the Google Play store) not able to connect. The OpenConnect log the screenshot mentioned the ASA apparently returning the message "Invalid host entry. Please re-enter". OpenConnect 7.08 running Linux able to connect with no issues. On the AS

In [116]:
issue_df.head()

Unnamed: 0,Issue Number,Issue Name,Date Opened,Version,Issue Author,Date Closed,Closed By,Issue Description
0,1,[Feature] Profiles,2014-05-30,2.0,corcoran,NaT,Not closed,title: [Feature] Profiles state: OPEN author: ...
1,2,[Feature] Custom icon packs,2014-06-12,2.1,corcoran,2014-07-16,corcoran,title: [Feature] Custom icon packs state: CLOS...
2,3,[Feature] Time periods in-app,2014-06-16,2.1,corcoran,NaT,Not closed,title: [Feature] Time periods in-app state: OP...
3,4,[Feature] Pinned apps,2014-07-16,2.2,corcoran,NaT,Not closed,title: [Feature] Pinned apps state: CLOSED aut...
4,5,[Bug] Option to download icon pack disappears,2014-07-23,2.2,DarkDrek,NaT,Not closed,title: [Bug] Option to download icon pack disa...


In [None]:
review_df.head()

Unnamed: 0,User Review,Class,Version,Rating,Date
0,Wasnt I I downloaded pretty cool. I replace no...,[{'term':'Informative'}],2.0,5,2014-05-17
1,"After couple days hangar, I imagine not intell...",[{'term':'Informative'}],2.0,5,2014-05-18
2,A whitelist useful blacklist.,[{'term':'Informative'}],2.0,5,2014-05-29
4,All space notification area going to waste. No...,[{'term':'Informative'}],2.1,5,2014-07-06
5,"Great stylish app, low resource usage, virtual...",[{'term':'Informative'}],2.1,5,2014-07-06


In [None]:
commit_df.head()


Unnamed: 0,Commit ID,Commit Message,Author,Email,Date
0,879b544,Update README.mdreadme,Jeff Corcoran,jcorcoran+github@gmail.com,2019-09-25
1,ed4e9c1,Merge pull request #25 from rzabcik/master,Jeff Corcoran,jcorcoran+github@gmail.com,2019-09-25
2,f6c868a,Merge pull request #40 from Poussinou/patch-1,Jeff Corcoran,jcorcoran+github@gmail.com,2019-09-25
3,1.66E+07,Update README.md,Poussinou,jcorcoran+github@gmail.com,2019-09-21
4,3be6b4f,crash on launch caused by nullptr invalid icon...,Ryan Zabcik,jcorcoran+github@gmail.com,2017-07-15
