# Clean SLF4J Data

In [1]:
import numpy as np
import pandas as pd
import warnings

## Load Dataset
We load the Maven co-upgrades dataset from a CSV file. The file must have the following headers:

- **l_group_id:** group ID of the library.
- **l_artifact_id:** artifact ID of the library.
- **l_version1:** version of the old release of the library.
- **l_version2:** version of the new release of the library.
- **c_group_id:** group ID of the client.
- **c_artifact_id:** artifact ID of the client.
- **c_version1:** version of the old release of the client.
- **c_version2:** version of the new release of the client.


The file has been exported from the Neo4j Desktop app after executing the following Cypher query:

```
MATCH (c1:Artifact)-[:DEPENDS_ON]->(l1:Artifact),
    (l1:Artifact)-[:NEXT*]->(l2:Artifact),
    (c2:Artifact)-[:DEPENDS_ON]->(l2:Artifact),
    (c1:Artifact)-[:NEXT]->(c2:Artifact)
WHERE l1.groupID = 'org.slf4j' 
    AND l1.artifact = 'slf4j-api'
    AND c1.groupID <> l1.groupID
RETURN DISTINCT l1.groupID as l_group_id, 
    l1.artifact as l_artifact_id,
    l1.version as l_version1,
    l2.version as l_version2,
    c1.groupID as c_group_id,
    c1.artifact as c_artifact_id,
    c1.version as c_version1,
    c2.version as c_version2
```

In [2]:
# Load the co-upgrades dataset from its CSV export and display the resulting frame
dataset_path = '../src/main/resources/slf4j-coupgrades.csv'
df = pd.read_csv(dataset_path)
df

Unnamed: 0,l_group_id,l_artifact_id,l_version1,l_version2,c_group_id,c_artifact_id,c_version1,c_version2
0,org.slf4j,slf4j-api,1.6.1,1.6.2,cn.eova,eova,1.5.0,1.6-beta1
1,org.slf4j,slf4j-api,1.6.1,1.6.2,org.sonatype.sisu,sisu-inject-bean,2.2.3,2.3.0
2,org.slf4j,slf4j-api,1.6.1,1.6.2,org.sonatype.sisu,sisu-guice,3.0.3,3.1.0
3,org.slf4j,slf4j-api,1.6.1,1.6.2,org.sonatype.sisu.inject,guice-plexus-lifecycles,2.2.3,2.3.0
4,org.slf4j,slf4j-api,1.6.1,1.6.2,org.sonatype.sisu.inject,guice-plexus-binders,2.2.3,2.3.0
...,...,...,...,...,...,...,...,...
12936,org.slf4j,slf4j-api,1.7.18,1.7.25,com.avides.spring,spring-enricher,1.0.1.RELEASE,1.0.2.RELEASE
12937,org.slf4j,slf4j-api,1.7.+,1.7.7,org.nickelproject,nickel,0.0.4,0.0.5
12938,org.slf4j,slf4j-api,1.8.0-alpha0,1.8.0-alpha1,com.netflix.evcache,evcache-client,4.82.0,4.83.0
12939,org.slf4j,slf4j-api,1.8.0-alpha1,1.8.0-alpha2,com.netflix.falcor,falcor-model,0.80.0,0.81.0


## Clean Dataset

### Remove duplicate client upgrades
Some client upgrades appear in more than one row of the dataset. This is a known issue in the MDG. We remove every row involved in such a duplicate to avoid inconsistencies.

In [3]:
# Client upgrades: project the dataframe onto the four client upgrade columns
client_upgrade_cols = ['c_group_id', 'c_artifact_id', 'c_version1', 'c_version2']
client_upgrades_all = df[client_upgrade_cols]
len_cua = client_upgrades_all.shape[0]
print(f'Client upgrades: {len_cua}')

Client upgrades: 12941


In [4]:
# Duplicate client upgrades: keep=False flags every occurrence of a repeated
# client upgrade, not only the second and later ones
is_duplicated = client_upgrades_all.duplicated(keep=False)
client_upgrades_dup = client_upgrades_all.loc[is_duplicated]
len_cud = client_upgrades_dup.shape[0]
print(f'Duplicate client upgrades: {len_cud}')

Duplicate client upgrades: 401


In [5]:
# Unique client upgrades: rows whose (group, artifact, old version, new version)
# combination occurs exactly once. keep=False discards every member of a
# duplicated group, so the result is the complement of the duplicated rows.
# This replaces an outer merge + `_merge == 'left_only'` anti-join, which paired
# each duplicated row with all of its copies, reset the index and added a
# `_merge` column, only to reach the same set of rows.
client_upgrades = client_upgrades_all.drop_duplicates(keep=False)
len_cu = len(client_upgrades)
print(f'Unique client upgrades: {len_cu}')

Unique client upgrades: 12540


In [6]:
# Sanity check: removing every duplicated row from the full set must leave
# exactly the unique client upgrades
expected_len_cu = len_cua - len_cud
assert expected_len_cu == len_cu, f'The number of unique upgrades ({len_cu}) does not match expected value ({len_cua - len_cud})'

In [7]:
# Remove every row involved in a duplicated client upgrade.
# A boolean mask on the index is used instead of `DataFrame.drop` so the cell
# can be re-run safely: `drop` raises a KeyError once the labels have already
# been removed, whereas the mask simply selects the same rows again.
df = df[~df.index.isin(client_upgrades_dup.index)]

### Remove non-semver compliant library releases
Keep only semver (semantic versioning) compliant library releases—that is, releases whose version number has the form $X.Y.Z-P$, where:
- $X$ represents the **major** number;
- $Y$ represents the **minor** number (optional);
- $Z$ represents the **patch** number (optional), and;
- $P$ represents the **pre-release** label (optional).

In [8]:
# Export the distinct library upgrades (group, artifact, old version, new version) to CSV
library_upgrade_cols = ['l_group_id', 'l_artifact_id', 'l_version1', 'l_version2']
library_upgrades = df[library_upgrade_cols].drop_duplicates()
library_upgrades.to_csv('../src/main/resources/c.csv')

In [9]:
# Defining semver regex: MAJOR[.MINOR[.PATCH[-PRERELEASE]]]
# The groups are non-capturing because the pattern is only used for boolean
# matching with `Series.str.contains`. pandas emits a UserWarning when such a
# pattern contains capturing groups; avoiding them removes the warning at its
# source, so no blanket `warnings.filterwarnings('ignore')` is needed and
# unrelated warnings stay visible.
semver_regex = r'^\d+(?:\.\d+(?:\.\d+(?:-.+)?)?)?$'

In [10]:
# Count the upgrades in which at least one library version fails the semver pattern
v1_not_semver = ~df['l_version1'].str.contains(semver_regex)
v2_not_semver = ~df['l_version2'].str.contains(semver_regex)
len_nscu = len(df[v1_not_semver | v2_not_semver])
print(f'Non-semver compliant upgrades: {len_nscu}')

Non-semver compliant upgrades: 35


In [11]:
# Count the upgrades in which both library versions match the semver pattern
v1_is_semver = df['l_version1'].str.contains(semver_regex)
v2_is_semver = df['l_version2'].str.contains(semver_regex)
len_scu = len(df[v1_is_semver & v2_is_semver])
print(f'Semver-compliant upgrades: {len_scu}')

Semver-compliant upgrades: 12505


In [12]:
# Sanity check: compliant and non-compliant upgrades must add up to the whole dataframe
expected_len_scu = len(df) - len_nscu
assert expected_len_scu == len_scu, f'The number of semver-compliant upgrades ({len_scu}) does not match expected value ({len(df) - len_nscu})'

In [13]:
# Keep only the upgrades whose old and new library versions both match the semver pattern
semver_compliant = (
    df['l_version1'].str.contains(semver_regex)
    & df['l_version2'].str.contains(semver_regex)
)
df = df[semver_compliant]

## Write Dataset

In [14]:
# Persist the cleaned dataset as CSV; with pandas' default settings the
# dataframe index is written as the first column
clean_dataset_path = '../src/main/resources/clean-slf4j-coupgrades.csv'
df.to_csv(clean_dataset_path)

In [15]:
# Unique clients: distinct (group ID, artifact ID) pairs in the current dataframe
df[['c_group_id', 'c_artifact_id']].drop_duplicates()

Unnamed: 0,c_group_id,c_artifact_id
0,cn.eova,eova
1,org.sonatype.sisu,sisu-inject-bean
2,org.sonatype.sisu,sisu-guice
3,org.sonatype.sisu.inject,guice-plexus-lifecycles
4,org.sonatype.sisu.inject,guice-plexus-binders
...,...,...
12914,de.kaufhof,ha-jobs_2.11
12916,cz.cvut.kbss.jopa,jopa-api
12917,cz.cvut.kbss.jopa,jopa-all
12933,com.github.mike10004,xvfb-manager
