# Clean SLF4J Data

In [1]:
import numpy as np
import pandas as pd
import warnings

## Load Dataframe

In [2]:
# Load the raw upgrades dataset from its CSV file and display it
df = pd.read_csv(filepath_or_buffer='../src/main/resources/slf4j-upgrades.csv')
df

Unnamed: 0,l1_group_id,l1_artifact_id,l1_version,l1_release_date,l2_group_id,l2_artifact_id,l2_version,l2_release_date,c1_group_id,c1_artifact_id,c1_version,c1_release_date,c2_group_id,c2_artifact_id,c2_version,c2_release_date
0,org.slf4j,slf4j-api,1.6.1,2010-07-05T20:29:21Z[GMT],org.slf4j,slf4j-api,1.6.2,2011-08-18T22:13:03Z[GMT],cn.eova,eova,1.5.0,2016-12-20T19:53:17Z[GMT],cn.eova,eova,1.6-beta1,2017-05-25T14:48:55Z[GMT]
1,org.slf4j,slf4j-api,1.6.1,2010-07-05T20:29:21Z[GMT],org.slf4j,slf4j-api,1.6.2,2011-08-18T22:13:03Z[GMT],org.sonatype.sisu,sisu-inject-bean,2.2.3,2011-06-08T20:18:37Z[GMT],org.sonatype.sisu,sisu-inject-bean,2.3.0,2011-09-13T04:13:25Z[GMT]
2,org.slf4j,slf4j-api,1.6.1,2010-07-05T20:29:21Z[GMT],org.slf4j,slf4j-api,1.6.2,2011-08-18T22:13:03Z[GMT],org.sonatype.sisu,sisu-guice,3.0.3,2011-05-11T13:45:19Z[GMT],org.sonatype.sisu,sisu-guice,3.1.0,2011-09-03T00:01:30Z[GMT]
3,org.slf4j,slf4j-api,1.6.1,2010-07-05T20:29:21Z[GMT],org.slf4j,slf4j-api,1.6.2,2011-08-18T22:13:03Z[GMT],org.sonatype.sisu.inject,guice-plexus-lifecycles,2.2.3,2011-06-08T20:22:49Z[GMT],org.sonatype.sisu.inject,guice-plexus-lifecycles,2.3.0,2011-09-13T04:15:26Z[GMT]
4,org.slf4j,slf4j-api,1.6.1,2010-07-05T20:29:21Z[GMT],org.slf4j,slf4j-api,1.6.2,2011-08-18T22:13:03Z[GMT],org.sonatype.sisu.inject,guice-plexus-binders,2.2.3,2011-06-08T20:22:08Z[GMT],org.sonatype.sisu.inject,guice-plexus-binders,2.3.0,2011-09-13T04:15:04Z[GMT]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12936,org.slf4j,slf4j-api,1.7.18,2016-02-26T18:51:08Z[GMT],org.slf4j,slf4j-api,1.7.25,2017-03-16T16:36:32Z[GMT],com.avides.spring,spring-enricher,1.0.1.RELEASE,2017-11-22T15:15:13Z[GMT],com.avides.spring,spring-enricher,1.0.2.RELEASE,2017-11-27T00:27:09Z[GMT]
12937,org.slf4j,slf4j-api,1.7.+,2016-08-10T15:08:35Z[GMT],org.slf4j,slf4j-api,1.7.7,2014-04-04T11:12:47Z[GMT],org.nickelproject,nickel,0.0.4,2014-10-14T05:19:53Z[GMT],org.nickelproject,nickel,0.0.5,2014-12-13T06:35:52Z[GMT]
12938,org.slf4j,slf4j-api,1.8.0-alpha0,2017-04-07T09:46:08Z[GMT],org.slf4j,slf4j-api,1.8.0-alpha1,2017-04-13T13:43:50Z[GMT],com.netflix.evcache,evcache-client,4.82.0,2017-04-12T17:45:26Z[GMT],com.netflix.evcache,evcache-client,4.83.0,2017-04-14T02:22:43Z[GMT]
12939,org.slf4j,slf4j-api,1.8.0-alpha1,2017-04-13T13:43:50Z[GMT],org.slf4j,slf4j-api,1.8.0-alpha2,2017-04-25T22:11:59Z[GMT],com.netflix.falcor,falcor-model,0.80.0,2017-04-20T07:23:41Z[GMT],com.netflix.falcor,falcor-model,0.81.0,2017-05-01T23:23:04Z[GMT]


## Clean Dataset

### Remove duplicate client upgrades
This is a known issue in the MDG. We should get rid of these faulty cases to avoid inconsistencies.

In [3]:
# Project the dataframe onto the columns that identify a client upgrade
# (client group id, client artifact id, and the two client version columns)
client_upgrades_all = df[['c1_group_id', 'c1_artifact_id', 'c1_version', 'c2_version']]
len_cua = client_upgrades_all.shape[0]
print(f'Client upgrades: {len_cua}')

Client upgrades: 12941


In [4]:
# Flag every client upgrade that occurs more than once;
# keep=False marks all occurrences, not only the repeated ones
dup_mask = client_upgrades_all.duplicated(keep=False)
client_upgrades_dup = client_upgrades_all.loc[dup_mask]
len_cud = client_upgrades_dup.shape[0]
print(f'Duplicate client upgrades: {len_cud}')

Duplicate client upgrades: 401


In [5]:
# Unique client upgrades, computed as an anti-join: the outer merge tags each
# row's origin in the `_merge` column, and rows tagged 'left_only' are the
# ones that have no counterpart among the duplicates
merged_upgrades = client_upgrades_all.merge(client_upgrades_dup, how='outer', indicator=True)
client_upgrades = merged_upgrades.loc[merged_upgrades['_merge'] == 'left_only']
len_cu = client_upgrades.shape[0]
print(f'Unique client upgrades: {len_cu}')

Unique client upgrades: 12540


In [6]:
# Sanity check: all upgrades minus the duplicated ones must equal the
# number of unique upgrades obtained from the merge
assert len_cu == (len_cua - len_cud), f'The number of unique upgrades ({len_cu}) does not match expected value ({len_cua - len_cud})'

In [7]:
# Remove duplicates from dataframe.
# Rows are filtered by index membership instead of df.drop(labels): the result
# is the same on the first run, but re-running this cell is a harmless no-op
# rather than a KeyError for labels that were already removed.
df = df[~df.index.isin(client_upgrades_dup.index)]

### Remove non-semver compliant library releases
Just keep semver (semantic versioning) compliant library releases, that is, releases whose version number has the form $X.Y.Z-P$, where:
- $X$ represents the **major** number;
- $Y$ represents the **minor** number;
- $Z$ represents the **patch** number (optional); and
- $P$ represents the **pre-release** label (optional).

In [8]:
# Export the distinct library upgrades (library group id, artifact id and the
# two library version columns) to a CSV file
library_upgrades = df[['l1_group_id', 'l1_artifact_id', 'l1_version', 'l2_version']].drop_duplicates()
library_upgrades.to_csv('../src/main/resources/c.csv')

In [9]:
# Defining semver regex: MAJOR.MINOR, optionally followed by .PATCH, which may
# in turn be followed by a -PRERELEASE label.
# The groups are non-capturing, (?:...), so pandas' str.contains() does not
# emit its "pattern has match groups" UserWarning. Matching is unchanged, and
# there is no longer any need for a blanket warnings.filterwarnings('ignore'),
# which would also hide every unrelated warning for the rest of the session.
semver_regex = r'^\d+\.\d+(?:\.\d+(?:-.+)?)?$'

In [10]:
# Non-semver compliant upgrades: at least one of the two library versions
# does not match the semver pattern
l1_is_semver = df['l1_version'].str.contains(semver_regex)
l2_is_semver = df['l2_version'].str.contains(semver_regex)
len_nscu = len(df[~l1_is_semver | ~l2_is_semver])
print(f'Non-semver compliant upgrades: {len_nscu}')

Non-semver compliant upgrades: 35


In [11]:
# Semver-compliant upgrades: both library versions match the semver pattern
l1_is_semver = df['l1_version'].str.contains(semver_regex)
l2_is_semver = df['l2_version'].str.contains(semver_regex)
len_scu = len(df[l1_is_semver & l2_is_semver])
print(f'Semver-compliant upgrades: {len_scu}')

Semver-compliant upgrades: 12505


In [12]:
# Sanity check: compliant and non-compliant upgrades must add up to the
# total number of rows in the dataframe
assert len_scu == (len(df) - len_nscu), f'The number of semver-compliant upgrades ({len_scu}) does not match expected value ({len(df) - len_nscu})'

In [13]:
# Keep only the upgrades whose two library versions both match the semver pattern
l1_is_semver = df['l1_version'].str.contains(semver_regex)
l2_is_semver = df['l2_version'].str.contains(semver_regex)
df = df[l1_is_semver & l2_is_semver]

## Write Dataset

In [14]:
# Persist the cleaned dataframe as a CSV file
df.to_csv(path_or_buf='../src/main/resources/clean-slf4j-upgrades.csv')

In [15]:
# Distinct (group id, artifact id) pairs of the clients present in the dataframe
df[['c1_group_id', 'c1_artifact_id']].drop_duplicates()

Unnamed: 0,c1_group_id,c1_artifact_id
0,cn.eova,eova
1,org.sonatype.sisu,sisu-inject-bean
2,org.sonatype.sisu,sisu-guice
3,org.sonatype.sisu.inject,guice-plexus-lifecycles
4,org.sonatype.sisu.inject,guice-plexus-binders
...,...,...
12914,de.kaufhof,ha-jobs_2.11
12916,cz.cvut.kbss.jopa,jopa-api
12917,cz.cvut.kbss.jopa,jopa-all
12933,com.github.mike10004,xvfb-manager
