# Load and Save Data Sets Labeled by Multiple Human Annotators

In [1]:
import numpy as np
import pandas as pd

from scipy.io import arff
from scipy.stats import mode

from sklearn.preprocessing import LabelEncoder

## Two datasets of defect reports labeled by a crowd of annotators of unknown reliability

### Resource: https://www.sciencedirect.com/science/article/pii/S2352340918303226#ec0005

In [2]:
# Load the Mozilla defect reports; the first element of the loadarff result
# holds the records, which are flattened into a plain 2-D array.
data = arff.loadarff('../../data/reports-mozilla.arff')
df = pd.DataFrame(data[0]).values

# Columns 0-4 are used as the five annotator labels, everything after as features.
Y = df[:, :5]
X = df[:, 5:]

# Every label outside the three kept categories is collapsed into b'Other'.
kept = [b'Installability', b'Maintenance', b'Reliability']
replace_ids = ~np.isin(Y, kept)
Y[replace_ids] = b'Other'

# Encode the byte-string categories as integers, one annotator column at a time.
le = LabelEncoder()
le.fit([*kept, b'Other'])
Y = np.array([le.transform(annotator) for annotator in Y.T]).T

# Aggregated label: the most frequent annotation in each row.
y = mode(Y, axis=1).mode.ravel()

# Assemble the output table: features x_*, annotator labels y_*, aggregated label y.
feature_cols = {f'x_{k}': col for k, col in enumerate(X.T)}
label_cols = {f'y_{k}': col for k, col in enumerate(Y.T)}
df_new = pd.DataFrame({**feature_cols, **label_cols, 'y': y})
# Saving is disabled; uncomment the next line to write the table to disk.
#df_new.to_csv('../../data/reports-mozilla.csv', index=False)

In [3]:
# Load the Compendium defect reports; the first element of the loadarff result
# holds the records, which are flattened into a plain 2-D array.
data = arff.loadarff('../../data/reports-compendium.arff')
df = pd.DataFrame(data[0]).values

# Column 0 is skipped, columns 1-5 are used as the five annotator labels,
# and everything from column 6 onwards as features.
Y = df[:, 1:6]
X = df[:, 6:]

# Every label outside the three kept categories is collapsed into b'Other'.
kept = [b'Installability', b'Requirements', b'Usability']
replace_ids = ~np.isin(Y, kept)
Y[replace_ids] = b'Other'

# Encode the byte-string categories as integers, one annotator column at a time.
le = LabelEncoder()
le.fit([*kept, b'Other'])
Y = np.array([le.transform(annotator) for annotator in Y.T]).T

# Aggregated label: the most frequent annotation in each row.
y = mode(Y, axis=1).mode.ravel()

# Assemble the output table: features x_*, annotator labels y_*, aggregated label y.
feature_cols = {f'x_{k}': col for k, col in enumerate(X.T)}
label_cols = {f'y_{k}': col for k, col in enumerate(Y.T)}
df_new = pd.DataFrame({**feature_cols, **label_cols, 'y': y})
# Saving is disabled; uncomment the next line to write the table to disk.
#df_new.to_csv('../../data/reports-compendium.csv', index=False)

## Quality Assessment of Digital Colposcopies Data Set

### Resource: https://archive.ics.uci.edu/ml/datasets/Quality+Assessment+of+Digital+Colposcopies

In [4]:
# Load the colposcopy table and split it by column position:
# the first 62 columns are features, the last column is the target label,
# and the columns in between are the individual annotator labels.
df = pd.read_csv('../../data/medical.csv')
table = df.values
X = table[:, :62]
Y = table[:, 62:-1]
y = table[:, -1]

# Assemble the output table: features x_*, annotator labels y_*, target label y.
feature_cols = {f'x_{k}': col for k, col in enumerate(X.T)}
label_cols = {f'y_{k}': col for k, col in enumerate(Y.T)}
df_new = pd.DataFrame({**feature_cols, **label_cols, 'y': y})
# Saving is disabled. NOTE: the target path below is the same file that is read
# at the top of this cell, so enabling it overwrites the input.
#df_new.to_csv('../../data/medical.csv', index=False)
