In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('../Dataset/Dataset_Manual_Reproduction.csv', index_col=0)
df['Time for the Accepted Answer'] = df['Time for the Accepted Answer'].apply(lambda x: x.split(' ')[0])

In [4]:
print ('Total time taken to reproduce bugs: {} mins'.format(df['Time Taken'].astype(int).sum()))

Total time taken to reproduce bugs: 6712 mins


In [135]:
# Correlation between reproducible and code snippet present
df['Code Snippet Present?'] = df['Code Snippet Present?'].apply(lambda x: 1 if x == 'Yes' else 0)
df['Reproducible?'] = df['Reproducible?'].apply(lambda x: 1 if x == 'Yes' else 0)

df[['Code Snippet Present?', 'Reproducible?']].corr()

Unnamed: 0,Code Snippet Present?,Reproducible?
Code Snippet Present?,1.0,0.231869
Reproducible?,0.231869,1.0


In [136]:
# Correlation between reproducible and data description present
df['Data Description Present?'] = df['Data Description Present?'].apply(lambda x: 1 if x == 'Yes' else 0)
df[['Data Description Present?', 'Reproducible?']].corr()

Unnamed: 0,Data Description Present?,Reproducible?
Data Description Present?,1.0,0.3417
Reproducible?,0.3417,1.0


In [137]:
# Correlation between reproducible and system configuration
df['System Configuration Present?'] = df['System Configuration Present?'].apply(lambda x: 1 if x == 'Yes' else 0)
df[['System Configuration Present?', 'Reproducible?']].corr()

Unnamed: 0,System Configuration Present?,Reproducible?
System Configuration Present?,1.0,-0.180702
Reproducible?,-0.180702,1.0


In [138]:
# Get the Framework Distribution
df['Framework'].value_counts()

Framework
TF + Keras    68
PyTorch       34
Name: count, dtype: int64

In [139]:
df['Type of Bug'].value_counts()
df.groupby(['Type of Bug', 'Reproducible?']).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))

Type of Bug           Type of Bug           Reproducible?
API Bug               API Bug               0                 15.384615
                                            1                 84.615385
GPU Bug               GPU Bug               0                 57.142857
                                            1                 42.857143
Mixed Bug             Mixed Bug             1                100.000000
Model Bug             Model Bug             0                 13.043478
                                            1                 86.956522
Tensor and Input Bug  Tensor and Input Bug  0                 19.230769
                                            1                 80.769231
Training Bug          Training Bug          0                 10.344828
                                            1                 89.655172
dtype: float64

In [140]:
df['Architecture'].value_counts()
df.groupby(['Architecture', 'Reproducible?']).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))

Architecture            Architecture            Reproducible?
-                       -                       0                 21.739130
                                                1                 78.260870
Attention               Attention               1                100.000000
Autoencoder             Autoencoder             0                 50.000000
                                                1                 50.000000
BERT                    BERT                    1                100.000000
CNN                     CNN                     0                 18.181818
                                                1                 81.818182
DenseNet                DenseNet                0                100.000000
GAN                     GAN                     1                100.000000
Gaussian Mixture Model  Gaussian Mixture Model  1                100.000000
LR                      LR                      1                100.000000
LSTM                    LS

In [141]:
# Correlation between accepted answer and time taken
df[['Time for the Accepted Answer', 'Time Taken']].corr()

Unnamed: 0,Time for the Accepted Answer,Time Taken
Time for the Accepted Answer,1.0,-0.102317
Time Taken,-0.102317,1.0


### Analysis by Bug Type

In [142]:
df = df[df['Reproducible?'] == 1]
unique_bug_types = df['Type of Bug'].unique()

In [143]:
training_bugs = df[df['Type of Bug'] == 'Training Bug']
model_bugs = df[df['Type of Bug'] == 'Model Bug']
api_bugs = df[df['Type of Bug'] == 'API Bug']
tensor_bugs = df[df['Type of Bug'] == 'Tensor and Input Bug']
gpu_bugs = df[df['Type of Bug'] == 'GPU Bug']
mixed_bugs = df[df['Type of Bug'] == 'Mixed Bug']


In [144]:
print ('Unique architectures for Training Bugs: ', training_bugs['Architecture'].unique())
print ('Unique architectures for Model Bugs: ', model_bugs['Architecture'].unique())
print ('Unique architectures for API Bugs: ', api_bugs['Architecture'].unique())
print ('Unique architectures for Tensor and Input Bugs: ', tensor_bugs['Architecture'].unique())
print ('Unique architectures for GPU Bugs: ', gpu_bugs['Architecture'].unique())
print ('Unique architectures for Mixed Bugs: ', mixed_bugs['Architecture'].unique())

Unique architectures for Training Bugs:  ['MLP' 'CNN' '-' 'LSTM' 'ResNet' 'RCNN' 'Transfer Learning' 'Autoencoder']
Unique architectures for Model Bugs:  ['MLP' 'CNN' 'VGG16' 'Transformers' 'LSTM + MLP' 'Attention' 'BERT'
 'Gaussian Mixture Model']
Unique architectures for API Bugs:  ['-' 'MLP' 'GAN' 'Variational RNN' 'CNN' 'Transformers' 'VGG19']
Unique architectures for Tensor and Input Bugs:  ['CNN' 'MLP' '-' 'LR' 'ResNet' 'GAN' 'NLP']
Unique architectures for GPU Bugs:  ['-']
Unique architectures for Mixed Bugs:  ['-' 'LR' 'CNN']
