In [ ]:
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from src.eda_methods import show_barplot, load_data, load_data_all
from pandas.plotting import register_matplotlib_converters
from scipy.stats import boxcox, skew, shapiro, probplot, ttest_rel, ttest_ind
from sklearn import preprocessing

# lets pandas show up to 125 characters per column before truncating the text
pd.set_option('display.max_colwidth', 125)
# registers the pandas date/time converters with matplotlib for the plots below
register_matplotlib_converters()

In [ ]:
# IMPORTANT - Figure out how to correctly convert old hashes to new hashes

# project_name = 'zeppelin'
# try:
#     tmp =\
#         pd.read_csv(
#             'assets/data/{0}/new_{0}_commits.csv'.format(project_name),
#             nrows=None,
#             header=None
#         )
# except UnicodeDecodeError:
#     tmp =\
#         pd.read_csv(
#             'assets/data/{0}/new_{0}_commits.csv'.format(project_name),
#             nrows=None,
#             header=None,
#             encoding='utf-16'
#         )
# tmp = tmp.set_index(0).apply(pd.to_datetime, axis=1)
# tmp['dates_match'] = tmp.apply(lambda x: x.duplicated().sum(), axis=1).astype('bool')

# # finds
# grouped = tmp.groupby(2)
# g_index =\
#     [
#             x for x in grouped[1].apply(
#                 lambda x: pd.to_datetime(x[0], utc=True) 
#                 if (x.count() > 1).sum() else ''
#             ).values if x
#     ]
# tmp = tmp.reset_index().set_index(1)
# tmp.index = pd.to_datetime(tmp.index, utc=True, infer_datetime_format=True)

# transforms date column into datetime_index
# new_commits =\
#     pd.Series(
#         tmp[0].values,
#         index=pd.to_datetime(
#                 tmp[1].values,
#                 infer_datetime_format=True,
#                 utc=True
#             )
#     )
# new_commits.name = project_name

In [ ]:
# loads the four datasets used throughout the notebook.
# NOTE(review): judging by the later cells, `old`, `new` and `bic` hold commit
# hashes grouped by project on index level 0, and `cc` is a frame with the
# columns support_count, confidence and commit_hash - confirm in src.eda_methods
old, new, cc, bic = load_data_all()

In [ ]:
# reports, per project, how many BIC entries exist and how many of them are repeats
header = 'Total and duplicated bug introducing commits (BIC) for each project:\n\n\t\t\t(total, duplicated)'
print(header)

bic_by_project = bic.groupby(level=0)
# last expression of the cell, so the notebook renders the (total, duplicated) tuples
bic_by_project.apply(lambda hashes: (hashes.count(), hashes.duplicated().sum()))

In [ ]:
# builds `ubic`: the BIC entries with repeats removed inside each project
print('Total BICs after removal of duplicates:\n')

project_bic = bic.groupby(level=0)

# keeps the first occurrence of every hash within a project ...
deduplicated = project_bic.apply(lambda hashes: hashes.drop_duplicates(keep='first'))
# ... and removes the extra outer index level added by groupby.apply
ubic = deduplicated.droplevel(0, axis=0)

remaining_per_project = ubic.groupby(level=0, axis=0).count()
print(remaining_per_project)
# del bic

In [ ]:
# '''
# shows all conflicting 'created_at' commits
# proceed with manual investigation of how to solve this (HUGE ISSUE!!!)
# '''
# old.droplevel(0, axis=0)[old.droplevel(0, axis=0).index.duplicated(keep=False)]
# new.droplevel(0, axis=0)[new.droplevel(0, axis=0).index.duplicated(keep=False)]

In [ ]:
# new.reset_index().iloc[old.reset_index().index][0].values
'''
prints projects with different number of commits
'''
new_g = new.groupby(level=0)
old_g = old.groupby(level=0)

# walks both groupings in lockstep; one extra row in 'new' is expected, so only
# projects whose difference is not exactly one are printed, with the surplus
for (project, new_part), (_, old_part) in zip(new_g, old_g):
    # new_part = new_part[new_part.index.isin(old_part.index)]
    surplus = len(new_part) - len(old_part) - 1
    if surplus != 0:
        print(project, surplus)

In [ ]:
# '''
# ensures that only commits present in 'old' are considered
# '''
# new_g = new.groupby(level=0)
# old_g = old.groupby(level=0)
# tmp = []

# for ng, og in zip(new_g, old_g):
#     tmp.append(ng[1][ng[1].index.isin(og[1].index)])

# new = pd.concat(tmp)
# del tmp
# del new_g, old_g

In [ ]:
# removes the first row of every project group; the original note says this row
# is the latest commit and comes from the tool, not from the project itself
# NOTE(review): `new_g` is not rebuilt afterwards, so it still contains that row
without_first_row = new_g.apply(lambda commits: commits.iloc[1:])
new = without_first_row.droplevel(0)

In [ ]:
# pairs every old hash with the new hash found at the same row position
convert = pd.DataFrame()
convert['old_hash'] = old

new_rows = new.reset_index()
old_positions = old.reset_index().index
# column 0 is the unnamed hash column produced by reset_index()
convert['new_hash'] = new_rows.iloc[old_positions][0].values

# translates the unique BIC hashes into their counterparts in 'new'
# NOTE(review): recent pandas versions raise KeyError when a requested label is
# missing instead of returning NaN, so dropna() may never get to act - confirm
old_to_new = convert.set_index('old_hash').squeeze()
nbic = old_to_new[ubic.values].dropna().reset_index(drop=True)

del convert, old_to_new

In [ ]:
'''
selects relevant support count (>1) and confidence (>=0.5)
'''
# keeps the cochange entries with support_count above 1 and confidence of at least 0.5
is_relevant = (cc.support_count > 1) & (cc.confidence >= 0.5)
rcc = cc[is_relevant].reset_index(drop=True)

n_entries = len(cc)
n_relevant = len(rcc)
print(
    'Selects components with relevant cochange dependency\n',
    'Of {0} entries, {1} ({2:.2f}%) were found to have support_count > 1 and confidence >= 0.5'
    .format(n_entries, n_relevant, (100*n_relevant/n_entries))
)

# the unfiltered frame is not needed any more
del cc

In [ ]:
# scratch check of set difference: elements of the first set not in the second
{1, 2, 3, 4, 5, 6}.difference({1, 2, 3})

In [ ]:
import itertools

# operates with sets due to performance gains
set_rcc = set(itertools.chain.from_iterable(rcc['commit_hash'].values))
set_bic = set(bic.values)
# converted (new-hash) BICs, built once instead of on every aggregation call
set_nbic = set(nbic.values)


def _bic_share(commits):
    """Return the fraction of `commits` (a set of hashes) present in `set_nbic`.

    Returns 0.0 for an empty set, so a project that has no commits of the
    requested kind does not raise ZeroDivisionError.
    """
    return len(commits & set_nbic) / len(commits) if commits else 0.0


# The aggregators stay lambdas on purpose: pandas labels the resulting columns
# '<lambda_0>', '<lambda_1>', ... and the renames below rely on those labels.

# BICs among all commits of a project (old hashes); the ratio is over all rows
get_total_bic_count = lambda x: len(set_bic & set(x.values))
get_total_bic_percent = lambda x: len(set_bic & set(x.values)) / len(x)

# commits that take part in a relevant cochange, and the BICs among them
get_cochange_count = lambda x: len(set(x.values) & set_rcc)
get_cochange_bic_count = lambda x: len(set(x.values) & set_rcc & set_nbic)
get_cochange_bic_percent = lambda x: _bic_share(set(x.values) & set_rcc)

# commits without a relevant cochange, and the BICs among them.
# The ratio is now guarded by its own denominator (the no-cochange set); the
# previous version tested the cochange set instead, which could divide by zero
# and reported 0.0 for projects that simply had no cochange commits.
get_nocochange_count = lambda x: len(set(x.values) - set_rcc)
get_nocochange_bic_count = lambda x: len((set(x.values) - set_rcc) & set_nbic)
get_nocochange_bic_percent = lambda x: _bic_share(set(x.values) - set_rcc)

# applies count functions over each group (project)
# NOTE(review): `new_g` was created before the first row of each project was
# dropped from `new`, so these aggregations still see that row - confirm intent
total_bic_df =\
    old_g.agg(['count', get_total_bic_count, get_total_bic_percent])\
    .rename(columns={
        '<lambda_0>': 'bic_count', '<lambda_1>': 'bic_percent'
    })

cc_bic_df =\
    new_g.agg([get_cochange_count, get_cochange_bic_count, get_cochange_bic_percent])\
    .rename(columns={
        '<lambda_0>': 'cochange_count', '<lambda_1>': 'bic_count', '<lambda_2>': 'bic_percent'
    })

nocc_bic_df =\
    new_g.agg([get_nocochange_count, get_nocochange_bic_count, get_nocochange_bic_percent])\
    .rename(columns={
        '<lambda_0>': 'nocochange_count', '<lambda_1>': 'bic_count', '<lambda_2>': 'bic_percent'
    })

## Assumptions of the paired t-test
* The dependent variable (DV) must be continuous which is measured on an interval or ratio scale
* The DV should not contain any significant outliers
* The DV should be approximately normally distributed
    * Testing for normality needs to be conducted on the differences between the two conditions, not the raw values of each condition itself
    * If normality is violated, the test results can still be considered valid as long as the violation is not a major one

In [ ]:
'''
Assumption 2. Outliers
'''
# keeps, for each variable, only the projects whose value lies strictly within
# two standard deviations of that variable's mean
cc_percent = cc_bic_df['bic_percent']
mean = cc_percent.mean()
std = cc_percent.std()
cc_bic = cc_percent[(cc_percent > mean - 2*std) & (cc_percent < mean + 2*std)]

total_percent = total_bic_df['bic_percent']
mean = total_percent.mean()
std = total_percent.std()
total_bic = total_percent[(total_percent > mean - 2*std) & (total_percent < mean + 2*std)]

In [ ]:
# descriptive statistics of both filtered variables, printed one after the other
total_summary = total_bic.describe()
cc_summary = cc_bic.describe()
separator = '\n------------------------------------------------------\n'

print(
    '\t\t---Summary of Variables---\n\n',
    'Percentage of BICs in all commits:\n',
    total_summary,
    separator + '\nPercentage of BICs in commits with cochange dependency:\n',
    cc_summary,
)

In [ ]:
'''
test both samples for normality
'''
# runs the Shapiro-Wilk test on each sample first, then prints both results
total_normality = shapiro(total_bic.values)
cc_normality = shapiro(cc_bic.values)

print(
    '\t\t---Shapiro Wilk Normality Test - Individual Samples---\n',
    '\tAssume normality if p-value > 0.05\n\n' + '\t\tTotal BICs\n W-statistic\t\tp-value\n',
    total_normality,
    '\n\n\t\tCochange BICs\n W-statistic\t\tp-value\n',
    cc_normality
)

In [ ]:
'''
pairs the two variables
'''
# restricts each series to the projects that survived in the other one
present_in_cc = total_bic.index.isin(cc_bic.index)
total_bic = total_bic[present_in_cc]

present_in_total = cc_bic.index.isin(total_bic.index)
cc_bic = cc_bic[present_in_total]

In [ ]:
'''
Assumption 3. Normality of the differences
'''
# per-project difference between the two paired variables (aligned on the index)
bic_diff = total_bic - cc_bic
print(
    '\t\t---Shapiro Wilk Normality Test - Paired Samples---\n',
    '\tAssume normality if p-value > 0.05\n\n'
    # heading typo fixed: 'cocahnge' -> 'cochange'
    '\tDifference of BICs (total - cochange)\n W-statistic\t\tp-value\n',
    shapiro(bic_diff.values),
    '\n\n'
)
# histogram of the differences as a visual check
bic_diff.plot(kind='hist')
plt.show()

In [ ]:
# probability plot of the paired differences, drawn through matplotlib, as a
# visual complement to the Shapiro-Wilk test
plot = probplot(bic_diff, plot=plt)

### Normality can be assumed.

In [ ]:
# paired t-test between the two variables
# NOTE(review): assumes both series hold the same projects in the same order,
# which the pairing cell is meant to guarantee - confirm
ttest_rel(total_bic, cc_bic)

In [ ]:
# same outlier rule as before (strictly within two standard deviations of the
# mean), now applied to the cochange and the no-cochange percentages
cc_percent = cc_bic_df['bic_percent']
mean = cc_percent.mean()
std = cc_percent.std()
cc_bic = cc_percent[(cc_percent > mean - 2*std) & (cc_percent < mean + 2*std)]

nocc_percent = nocc_bic_df['bic_percent']
mean = nocc_percent.mean()
std = nocc_percent.std()
nocc_bic = nocc_percent[(nocc_percent > mean - 2*std) & (nocc_percent < mean + 2*std)]

In [ ]:
# descriptive statistics of the no-cochange and cochange variables
nocc_summary = nocc_bic.describe()
cc_summary = cc_bic.describe()
separator = '\n------------------------------------------------------\n'

print(
    '\t\t---Summary of Variables---\n\n',
    'Percentage of BICs in commits without cochange dependency:\n',
    nocc_summary,
    separator + '\nPercentage of BICs in commits with cochange dependency:\n',
    cc_summary,
)

In [ ]:
'''
test both samples for normality
'''
# runs the Shapiro-Wilk test on each sample first, then prints both results
cc_normality = shapiro(cc_bic.values)
nocc_normality = shapiro(nocc_bic.values)

print(
    '\t\t---Shapiro Wilk Normality Test - Individual Samples---\n',
    '\tAssume normality if p-value > 0.05\n\n' + '\t\tCochange BICs\n W-statistic\t\tp-value\n',
    cc_normality,
    '\n\n\t\tNo-Cochange BICs\n W-statistic\t\tp-value\n',
    nocc_normality
)

In [ ]:
# probability plot of the no-cochange percentages (the cochange one is disabled)
# plot1 = probplot(cc_bic, plot=plt)
plot2 = probplot(nocc_bic, plot=plt)

In [ ]:
# independent two-sample t-test between the cochange and no-cochange percentages
# NOTE(review): called with scipy's defaults; the equal-variance assumption listed
# in the notes below is not checked in any visible cell - confirm it holds
ttest_ind(cc_bic, nocc_bic)

## Assumptions of the unpaired t-test

* Assumption of independence
* Assumption of normality: the dependent variable should be continuous and approximately normally distributed.
* Assumption of homogeneity of variance: The variances of the dependent variable should be equal.



In [ ]:
# takes the unfiltered percentages of both variables and labels each series
total_bic = total_bic_df['bic_percent']
cc_bic = cc_bic_df['bic_percent']
total_bic.name = 'all_commits'
cc_bic.name = 'cochange_commit'

# stacks both series into one long series; the outer index level 'commits'
# tells which variable a row belongs to, the inner level 'projects' the project
bic_percent = pd.concat(
    [total_bic, cc_bic],
    keys=['all', 'cochange'],
    names=['commits', 'projects'],
    axis=0,
    sort=False,
)

In [ ]:
# moves the 'commits' index level into a regular column ...
bic_percent = bic_percent.reset_index(level=0)
# ... and gives the value column (labelled 0) a readable name
bic_percent = bic_percent.rename(columns={0: 'bic_percent'})
bic_percent

In [ ]:
# white theme with a fully transparent axes background (RGBA alpha of 0)
sns.set(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})


pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
# one short, wide facet row per value of the 'commits' column
g = sns.FacetGrid(bic_percent, row='commits', hue='commits', aspect=15, height=.5, palette=pal)

# per facet: a filled density curve, a white outline over it, and a line at y=0
g.map(sns.kdeplot, 'bic_percent', clip_on=False, shade=True, alpha=1, lw=1.5, bw=.2)
g.map(sns.kdeplot, 'bic_percent', clip_on=False, color="w", lw=2, bw=.2)
g.map(plt.axhline, y=0, lw=2, clip_on=True)

# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    """Write `label` in bold, in `color`, near the left edge of the current axes.

    The position (0, .2) is given in axes coordinates. `x` is accepted but not
    used; the function is handed to `g.map`, which supplies the arguments.
    """
    current_axes = plt.gca()
    current_axes.text(
        0,
        .2,
        label,
        fontweight="bold",
        color=color,
        ha="left",
        va="center",
        transform=current_axes.transAxes,
    )


# writes the facet label inside every row
g.map(label, 'bic_percent')

# Set the subplots to overlap
g.fig.subplots_adjust(hspace=-.25)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[])
g.despine(bottom=True, left=True)

plt.xlabel('Kernel density estimation')

# NOTE: a bare `plt.tight_layout` (without call parentheses) used to sit here.
# It never ran, so it was removed instead of being turned into a call: running
# tight_layout() at this point would recompute the subplot spacing and would
# most likely undo the negative hspace that makes the rows overlap.
plt.show()

# sns.violinplot(
#     ax=ax,
#     # data=bic_percent,
#     x=bic_percent['commits'],
#     y=bic_percent['bic_percent'],
#     # hue=bic_percent['commits'],
#     inner="quartile",
#     split=True,
# )

In [ ]:
sns.set(style="white")
fig, ax = plt.subplots(1, 1, figsize=(20, 6))

# The counts are assembled here so that the cell also works when the notebook is
# run top to bottom; it previously relied on `bic_agg`, which is only created in
# a later cell. rename() returns new Series, so the source frames stay untouched.
commit_counts = pd.concat(
    [
        total_bic_df['count'],
        total_bic_df['bic_count'].rename('BIC count in all commits'),
        cc_bic_df['bic_count'].rename('BIC count in commits with cochange dependency'),
    ],
    axis=1,
)

# `logy=True` is the pandas option for a logarithmic y axis; the former
# `logscale=True` is not a plot option and was passed on to the bar artists
commit_counts.plot(kind='bar', ax=ax, logy=True)


In [ ]:
# BIC counts of both variables side by side, one row per project.
# rename() returns a new Series, so the columns of total_bic_df / cc_bic_df are
# not relabelled as a side effect (the previous code set `.name` on them).
total_bic_count = total_bic_df['bic_count'].rename('BIC count in all commits')

# label typo fixed: 'cocahnge' -> 'cochange'
cc_bic_count = cc_bic_df['bic_count'].rename('BIC count in commits with cochange dependency')

bic_agg = pd.concat([total_bic_count, cc_bic_count], axis=1)

In [ ]:
# def transform_skew(df: pd.DataFrame) -> pd.DataFrame:
#     '''
#     Transforms data with Boxcox transformation
#     Keeps the transformation only if it is less skewed than the original
#     '''
#     data = df.values
#     posdata = data[data > 0]
#     # posdata = eda.data[eda.data[feat]>0][feat]

#     x, lmbda = boxcox(posdata, lmbda=None)
    
#     transform = np.empty_like(data)
#     transform[data > 0] = x
#     transform[data == 0] = -1/lmbda

#     if abs(skew(transform)) < abs(skew(data)):
#         return transform

#     else:
#         return data

In [ ]:
# '''
# standardizes (and transforms skewness)
# '''
# print('> performs standardization on both variables...')
# std_cc_bic = preprocessing.scale(cc_bic)
# std_total_bic = preprocessing.scale(total_bic)

In [ ]:
# print(
#     '\t\t---Shapiro Wilk Normality Test - Individual Samples---\n',
#     'Assume normality if p-value > 0.05\n\n'
#     '\t\tTotal BICs\n W-statistic\t\tp-value\n',
#     shapiro(std_total_bic),
#     '\n\n\t\tCochange BICs\n W-statistic\t\tp-value\n',
#     shapiro(std_cc_bic)
# )

In [ ]:
# bic_diff = std_total_bic - std_cc_bic
# print(
#     '\t\t---Shapiro Wilk Normality Test---\n',
#     'Assume normality if p-value > 0.05\n\n'
#     '\t\tDifference of BICs (total - cocahnge)\n W-statistic\t\tp-value\n',
#     shapiro(bic_diff),
#     '\n\n'
# )
# print(pd.Series(bic_diff).plot(kind='hist'))

In [ ]:
# set(new.values) & set_rcc & set(nbic.values)
# new_g.apply(lambda x: len(set(x.values) & set(set_rcc) & set(nbic.values)))