In [None]:
# Used to access the toolkit modules in this directory
import toolkit

### First, we gather the dataset. This is a history of static code metrics (Java and indentation) and change metrics for our project.

In [None]:
#https://github.com/jacobgarcia/linked-lists.git

In [None]:
# Root folder under which the linked-lists study data is gathered and stored
rootDirectory = '../dataSets/linkedListsStudy/'

# Gather the time metrics for the linked-lists repository; only the 'java'
# metric family is requested for this data set.
# NOTE(review): the positional arguments look like (root folder, clone URL,
# local checkout path, file patterns, metric families) - confirm against
# toolkit.data.gatherTimeMetrics.
metricsDataList = toolkit.data.gatherTimeMetrics(
    rootDirectory,
    'https://github.com/jacobgarcia/linked-lists.git',
    rootDirectory + 'linked-lists/',
    '*/*.java *.java',
    ['java'],
    skipEvery=5)


In [None]:
# Show the gathered linked-lists metrics as this cell's output
metricsDataList

In [None]:
# Root folder under which the Spring Framework study data is gathered and stored
rootDirectory = '../dataSets/springFrameworkStudy/'

# Call gatherTimeMetrics and measure Java, Indent and Change metrics
# on .java files from the git project's repository.
# NOTE(review): skipEvery presumably controls how sparsely the history is
# sampled (an earlier draft of this cell used skipEvery=1000) - confirm
# against toolkit.data.gatherTimeMetrics.
metricsDataSpring = toolkit.data.gatherTimeMetrics(
    rootDirectory,
    'https://github.com/spring-projects/spring-framework.git',
    rootDirectory + 'spring-framework/',
    '*/*.java *.java',
    ['indent', 'java'],
    skipEvery=50)

# Every later cell reads the data set through the name `metricsData`.
# Bind it here so the notebook runs top-to-bottom on a fresh kernel
# instead of failing with a NameError.
metricsData = metricsDataSpring

### How many points in time did we sample?

In [None]:
# Number of sampled time points, followed by the whole gathered structure
print(metricsData['times'])
print(metricsData)

### How many features and samples are in our dataset?

In [None]:
# Dimensions of the measured data table
print(metricsData['data'].shape)

### How many unique source files were measured?

In [None]:
# Count the distinct values in the 'entity' column
print(metricsData['data']['entity'].nunique())

In [None]:
# Preview the first rows of the data set.
# head has to be *called*: printing the bare attribute shows the bound
# method object rather than a short preview.
print(metricsData['data'].head())

### Let's see what affects the net churn of files 
### Which types of files have net churn above and below the mean net churn?

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Two scikit-learn decision tree classifiers, both using the 'entropy'
# (information gain) split criterion: the main model may grow up to
# 8 leaf nodes, the simpler one is capped at 4.
modelInstance = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=8)
modelSimpler = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=4)

# Model 'netchurn' with 2 folds and visualize the outcome
churnModel = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    metricsData['data'],
    2,
    'netchurn',
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### Some observations:
- The model has very good Precision, Recall and F1-Score: net churn above/below the mean is classified very well by this model
- ROC area under curve is very high: very little compromise between false negative rate and false positive rate
- The model says that the features influencing net churn are (strongest to weakest):
    - Number of lines added
    - Number of lines deleted    
- The 'net churn below the mean' class is over-represented in our data (3 times as many samples as the other class)
    - However, the model still performs well without any steps taken to address class imbalance (e.g. under/over-sampling)
- Interpretation of the visualized decision tree is straightforward:
    - 63% of samples were files with less than 34 lines added 
        - These samples had net churn less than the mean
        - Some of these may be very stable files (over the history of the project)
    - The files with net churn greater than the mean had more than 108 lines added
    - Within this group, there are several subgroups with varying levels of churn

### In the above step, we only performed 2-fold cross validation (1 training set, 1 test set)

### How does this approach perform with more cross-validation folds in time?

In [None]:
# The data is divided into 5 equally-sized groups; cross-validation is then
# performed while these groups are gradually added to the training set,
# i.e. the train-test splits are with groups of size:
# 1-4, 2-3, 3-2, 4-1

# The decision trees are not visualized here to save space
# (pass visualize=True, as in the earlier cell, to show them)
folds = 5
churnModelMoreFolds = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    metricsData['data'],
    folds,
    'netchurn',
    modelInstance,
    modelSimpler,
    scoreOnly=False)

### Each of the subsets still exhibits class imbalance (but not with the same ratio)
### In particular, the 2nd train-test split has the most balanced classes (2:1) among the four splits

### We still see very good performance, and the same features are important throughout
### How far can we go? Let's cross-validate on every sampled time!

In [None]:
# Use one fold per sampled time point
folds = metricsData['times'] # This is an attribute of the measured data set: the number of time points measured
# The large cross-validation is left disabled; uncomment the call below to run it.
# NOTE(review): unlike the other calls in this notebook, this one does not pass
# modelSimpler - confirm the makeAndUpdateModel signature before enabling it.
#churnModelMoreFolds = toolkit.refinement.makeAndUpdateModel(rootDirectory, metricsData['data'], folds, 'netchurn', modelInstance, scoreOnly=False) 

### The results of this step are omitted to save space. However, the large cross-validation can be run to see them.

### The individual data sets used for training and testing are quite small and imbalanced.

### Many of the same relationships still show up.

### Why is 'added' a much more important factor than 'deleted'? 

In [None]:
# Summary statistics of net churn, printed in this order:
# mean, variance, standard deviation, maximum, minimum
netChurn = metricsData['data']['netchurn']
for statistic in (netChurn.mean, netChurn.var, netChurn.std, netChurn.max, netChurn.min):
    print(statistic())

### This codebase is *growing* in general (more added than deleted)

### Some files must experience more churn than others. We know from some of the motivating literature that defects can be correlated with large pre-release churn.

### Let's make some categories of binned churn data and classify them 

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Labels for the five bins of the net churn response
churnBinnedCategories = ['churnLow', 'churnMedium', 'churnHigh', 'churnHigher', 'churnHighest']
dataSetUpdated = toolkit.utilities.addBinnedResponseCategory(
    metricsData['data'],
    'netchurn',
    churnBinnedCategories)

# A fresh 8-leaf entropy tree; modelSimpler is reused from the earlier cell
modelInstance = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=8)
churnModelCategories = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    dataSetUpdated,
    2,
    churnBinnedCategories,
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### Now we're seeing something interesting. The vast majority of the files exhibit very low amounts of churn. A select few files receive most of the lines added/deleted. Does the class imbalance impact the validity of this model? Let's try more cross-validation to see.

In [None]:
# Repeat the binned-churn classification with 3 folds
folds = 3
churnModelCategories = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    dataSetUpdated,
    folds,
    churnBinnedCategories,
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### Let's look at this from another point of view. What characterises the files which have the most lines added?

In [None]:
# Model the 'added' response on the full data set with 2 folds
addedModel = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    metricsData['data'],
    2,
    'added',
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### Net churn and deleted lines are strongly related. What do we find if we're not allowed to use these in our decision tree?

In [None]:
# Withhold 'netchurn' and 'deleted' before modelling 'added' again
alteredData = metricsData['data'].drop(
    ['netchurn', 'deleted'],
    axis=1)
addedModel = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData,
    2,
    'added',
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### The model uses n-revs as the most important feature, but it does not classify '# lines added above the mean' very well

In [None]:
# Additionally withhold 'n-revs'
alteredData2 = metricsData['data'].drop(
    ['netchurn', 'deleted', 'n-revs'],
    axis=1)
addedModel2 = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData2,
    2,
    'added',
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### n-authors has similar problems with identifying the minority class

In [None]:
# Additionally withhold 'n-authors'
alteredData3 = metricsData['data'].drop(
    ['netchurn', 'deleted', 'n-revs', 'n-authors'],
    axis=1)
addedModel3 = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData3,
    2,
    'added',
    modelInstance,
    modelSimpler,
    visualize=True,
    scoreOnly=False)

### fractal-value is derived from n-revs and n-authors

### Let's get rid of it and build a regression model for nline
### This model will predict the sizes of files based on their other static features

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression counterparts of the classifier pair: 8-leaf and 4-leaf trees
modelInstanceR = DecisionTreeRegressor(max_leaf_nodes=8)
modelInstanceRsimpler = DecisionTreeRegressor(max_leaf_nodes=4)

# Withhold the churn/revision features and 'fractal-value', then regress on 'nline'
alteredData4 = metricsData['data'].drop(
    ['netchurn', 'deleted', 'n-revs', 'n-authors', 'fractal-value'],
    axis=1)
nlineModelR = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData4,
    2,
    'nline',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### We see that the number of indented lines is a very good predictor of the number of lines

### Is the model using indent_lines because it is correlated with nline?

In [None]:
# Spearman rank correlation between the 'nline' and 'indent_lines' columns
metricsData['data']['nline'].corr(
    metricsData['data']['indent_lines'],
    method='spearman')

### Let's remove indent_lines from this data set
### Are we still able to regress on nline (and with high performance)?

In [None]:
# Withhold 'indent_lines' together with 'nchar', 'nstatement' and 'nidentifier',
# then regress on 'nline' again
alteredData5 = metricsData['data'].drop(
    ['indent_lines', 'nchar', 'nstatement', 'nidentifier'],
    axis=1)
nlineModelR2 = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData5,
    2,
    'nline',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### cqmetrics provides several measures of the 'number of functions' contained in a file (each calculated differently)

### The model uses these to predict the size of files

In [None]:
# Additionally withhold the three 'nfunction*' columns
alteredData6 = metricsData['data'].drop(
    ['indent_lines', 'nchar', 'nstatement', 'nidentifier',
     'nfunction', 'nfunction2', 'nfunction3'],
    axis=1)
nlineModelR3 = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData6,
    2,
    'nline',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

In [None]:
# Additionally withhold 'unique_nidentifier'
alteredData7 = metricsData['data'].drop(
    ['indent_lines', 'nchar', 'nstatement', 'nidentifier',
     'nfunction', 'nfunction2', 'nfunction3', 'unique_nidentifier'],
    axis=1)
nlineModelR4 = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    alteredData7,
    2,
    'nline',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### Our regression model of nline needs the number of (unique) identifiers, and the number of functions in a file to explain the variance in our dataset

### Without these features, the model rapidly loses accuracy.

### From all the features we measure, the only ones which are strong predictors of size are other measures of such (which are bound to be correlated - lines of source code necessarily add identifiers, operators, functions etc as counted by Halstead's metrics)

### We see that the maximum Halstead complexity metric among the functions in each file is a (weak) predictor under this model. The measures from which it is calculated have a much stronger correlation with size.

### What about modelling these measures of complexity?

In [None]:
# Withhold the other cyclomatic summaries, then regress on 'cyclomatic_max'
cycloData = metricsData['data'].drop(
    ['cyclomatic_sd', 'cyclomatic_mean'],
    axis=1)
cycloModelR = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    cycloData,
    2,
    'cyclomatic_max',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### Cyclomatic complexity seems to be similar to Halstead complexity for our dataset
### The measures which are used to derive both of these are also predictors. Let's remove them and repeat..

In [None]:
# Withhold the Halstead summaries and related columns as well, then repeat
# the 'cyclomatic_max' regression (cycloModelR is rebound to the new result)
cycloData2 = metricsData['data'].drop(
    ['halstead_sd', 'nidentifier', 'halstead_mean', 'halstead_min',
     'cyclomatic_sd', 'cyclomatic_mean', 'halstead_max', 'nstatement',
     'statement_nesting_mean'],
    axis=1)
cycloModelR = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    cycloData2,
    2,
    'cyclomatic_max',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### The number of static variables and the mean indentation level of files are strong predictors of cyclomatic complexity for our dataset

### This indentation predictor is similar to the findings of Hindle. 
### What about the 'ninternal' (static linkage) result? The files containing functions with higher cyclomatic complexity also have more variables (which are shared between functions in the same file)?
### This may be starting to give some insight into our codebase. Perhaps our code overuses file-global variables together with functions which are difficult to test.

### Can we model indentation? What leads to 'wider' files?

In [None]:
# Withhold the other indentation columns, then regress on 'indent_mean'
indentData = metricsData['data'].drop(
    ['indent_sd', 'indent_median', 'indent_max', 'indent_lines'],
    axis=1)
indentModelR = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    indentData,
    2,
    'indent_mean',
    modelInstanceR,
    modelInstanceRsimpler,
    visualize=False,
    scoreOnly=False)

### The files with more nesting (which drives Halstead's complexity) are more indented

### This tells us that our codebase uses indentation to *indicate* nesting frequently. This is typical in C programming, of course. However, there is value in this seemingly simple result: to locate the files with high syntax-driven measures of complexity in this codebase, we can use a heuristic like the level of indentation instead.

### It is important to also consider that our dataset does not have other measures of complexity which have *not* been represented by this model.  Halstead and McCabe's measures are dominant in the measurement of C programs, but other measures of complexity which are not strongly connected with structural *nesting* may not be predicted by indentation. In other words, indentation does not necessarily predict *complexity* - it predicts *Halstead and McCabe complexity*.

In [None]:
# Same indentation data, now modelled with the decision tree classifier pair
# instead of the regressors
indentModelC = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    indentData,
    2,
    'indent_mean',
    modelInstance,
    modelSimpler,
    visualize=False,
    scoreOnly=False)

### We can use the DecisionTreeClassifier to bin samples above and below the mean indentation level, more effectively than we can predict the indentation level itself via regression.

### To what degree is this true? Let's try adding more categories as before.

In [None]:
# Labels for the four bins of the mean indentation response
indentBinnedCategories = ['iLow', 'iMedium', 'iHigh', 'iVeryHigh']

# Note: dataSetUpdated and indentModelC are rebound here to the indentation variants
dataSetUpdated = toolkit.utilities.addBinnedResponseCategory(
    indentData,
    'indent_mean',
    indentBinnedCategories)
indentModelC = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    dataSetUpdated,
    2,
    indentBinnedCategories,
    modelInstance,
    modelSimpler,
    visualize=False,
    scoreOnly=False)

### This model suffers from class imbalance: with only 8 leaves, it loses accuracy when predicting the files with the most indentation. This is a common issue with decision trees being exposed by our dataset. It is not a property of our codebase or of C source code.

### In this case, we can use an *ensemble model* to make up for the loss of accuracy in the minority class. However, we do so at the expense of model interpretation. 

### We build a random forest: a group of decision trees are made with the first decision chosen at random. The group of trees is used to classify each sample, and a majority voting scheme decides the model output.

### This has the effect of creating more trees. Some may resemble the above single tree (which accurately modelled *most* of our data). Others may be highly inaccurate, except for small subsets of the data. 

### The training of this model is 'embarrassingly parallel': we use all the available CPU cores in parallel to create our decision trees.

### Forest models can be difficult to tune, and to interpret. We omit visualization of the many trees. For automated classification tasks used in a production environment (as opposed to empirical research), forests may be valuable despite their lack of interpretability.

In [None]:
# psutil reports the CPU count, which is used as the number of parallel jobs
# when the forest models are constructed
import psutil
from sklearn.ensemble import RandomForestClassifier

cores = psutil.cpu_count()

# Main forest: n_estimators trees, with no restriction on the breadth/depth of trees
modelCF = RandomForestClassifier(criterion='entropy', n_estimators=500, n_jobs=cores)
# Simpler forest: 20 trees of at most 2 leaf nodes, so each tree makes one decision
modelCFsimpler = RandomForestClassifier(criterion='entropy', n_estimators=20, max_leaf_nodes=2, n_jobs=cores)

updatedModelCF = toolkit.refinement.makeAndUpdateModel(
    rootDirectory,
    dataSetUpdated,
    2,
    indentBinnedCategories,
    modelCF,
    modelCFsimpler,
    scoreOnly=False,
    visualize=False)

### The resulting forest together uses many features. Many of these are named entities - our trees are individually modelling the files in our codebase. This showcases a possible threat to the use of forest models with our dataset: there may be a tendency to model the one-hot encoded entities.

### We also note that the minority class is predicted with high Precision, but low Recall. The files that the model does predict as high-indentation are mostly predicted correctly. However, most of the high-indentation files are missed even by this model. The F1-Score (the harmonic mean of Precision and Recall) is similarly low.

### Since the training of a random forest includes random choices (the initial splits in each tree), the results of this cell may vary each time it is run. The models before and after the update are likely to differ.

### In this case study, we have used the toolkit to do the following:
- Gather the Git dataset
- Create classification models of net churn: above and below the mean, in five binned categories
- Create regression models of file size and cyclomatic complexity
- Create regression and classification models of mean indentation 
    - Including a brief test of random forests to improve classification performance with a very small minority class we wish to predict: the files with the highest mean indentation level