In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score

In [38]:
## Camel
originalDf = pd.read_csv('Datasets/Original/CM1.csv')

In [57]:
# normalize the data
originalDf['LOC_BLANK'] / originalDf['LOC_BLANK'].max()

0      0.054878
1      0.115854
2      0.000000
3      0.012195
4      0.030488
         ...   
322    0.048780
323    0.298780
324    0.103659
325    0.006098
326    0.060976
Name: LOC_BLANK, Length: 327, dtype: float64

In [40]:
## Camel clean and normalized
cleanDf = pd.read_csv('Datasets/Preprocessed/Step1_Cleaned/CM1_Clean.csv')

In [58]:
cleanDf

Unnamed: 0,LOC_BLANK,BRANCH_COUNT,CALL_PAIRS,LOC_CODE_AND_COMMENT,LOC_COMMENTS,CONDITION_COUNT,CYCLOMATIC_COMPLEXITY,CYCLOMATIC_DENSITY,DECISION_COUNT,DECISION_DENSITY,...,NODE_COUNT,NORMALIZED_CYLOMATIC_COMPLEXITY,NUM_OPERANDS,NUM_OPERATORS,NUM_UNIQUE_OPERANDS,NUM_UNIQUE_OPERATORS,NUMBER_OF_LINES,PERCENT_COMMENTS,LOC_TOTAL,Defective
0,0.054878,0.012579,0.115385,0.0250,0.005900,0.016393,0.010638,0.230769,0.017241,0.000000,...,0.020325,0.166667,0.040302,0.048282,0.057878,0.191176,0.029139,0.264841,0.024194,0
1,0.115854,0.000000,0.038462,0.0250,0.000000,0.000000,0.000000,0.038462,0.000000,0.000000,...,0.073171,0.041667,0.078086,0.050736,0.099678,0.117647,0.060927,0.079383,0.056452,0
2,0.000000,0.037736,0.000000,0.0000,0.000000,0.024590,0.031915,1.000000,0.000000,1.000000,...,0.016260,1.000000,0.007557,0.011457,0.012862,0.073529,0.001325,0.000000,0.004032,0
3,0.012195,0.075472,0.076923,0.0125,0.026549,0.081967,0.063830,0.288462,0.068966,0.133333,...,0.052846,0.270833,0.079345,0.092471,0.128617,0.220588,0.059603,0.272626,0.070565,0
4,0.030488,0.012579,0.038462,0.0000,0.000000,0.016393,0.010638,0.211538,0.017241,0.000000,...,0.016260,0.208333,0.026448,0.031915,0.045016,0.147059,0.022517,0.000000,0.026210,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,0.048780,0.025157,0.000000,0.0250,0.041298,0.024590,0.021277,0.384615,0.017241,0.166667,...,0.016260,0.312500,0.018892,0.021277,0.025723,0.191176,0.018543,0.717503,0.020161,0
323,0.298780,0.238994,0.538462,0.2000,0.109145,0.303279,0.202128,0.461538,0.310345,0.016667,...,0.325203,0.229167,0.396725,0.405074,0.299035,0.602941,0.203974,0.652023,0.139113,0
324,0.103659,0.113208,0.269231,0.0000,0.023599,0.131148,0.095745,0.307692,0.120690,0.083333,...,0.158537,0.250000,0.105793,0.099836,0.122186,0.308824,0.095364,0.176561,0.096774,0
325,0.006098,0.000000,0.153846,0.0000,0.008850,0.000000,0.000000,0.134615,0.000000,0.000000,...,0.012195,0.125000,0.020151,0.028642,0.045016,0.176471,0.019868,0.189629,0.024194,0


In [43]:
cleanDf['Defective'].value_counts()

0    285
1     42
Name: Defective, dtype: int64

In [48]:
## Camel Balanced using SMOTE - Non defective is bigger - method for handling class imbalance
balancedDf = pd.read_csv('Datasets/Preprocessed/Step2_Balanced/CM1_Clean_Balanced.csv')

In [45]:
balancedDf['Defective'].value_counts()

0    285
1    285
Name: Defective, dtype: int64

In [46]:
## Camel Feature selection with gain ratio. Select most informative features
gainRatioDf = pd.read_csv('Datasets/Preprocessed/Step3_GainRatio/CM1_Clean_Balanced_GainRatio.csv')

In [50]:

gainRatioDf


Unnamed: 0,PERCENT_COMMENTS,LOC_COMMENTS,HALSTEAD_ERROR_EST,HALSTEAD_VOLUME,HALSTEAD_DIFFICULTY,HALSTEAD_EFFORT,HALSTEAD_PROG_TIME,HALSTEAD_CONTENT,LOC_TOTAL,NUM_UNIQUE_OPERANDS,...,NUMBER_OF_LINES,DECISION_COUNT,NUM_OPERANDS,BRANCH_COUNT,DESIGN_COMPLEXITY,NODE_COUNT,LOC_BLANK,MODIFIED_CONDITION_COUNT,MULTIPLE_CONDITION_COUNT,Defective
0,0.264841,0.005900,0.030631,0.029791,0.098527,0.003639,0.003639,0.103989,0.024194,0.057878,...,0.029139,0.017241,0.040302,0.012579,0.032258,0.020325,0.054878,0.015385,0.016393,0
1,0.079383,0.000000,0.043243,0.042442,0.075022,0.004058,0.004058,0.199086,0.056452,0.099678,...,0.060927,0.000000,0.078086,0.000000,0.016129,0.073171,0.115854,0.000000,0.000000,0
2,0.000000,0.000000,0.005405,0.005059,0.033003,0.000319,0.000319,0.043137,0.004032,0.012862,...,0.001325,0.000000,0.007557,0.037736,0.000000,0.016260,0.000000,0.046154,0.040984,0
3,0.272626,0.026549,0.064865,0.064555,0.100378,0.007754,0.007754,0.238178,0.070565,0.128617,...,0.059603,0.068966,0.079345,0.075472,0.080645,0.052846,0.012195,0.092308,0.081967,0
4,0.000000,0.000000,0.019820,0.018754,0.064075,0.001658,0.001658,0.095027,0.026210,0.045016,...,0.022517,0.017241,0.026448,0.012579,0.032258,0.016260,0.030488,0.015385,0.016393,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
565,0.518852,0.126916,0.157806,0.157832,0.169695,0.031140,0.031140,0.358436,0.175839,0.237839,...,0.224430,0.122723,0.182233,0.089534,0.114805,0.158636,0.245146,0.109507,0.116687,1
566,0.429408,0.030702,0.028788,0.027584,0.100096,0.003462,0.003462,0.093984,0.040874,0.059997,...,0.043376,0.039972,0.037326,0.046028,0.053522,0.036403,0.018703,0.056295,0.048997,1
567,0.336489,0.021670,0.031504,0.030285,0.107797,0.003974,0.003973,0.097340,0.047907,0.064210,...,0.041162,0.050530,0.040351,0.048573,0.062282,0.045278,0.000422,0.059408,0.055674,1
568,0.325237,0.035842,0.089851,0.089547,0.208080,0.020348,0.020348,0.165298,0.088674,0.135987,...,0.076423,0.024565,0.129422,0.021483,0.041393,0.042880,0.015648,0.021919,0.023357,1


In [53]:
features = gainRatioDf.drop('Defective', axis=1)
labels = gainRatioDf['Defective']

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.2)

In [54]:
# peform a Decision Tree Classifier on the gain ratio dataset

clf = DecisionTreeClassifier()
clf.fit(train_features, train_labels)

# predict on the test set
pred_labels = clf.predict(test_features)

# evaluate performance
print('Precision: ', precision_score(test_labels, pred_labels))
print('Recall: ', recall_score(test_labels, pred_labels))
print('F1: ', f1_score(test_labels, pred_labels))


Precision:  0.796875
Recall:  0.8793103448275862
F1:  0.8360655737704917


In [55]:

# peform a Random Forest Classifier on the gain ratio dataset
clf = RandomForestClassifier()
clf.fit(train_features, train_labels)

# predict on the test set
pred_labels = clf.predict(test_features)

# evaluate performance
print('Precision: ', precision_score(test_labels, pred_labels))
print('Recall: ', recall_score(test_labels, pred_labels))
print('F1: ', f1_score(test_labels, pred_labels))


Precision:  0.8870967741935484
Recall:  0.9482758620689655
F1:  0.9166666666666667
