# RQ-3: How can we construct defect prediction models for infrastructure as code scripts using the identified source code properties?

In [35]:
import pandas as pd
import scipy
from sklearn.decomposition import PCA

from cliffs_delta import cliffs_delta

mozilla = pd.read_csv("../data/IST_MOZ.csv")

## PCA

In [36]:

X = mozilla.drop("defect_status", axis=1).drop("org", axis=1).drop("file_", axis=1)
mozilla

Unnamed: 0,org,file_,URL,File,Require,Ensure,Include,Attribute,Hard_coded_string,Command,File_mode,SSH_KEY,Lines_of_code,Comment,defect_status
0,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,31,0,0,0,0,66,180,0,0,0,432,3,1
1,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,1,0,0,0,129,229,364,0,0,0,1157,34,1
2,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,2,0,0,0,0,0,18,0,0,0,65,3,0
3,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,1,0,0,0,16,0,4,0,0,0,83,8,1
4,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,6,0,0,0,0,6,30,0,0,0,110,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
575,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,2,2,0,5,10,25,9,1,3,3,88,3,1
576,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,2,1,3,1,5,10,3,1,0,0,27,3,1
577,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,1,2,0,2,3,6,4,0,1,1,36,3,1
578,MOZILLA,/Users/akond/PUPP_REPOS/mozilla-releng-downloa...,0,0,0,3,3,8,9,0,0,0,55,16,0


In [47]:

pca = PCA(n_components=1)
result = pca.fit(X)
result.components_

array([[0.01020099, 0.00194333, 0.00315785, 0.00517546, 0.07642894,
        0.21146341, 0.26259574, 0.00213292, 0.00325757, 0.00325757,
        0.93768626, 0.03254893]])

### Variance for each components
|  | 0 |
| :--- | :--- |
| 0 | 9.587254e-01 |
| 1 | 1.987750e-02 |
| 2 | 1.035012e-02 |
| 3 | 5.422657e-03 |
| 4 | 4.079591e-03 |
| 5 | 5.190187e-04 |
| 6 | 4.671760e-04 |
| 7 | 2.523305e-04 |
| 8 | 1.405435e-04 |
| 9 | 1.099573e-04 |
| 10 | 5.572472e-05 |
| 11 | 9.856219e-36 |

### Weight of each attribute
|  | URL      | File     | Require  | Ensure   | Include  | Attribute | Hard coded string | Command  | File mode | SSH key  | Lines of code | Comment  |
| :--- |:---------|:---------|:---------|:---------|:---------|:----------|:------------------|:---------|:----------|:---------|:--------------|:---------|
| 0 | 0.010201 | 0.001943 | 0.003158 | 0.005175 | 0.076429 | 0.211463  | 0.262596          | 0.002133 | 0.003258  | 0.003258 | 0.937686      | 0.032549 |



In [4]:
data_for_models = result.transform(X)[:, 0]
d = data_for_models.reshape(-1, 1)

## Statistical Learners

In [5]:
X = mozilla.drop("org", axis=1).drop("file_", axis=1).drop("defect_status", axis=1)
Y = mozilla["defect_status"]

In [6]:

df = pd.DataFrame(d, columns=["value"]).merge(Y, left_index=True, right_index=True)
df

Unnamed: 0,value,defect_status
0,412.882790,1
1,1186.053801,1
2,11.959009,0
3,26.536428,1
4,58.615624,1
...,...,...
575,37.264749,1
576,-25.096501,1
577,-17.401614,1
578,2.558040,0


In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)


### Classification and Regression Tree (CART)

In [59]:
from src.modelsutils import dropcol
from sklearn import tree
import src.modelsutils as utils

clf = tree.DecisionTreeClassifier(min_samples_leaf=2)
clf = clf.fit(dropcol(train, "defect_status"), dropcol(train, 'value'))

utils.evaluate(utils.dropcol(test, 'defect_status'),
               utils.dropcol(test, 'value'),
               clf.predict)
utils.evaluate(utils.dropcol(train, 'defect_status'),
               utils.dropcol(train, 'value'),
               clf.predict)


Precision: 0.681
Recall: 0.604
F1-measure: 0.640
AUC: 0.683
Precision: 0.952
Recall: 0.777
F1-measure: 0.856
AUC: 0.873


(0.9523809523809523, 0.7766990291262136, 0.8556149732620322)

### K Nearest Neighbor (KNN)

In [9]:
from numpy import ravel
from sklearn.neighbors import KNeighborsClassifier
import modelsutils as utils

neigh = KNeighborsClassifier()
neigh.fit(utils.X(train), utils.Y(train))
utils.evaluate(utils.X(test), utils.Y(test), neigh.predict)

Precision: 0.547
Recall: 0.660
F1-measure: 0.598
AUC: 0.600


  return self._fit(X, y)


(0.546875, 0.660377358490566, 0.5982905982905983)

### Logistic Regression (LR)


In [10]:
from sklearn.linear_model import LogisticRegression
import modelsutils as utils

lr = LogisticRegression()
lr.fit(utils.X(train), utils.Y(train))
utils.evaluate(utils.X(test), utils.Y(test), lr.predict)
lr.predict_proba(utils.X(test))

Precision: 0.718
Recall: 0.528
F1-measure: 0.609
AUC: 0.677


  y = column_or_1d(y, warn=True)


array([[0.28454564, 0.71545436],
       [0.65870862, 0.34129138],
       [0.67907365, 0.32092635],
       [0.723135  , 0.276865  ],
       [0.46443098, 0.53556902],
       [0.65195551, 0.34804449],
       [0.68701849, 0.31298151],
       [0.5667801 , 0.4332199 ],
       [0.3004583 , 0.6995417 ],
       [0.61597489, 0.38402511],
       [0.09749395, 0.90250605],
       [0.36372235, 0.63627765],
       [0.39368811, 0.60631189],
       [0.74639177, 0.25360823],
       [0.74639177, 0.25360823],
       [0.26070399, 0.73929601],
       [0.77399949, 0.22600051],
       [0.66604934, 0.33395066],
       [0.31883007, 0.68116993],
       [0.75788526, 0.24211474],
       [0.57447733, 0.42552267],
       [0.38111057, 0.61888943],
       [0.5667801 , 0.4332199 ],
       [0.08112534, 0.91887466],
       [0.63956173, 0.36043827],
       [0.81621645, 0.18378355],
       [0.62225771, 0.37774229],
       [0.79173147, 0.20826853],
       [0.14406924, 0.85593076],
       [0.50973245, 0.49026755],
       [0.

### Naive Bayes (NB)


In [11]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(utils.X(train), utils.Y(train))
utils.evaluate(utils.X(test), utils.Y(test), gnb.predict)

Precision: 0.800
Recall: 0.377
F1-measure: 0.513
AUC: 0.649


  y = column_or_1d(y, warn=True)


(0.8, 0.37735849056603776, 0.5128205128205128)

### Random Forest (RF)


In [12]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()
clf.fit(utils.X(train), utils.Y(train))
utils.evaluate(utils.X(test), utils.Y(test), clf.predict)
ypred = clf.predict(utils.X(test))



  return fit_method(estimator, *args, **kwargs)


Precision: 0.655
Recall: 0.679
F1-measure: 0.667
AUC: 0.689


## Cross validation

In [32]:
def createdict(classifier, name):
    return {
        "classifier": classifier,
        "name": name
    }

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

results = {}
for classifier in [createdict(RandomForestClassifier(), "RF"), 
                   createdict(GaussianNB(), "NB"),
                   createdict(LogisticRegression(), "LR"), 
                   createdict(KNeighborsClassifier(), "KNN"),
                   createdict(DecisionTreeClassifier(), "CART")]:
    auc = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='roc_auc').mean()
    recall = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='recall').mean()
    precision = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='precision').mean()
    f1 = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='f1').mean()
    results[classifier["name"]] = {
        "auc": auc,
        "recall": recall,
        "precision": precision,
        "f1": f1
    }
    

In [34]:
pd.DataFrame.from_dict(results)

Unnamed: 0,RF,NB,LR,KNN,CART
auc,0.731664,0.699599,0.756323,0.713161,0.69123
recall,0.649017,0.392519,0.565923,0.619417,0.627106
precision,0.642651,0.831862,0.7066,0.60417,0.64255
f1,0.645764,0.532261,0.62699,0.608864,0.633393


### Cross validation results
|  | RF | NB | LR | KNN | CART |
| :--- | :--- | :--- | :--- | :--- | :--- |
| auc | 0.731664 | 0.699599 | 0.756323 | 0.713161 | 0.691230 |
| recall | 0.649017 | 0.392519 | 0.565923 | 0.619417 | 0.627106 |
| precision | 0.642651 | 0.831862 | 0.706600 | 0.604170 | 0.642550 |
| f1 | 0.645764 | 0.532261 | 0.626990 | 0.608864 | 0.633393 |


## Mirantis

In [60]:
mirantis = pd.read_csv("../data/IST_MIR.csv")

X = mirantis.drop("defect_status", axis=1).drop("org", axis=1).drop("file_", axis=1)
pca = PCA()
result = pca.fit(X)
result.explained_variance_ratio_

array([[ 3.93693652e-03,  1.48759900e-02,  9.42483638e-01,
         2.28617506e-02,  2.92145507e-02,  5.32942110e-02,
         2.46056929e-01,  1.92548176e-01,  9.67309174e-02,
         1.85758292e-03,  1.20627100e-02,  1.21133323e-02],
       [ 1.01598678e-02,  2.16063760e-02, -2.52363571e-01,
         2.43563291e-02,  1.14630606e-01,  1.39326567e-01,
         8.21573881e-01,  3.05298252e-01, -3.65219278e-01,
         7.26369874e-03,  2.17416859e-02,  2.22715879e-02],
       [-1.77539922e-02, -3.56167739e-02,  1.14396767e-01,
        -1.95041136e-02, -2.60289429e-02, -5.82563256e-02,
        -4.50394826e-01,  4.30261179e-01, -7.69433337e-01,
        -7.47813768e-03, -1.96803520e-02, -2.06597486e-02],
       [ 2.02450325e-02,  1.10698127e-02, -1.85181202e-01,
        -2.41680001e-02, -1.35953941e-02, -4.36300633e-02,
        -1.26946971e-01,  8.26856917e-01,  5.12457527e-01,
         9.05731031e-03, -4.31760841e-04,  4.20891669e-04],
       [ 1.56415159e-02, -2.54002740e-01,  2.0731002

In [63]:
result.components_

array([9.59181729e-01, 1.87596675e-02, 1.29117278e-02, 7.11755824e-03,
       1.44652007e-03, 3.26996084e-04, 1.19911611e-04, 4.57041218e-05,
       3.69426043e-05, 2.67868685e-05, 2.59143197e-05, 5.41967257e-07])

### Explained PCA (Mirantis)
#### Components
|  | 0 |
| :--- | :--- |
| 0 | 9.591817e-01 |
| 1 | 1.875967e-02 |
| 2 | 1.291173e-02 |
| 3 | 7.117558e-03 |
| 4 | 1.446520e-03 |
| 5 | 3.269961e-04 |
| 6 | 1.199116e-04 |
| 7 | 4.570412e-05 |
| 8 | 3.694260e-05 |
| 9 | 2.678687e-05 |
| 10 | 2.591432e-05 |
| 11 | 5.419673e-07 |

#### Attributes weights
|         | URL      | File     | Require  | Ensure   | Include  | Attribute | Hard coded string | Command  | File mode | SSH key  | Lines of code | Comment  |
|:--------| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| Weight | 0.003937 | 0.014876 | 0.942484 | 0.022862 | 0.029215 | 0.053294 | 0.246057 | 0.192548 | 0.096731 | 0.001858 | 0.012063 | 0.012113 |


In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

data_for_models = result.transform(X)[:, 0]
d = data_for_models.reshape(-1, 1)
Y = mirantis["defect_status"]
df = pd.DataFrame(d, columns=["value"]).merge(Y, left_index=True, right_index=True)

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

results = {}
for classifier in [createdict(RandomForestClassifier(), "RF"),
                   createdict(GaussianNB(), "NB"),
                   createdict(LogisticRegression(), "LR"),
                   createdict(KNeighborsClassifier(), "KNN"),
                   createdict(DecisionTreeClassifier(), "CART")]:
    auc = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='roc_auc').mean()
    recall = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='recall').mean()
    precision = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='precision').mean()
    f1 = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='f1').mean()
    results[classifier["name"]] = {
        "auc": auc,
        "recall": recall,
        "precision": precision,
        "f1": f1
    }
pd.DataFrame.from_dict(results)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

|  | RF | NB | LR | KNN | CART |
| :--- | :--- | :--- | :--- | :--- | :--- |
| auc | 0.701661 | 0.714252 | 0.750981 | 0.693334 | 0.659597 |
| recall | 0.707425 | 0.407360 | 0.649691 | 0.672546 | 0.707425 |
| precision | 0.701199 | 0.846909 | 0.798322 | 0.667389 | 0.701199 |
| f1 | 0.695896 | 0.541781 | 0.708236 | 0.663964 | 0.698448 |


## Wikimedia

In [65]:
wikimedia = pd.read_csv("../data/IST_WIK.csv")

X = wikimedia.drop("defect_status", axis=1).drop("org", axis=1).drop("file_", axis=1)
pca = PCA()
result = pca.fit(X)
result.explained_variance_ratio_

array([8.90520402e-01, 6.03917193e-02, 3.50979652e-02, 1.00422589e-02,
       1.91119910e-03, 1.17025305e-03, 3.83608951e-04, 1.85563806e-04,
       1.41746008e-04, 9.30937921e-05, 6.21473712e-05, 4.24885234e-08])

In [66]:
result.components_

array([[ 3.37492351e-03,  2.02216197e-02,  1.42577367e-02,
         2.36111732e-02,  4.39844868e-02,  1.88184812e-01,
         2.29110494e-01,  3.69083889e-03,  1.12838939e-02,
         1.14603932e-02,  9.34668061e-01,  1.87356544e-01],
       [-2.59034422e-03, -1.21789842e-02, -1.45820614e-02,
         2.47719026e-02, -3.71130979e-04,  2.61196504e-01,
         6.58631318e-01, -3.20574743e-03,  8.45814519e-03,
         9.72824689e-03, -7.38449134e-02, -7.00978022e-01],
       [ 7.38886003e-04,  7.24945471e-02,  5.07307943e-02,
         1.32570628e-01,  2.10472287e-01,  8.86019504e-01,
        -3.56840117e-01,  1.50034905e-02,  5.15744134e-02,
         5.36404010e-02, -1.09860237e-01,  9.99036049e-03],
       [-6.15141757e-03, -6.88690877e-02, -8.14845795e-02,
         7.71572613e-03, -1.59681848e-01,  2.41734824e-01,
         5.85741662e-01, -1.97285244e-02, -4.59214193e-02,
        -4.30771938e-02, -3.16518072e-01,  6.75984690e-01],
       [-2.79927579e-02,  4.12334399e-01,  1.1852125

In [67]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

data_for_models = result.transform(X)[:, 0]
d = data_for_models.reshape(-1, 1)
Y = wikimedia["defect_status"]
df = pd.DataFrame(d, columns=["value"]).merge(Y, left_index=True, right_index=True)

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

results = {}
for classifier in [createdict(RandomForestClassifier(), "RF"),
                   createdict(GaussianNB(), "NB"),
                   createdict(LogisticRegression(), "LR"),
                   createdict(KNeighborsClassifier(), "KNN"),
                   createdict(DecisionTreeClassifier(), "CART")]:
    auc = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='roc_auc').mean()
    recall = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='recall').mean()
    precision = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='precision').mean()
    f1 = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='f1').mean()
    results[classifier["name"]] = {
        "auc": auc,
        "recall": recall,
        "precision": precision,
        "f1": f1
    }
pd.DataFrame.from_dict(results)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Unnamed: 0,RF,NB,LR,KNN,CART
auc,0.664721,0.709438,0.73627,0.69914,0.583164
recall,0.591171,0.366128,0.5862,0.627349,0.587493
precision,0.664007,0.885945,0.774041,0.732415,0.66662
f1,0.628276,0.515651,0.663515,0.673132,0.6234


### Results (Wikimedia)
#### PCA Components
|  | 0 |
| :--- | :--- |
| 0 | 8.905204e-01 |
| 1 | 6.039172e-02 |
| 2 | 3.509797e-02 |
| 3 | 1.004226e-02 |
| 4 | 1.911199e-03 |
| 5 | 1.170253e-03 |
| 6 | 3.836090e-04 |
| 7 | 1.855638e-04 |
| 8 | 1.417460e-04 |
| 9 | 9.309379e-05 |
| 10 | 6.214737e-05 |
| 11 | 4.248852e-08 |



#### Components ratios
|         | URL      | File     | Require  | Ensure   | Include  | Attribute | Hard coded string | Command  | File mode | SSH key  | Lines of code | Comment  |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| 0 | 0.003375 | 0.020222 | 0.014258 | 0.023611 | 0.043984 | 0.188185 | 0.229110 | 0.003691 | 0.011284 | 0.011460 | 0.934668 | 0.187357 |
| 1 | -0.002590 | -0.012179 | -0.014582 | 0.024772 | -0.000371 | 0.261197 | 0.658631 | -0.003206 | 0.008458 | 0.009728 | -0.073845 | -0.700978 |



#### Cross-validation
|  | RF | NB | LR | KNN | CART |
| :--- | :--- | :--- | :--- | :--- | :--- |
| auc | 0.664721 | 0.709438 | 0.736270 | 0.699140 | 0.583164 |
| recall | 0.591171 | 0.366128 | 0.586200 | 0.627349 | 0.587493 |
| precision | 0.664007 | 0.885945 | 0.774041 | 0.732415 | 0.666620 |
| f1 | 0.628276 | 0.515651 | 0.663515 | 0.673132 | 0.623400 |


## Openstack

In [73]:
openstack = pd.read_csv("../data/IST_OST.csv")

X = openstack.drop("defect_status", axis=1).drop("org", axis=1).drop("file_", axis=1)
pca = PCA()
result = pca.fit(X)
result.explained_variance_ratio_

array([8.98178655e-01, 8.70372406e-02, 8.32671926e-03, 4.75485657e-03,
       1.05170262e-03, 2.02385775e-04, 1.67873823e-04, 1.44436378e-04,
       5.24208668e-05, 4.66811907e-05, 3.58745726e-05, 1.15311668e-06])

In [74]:
result.components_

array([[ 4.07868843e-03,  2.56626645e-03,  9.03592256e-01,
         3.89832304e-03,  1.02988023e-02,  1.91486459e-02,
         1.56927918e-01,  8.56853594e-02,  3.88624673e-01,
         9.08766849e-04,  2.28114030e-03,  2.52242943e-03],
       [-3.02845342e-03,  1.20317006e-02,  2.44985727e-01,
         1.70430755e-02,  2.95373105e-02,  4.96292508e-02,
         3.79899314e-01,  3.71528860e-01, -8.08501100e-01,
         5.31549595e-03,  8.49114955e-03,  8.39717047e-03],
       [-1.48866289e-02, -1.18610833e-02,  3.25624715e-01,
        -2.00723819e-02, -3.17068318e-02, -3.64494567e-02,
        -8.51954829e-01, -1.53354498e-01, -3.76152911e-01,
        -2.59326851e-03, -3.27154218e-03, -4.04829439e-03],
       [ 2.44599544e-02, -7.65231367e-03, -1.24934258e-01,
        -2.30038805e-02, -9.75710914e-02, -1.56056383e-01,
        -2.97485609e-01,  9.00693585e-01,  2.22346481e-01,
        -1.21003073e-02, -2.21358663e-04,  8.04195820e-04],
       [-1.04548682e-02,  1.60923029e-01, -4.2427938

In [75]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

data_for_models = result.transform(X)[:, 0]
d = data_for_models.reshape(-1, 1)
Y = openstack["defect_status"]
df = pd.DataFrame(d, columns=["value"]).merge(Y, left_index=True, right_index=True)

cv = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

results = {}
for classifier in [createdict(RandomForestClassifier(), "RF"),
                   createdict(GaussianNB(), "NB"),
                   createdict(LogisticRegression(), "LR"),
                   createdict(KNeighborsClassifier(), "KNN"),
                   createdict(DecisionTreeClassifier(), "CART")]:
    auc = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='roc_auc').mean()
    recall = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='recall').mean()
    precision = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='precision').mean()
    f1 = cross_val_score(classifier["classifier"], utils.X(df), utils.Y(df), cv=cv, scoring='f1').mean()
    results[classifier["name"]] = {
        "auc": auc,
        "recall": recall,
        "precision": precision,
        "f1": f1
    }
pd.DataFrame.from_dict(results)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Unnamed: 0,RF,NB,LR,KNN,CART
auc,0.647741,0.694343,0.659972,0.659195,0.574832
recall,0.667616,0.368902,0.731321,0.687022,0.660176
precision,0.653112,0.847009,0.643218,0.661449,0.655685
f1,0.66044,0.512676,0.682243,0.673287,0.65736


### Results (Openstack)
#### PCA Components
|  | 0 |
| :--- | :--- |
| 0 | 0.898179 |
| 1 | 0.087037 |
| 2 | 0.008327 |
| 3 | 0.004755 |
| 4 | 0.001052 |
| 5 | 0.000202 |
| 6 | 0.000168 |
| 7 | 0.000144 |
| 8 | 0.000052 |
| 9 | 0.000047 |
| 10 | 0.000036 |
| 11 | 0.000001 |


#### Components ratios
|         | URL      | File     | Require  | Ensure   | Include  | Attribute | Hard coded string | Command  | File mode | SSH key  | Lines of code | Comment  |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| 0 | 0.004079 | 0.002566 | 0.903592 | 0.003898 | 0.010299 | 0.019149 | 0.156928 | 0.085685 | 0.388625 | 0.000909 | 0.002281 | 0.002522 |
| 1 | -0.003028 | 0.012032 | 0.244986 | 0.017043 | 0.029537 | 0.049629 | 0.379899 | 0.371529 | -0.808501 | 0.005315 | 0.008491 | 0.008397 |


#### Cross-validation
|  | RF | NB | LR | KNN | CART |
| :--- | :--- | :--- | :--- | :--- | :--- |
| auc | 0.647741 | 0.694343 | 0.659972 | 0.659195 | 0.574832 |
| recall | 0.667616 | 0.368902 | 0.731321 | 0.687022 | 0.660176 |
| precision | 0.653112 | 0.847009 | 0.643218 | 0.661449 | 0.655685 |
| f1 | 0.660440 | 0.512676 | 0.682243 | 0.673287 | 0.657360 |
