# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import networkx as nx

import math
import urllib.request
import io

from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_classif

from sklearn.feature_selection import SelectPercentile
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder
from scipy.io import arff
from scipy.stats import ks_2samp

from networkx.algorithms import bipartite
from networkx import *
from networkx.algorithms.matching import max_weight_matching
from sklearn.model_selection import cross_val_score

from numpy import mean
from numpy import std

# Load Data

In [2]:
def loadfromfile(filename):
    data=arff.loadarff(filename)
    df=pd.DataFrame(data[0])
    return df
def loadfromurl(url):
    ftpstream=urllib.request.urlopen(url)
    data=arff.loadarff(io.StringIO(ftpstream.read().decode('utf-8')))
    df=pd.DataFrame(data[0])
    return df

In [3]:
source_path=r"Datasets\NASA\CM1.arff"
target_path=r"Datasets\NASA\MC1.arff"
source_df=loadfromfile(source_path)
target_df=loadfromfile(target_path)

## Source data preprocessing

In [4]:
last_col=source_df.iloc[:,-1:]
source_df.loc[:,'Defects']=1
labelencoder=LabelEncoder()
source_df['Defects']=labelencoder.fit_transform(last_col.values.ravel())
source_df=source_df.drop(source_df.columns[-2],axis=1)

In [5]:
s_X = source_df.drop(['Defects'],axis=1)
s_y = source_df['Defects']

## Target data preprocessing

In [6]:
last_col=target_df.iloc[:,-1:]
target_df.loc[:,'Defects']=1
labelencoder=LabelEncoder()
target_df['Defects']=labelencoder.fit_transform(last_col.values.ravel()) 
target_df=target_df.drop(target_df.columns[-2],axis=1)

In [7]:
t_X = target_df.drop(['Defects'],axis=1)
t_y = target_df['Defects']

In [8]:
t_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1988 entries, 0 to 1987
Data columns (total 38 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   LOC_BLANK                        1988 non-null   float64
 1   BRANCH_COUNT                     1988 non-null   float64
 2   CALL_PAIRS                       1988 non-null   float64
 3   LOC_CODE_AND_COMMENT             1988 non-null   float64
 4   LOC_COMMENTS                     1988 non-null   float64
 5   CONDITION_COUNT                  1988 non-null   float64
 6   CYCLOMATIC_COMPLEXITY            1988 non-null   float64
 7   CYCLOMATIC_DENSITY               1988 non-null   float64
 8   DECISION_COUNT                   1988 non-null   float64
 9   DESIGN_COMPLEXITY                1988 non-null   float64
 10  DESIGN_DENSITY                   1988 non-null   float64
 11  EDGE_COUNT                       1988 non-null   float64
 12  ESSENTIAL_COMPLEXITY

In [9]:
t_y

0       0
1       0
2       0
3       0
4       0
       ..
1983    0
1984    0
1985    0
1986    0
1987    0
Name: Defects, Length: 1988, dtype: int32

# Metric Selection in Source Datasets

In [10]:
selected_top_features=SelectPercentile(mutual_info_classif,percentile=15)
selected_top_features.fit(s_X,s_y)
top_features=s_X.columns[selected_top_features.get_support()]

In [11]:
top_features

Index(['CALL_PAIRS', 'LOC_COMMENTS', 'LOC_EXECUTABLE', 'HALSTEAD_CONTENT',
       'NUM_OPERATORS', 'NUM_UNIQUE_OPERANDS'],
      dtype='object')

# Matching Source and Target Metrics

In [12]:
ks_values=[]
for sinstances in top_features:
    for tinstances in t_X.columns:
        x=ks_2samp(source_df[sinstances],target_df[tinstances])
        ks_value=(sinstances,tinstances,x)
        ks_values.append(ks_value)

In [13]:
pvalues=[]
def myfunc(e):
    return e[2]
for x in ks_values:
    pair=(x[0],x[1],x[2].pvalue)
    pvalues.append(pair)
pvalues.sort(reverse=True,key=myfunc)
pvalues

[('NUM_UNIQUE_OPERANDS', 'HALSTEAD_CONTENT', 0.0723166933377617),
 ('LOC_EXECUTABLE', 'HALSTEAD_CONTENT', 0.0009477093096285083),
 ('NUM_UNIQUE_OPERANDS', 'NUMBER_OF_LINES', 0.00012245067805405352),
 ('NUM_UNIQUE_OPERANDS', 'NUM_OPERATORS', 0.0001071398716228078),
 ('CALL_PAIRS', 'LOC_BLANK', 3.217885129314091e-05),
 ('LOC_COMMENTS', 'PERCENT_COMMENTS', 1.8865872676410156e-05),
 ('LOC_EXECUTABLE', 'NUM_OPERATORS', 2.5121310260889373e-06),
 ('CALL_PAIRS', 'BRANCH_COUNT', 1.9636606721773475e-06),
 ('CALL_PAIRS', 'CYCLOMATIC_COMPLEXITY', 1.9636606721773475e-06),
 ('CALL_PAIRS', 'CALL_PAIRS', 1.7151198249409916e-06),
 ('LOC_COMMENTS', 'LOC_EXECUTABLE', 3.5113507768080865e-07),
 ('NUM_UNIQUE_OPERANDS', 'NUM_OPERANDS', 1.0594056876200852e-07),
 ('NUM_OPERATORS', 'HALSTEAD_LENGTH', 4.756962379559582e-09),
 ('LOC_COMMENTS', 'LOC_TOTAL', 4.082629789792236e-09),
 ('HALSTEAD_CONTENT', 'HALSTEAD_LENGTH', 7.687194214511806e-10),
 ('LOC_COMMENTS', 'CONDITION_COUNT', 2.2745505479093708e-10),
 ('CALL_

In [14]:
B = nx.Graph()
B.add_nodes_from(top_features, bipartite=0)
B.add_nodes_from(t_X.columns, bipartite=1)

for x in pvalues:
    u=x[0]
    v=x[1]
    w=x[2]
    if w>=0.05:
        B.add_edge(u, v, weight = w)

In [15]:
metric_matching= max_weight_matching(B)
metric_matching

{('HALSTEAD_CONTENT', 'NUM_UNIQUE_OPERANDS')}

In [16]:
target_metrics=[]
source_metrics=[]
for x in top_features:
    for y in metric_matching:
        if y[0]==x:
            target_metrics.append(y[1])
            source_metrics.append(y[0])
        if y[1]==x:
            target_metrics.append(y[0])
            source_metrics.append(y[1])
print(target_metrics, source_metrics)

['NUM_UNIQUE_OPERANDS', 'HALSTEAD_CONTENT'] ['HALSTEAD_CONTENT', 'NUM_UNIQUE_OPERANDS']


# Experimental Design

In [17]:
s_X=s_X[source_metrics]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(s_X, s_y, test_size=0.3, random_state=15)

## Logistic Regression

In [19]:
model=LogisticRegression(solver='liblinear')

In [20]:
model.fit(X_train,y_train)
predictions=model.predict(X_test)

In [21]:
confusion_matrix(y_test,predictions)

array([[91,  0],
       [ 7,  1]], dtype=int64)

In [22]:
accuracy_score(y_test,predictions)*100

92.92929292929293

In [23]:
predictions=model.predict(t_X[target_metrics])
confusion_matrix(t_y,predictions)

array([[1942,    0],
       [  46,    0]], dtype=int64)

In [24]:
accuracy_score(t_y,predictions)*100

97.68611670020121

## Random Forest

In [25]:
rf_model = RandomForestClassifier(n_estimators=100, bootstrap = True, max_features = 'sqrt')

rf_model.fit(X_train,y_train)

RandomForestClassifier(max_features='sqrt')

In [26]:
rf_model.fit(X_train,y_train)
pred=rf_model.predict(X_test)
confusion_matrix(y_test,pred)

array([[87,  4],
       [ 8,  0]], dtype=int64)

In [27]:
accuracy_score(y_test,pred)*100

87.87878787878788

In [28]:
pred=rf_model.predict(t_X[target_metrics])
confusion_matrix(t_y,pred)

array([[1920,   22],
       [  45,    1]], dtype=int64)

In [29]:
accuracy_score(t_y,pred)*100

96.6297786720322

## Support Vector Machine

In [30]:
classifier = SVC(kernel='rbf', random_state = 50)
classifier.fit(X_train,y_train)

SVC(random_state=50)

In [31]:
y_pred = classifier.predict(X_test)

In [32]:
accuracy=accuracy_score(y_test,y_pred)
print(accuracy*100)

91.91919191919192


In [33]:
y_pred = classifier.predict(t_X[target_metrics])

In [34]:
cm = confusion_matrix(t_y,y_pred)
cm

array([[1942,    0],
       [  46,    0]], dtype=int64)

In [35]:
accuracy=accuracy_score(t_y,y_pred)
print(accuracy*100)

97.68611670020121
