In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from sklearn import linear_model, tree, svm

In [17]:
pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
Collecting et-xmlfile
  Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
Note: you may need to restart the kernel to use updated packages.


## Load data (analogy)

In [33]:
def load_data(path, effort_label, to_drop):
    if path.endswith('csv'):
        dataset = pd.read_csv(path).drop(to_drop, axis=1).replace(-1, np.nan).dropna()
    elif path.endswith('xlsx'):
        dataset = pd.read_excel(path).drop(to_drop, axis=1).replace(-1, np.nan).dropna()
    else:
        return None

    train = dataset.iloc[:-1]
    test  = dataset.iloc[-1]
    train_X = train.drop(effort_label, axis=1)
    test_X = test.drop(effort_label)
    train_y = train[effort_label]
    test_y = test[effort_label]

    return (train_X, train_y, test_X, test_y)

In [4]:
def interval01(train_X, test_X):
    max_X = np.max(train_X, axis=0)
    min_X = np.min(train_X, axis=0)

    train_X_adj = (train_X - min_X) / (max_X - min_X)
    test_X_adj = (test_X - min_X) / (max_X - min_X)

    return (train_X_adj, test_X_adj)

In [5]:
def calculate_nn(train_X, test_X, categorical_label):

    train_X_adj, test_X_adj = interval01(train_X, test_X)

    numerical_distance = (train_X_adj.drop(categorical_label, axis=1) - test_X_adj.drop(categorical_label)) ** 2
    categorical_distance = (1*(train_X_adj[categorical_label] == test_X_adj[categorical_label]))
    euc_distance = np.sqrt(np.sum(pd.concat([numerical_distance, categorical_distance], axis=1), axis=1)/np.shape(train_X)[0])
    rank = np.argsort(euc_distance).values
    return rank

In [6]:
def uavg(rank, train_y, k):
    estimate_effort = np.mean(train_y[rank[:k]])
    return estimate_effort

In [7]:
def irwm(rank, train_y, k):
    estimate_effort = np.sum((list(range(k,0,-1)) * train_y[rank[:k]])/np.sum(range(k+1)))
    return estimate_effort

In [8]:
def lsa(rank, train_y, k, train_X, test_X, size_label):
    software_size_train = train_X[size_label]
    software_size_test = test_X[size_label]
    estimate_effort = np.mean(train_y[rank[:k]]/software_size_train[rank[:k]]) * software_size_test
    return estimate_effort

In [9]:
def rtm(rank, train_y, k, train_X, test_X, categorical_label, size_label, group_label):
    software_size_train = train_X[size_label]
    productivity_train = train_y / software_size_train

    software_size_test = test_X[size_label]
    group_test = test_X[group_label].iloc[0]

    M = productivity_train.loc[(train_X[group_label] == group_test).values].mean()

    all_analogues_productivity = productivity_train * 0
    for i in train_X.index:
        analogues_train = train_X.drop([i])
        analogues_test = train_X.loc[i]
        all_analogues_productivity.loc[i] = productivity_train.iloc[calculate_nn(analogues_train, analogues_test, categorical_label)[0]]

    r, _ = pearsonr(productivity_train.loc[(train_X[group_label] == group_test).values], all_analogues_productivity.loc[(train_X[group_label] == group_test).values],)

    estimate_effort = software_size_test * ( (1/k*software_size_train[rank[:k]].sum()) + (M - (1/k*software_size_train[rank[:k]].sum()) * (1-r)) )

    return estimate_effort

## main analogy

In [36]:
    k=3
    path='albrecht.xlsx'
    effort_label='Effort'
    size_label='AdjFP'
    categorical_label = []
    group_label = []
    to_drop = ['id','FPAdj','RawFP']

    train_X, train_y, test_X, test_y = load_data(path, effort_label, to_drop)
    rank = calculate_nn(train_X, test_X, categorical_label)

    estimate_effort_uavg = uavg(rank, train_y, k)
    estimate_effort_irwm = irwm(rank, train_y, k)
    estimate_effort_lsa = lsa(rank, train_y, k, train_X, test_X, size_label)
#     estimate_effort_rtm = rtm(rank, train_y, k, train_X, test_X, categorical_label, size_label, group_label)

    print({'actual': test_y, 'uavg': estimate_effort_uavg, 'irwm': estimate_effort_irwm, 'lsa': estimate_effort_lsa, 'rtm': estimate_effort_rtm})

    err_uavg = np.abs(estimate_effort_uavg-test_y)
    err_irwm = np.abs(estimate_effort_irwm-test_y)
    err_lsa = np.abs(estimate_effort_lsa-test_y)
#     err_rtm = np.abs(estimate_effort_rtm-test_y)

    print({'uavg err': err_uavg, 'irwm err': err_irwm, 'lsa err': err_lsa, 'rtm err': err_rtm})

{'actual': 6.1, 'uavg': 7.366666666666667, 'irwm': 8.35, 'lsa': 8.144699694752541, 'rtm': 2878.1471786356806}
{'uavg err': 1.2666666666666675, 'irwm err': 2.25, 'lsa err': 2.0446996947525413, 'rtm err': 21061.85282136432}


## ML anology

In [28]:

    path='albrecht.xlsx'
    effort_label='Effort'
    size_label='AdjFP'
    categorical_label = []
    to_drop = ['id','FPAdj','RawFP']

    train_X, train_y, test_X, test_y = load_data(path, effort_label, to_drop)

    clf = tree.DecisionTreeRegressor()
    clf = clf.fit(train_X, train_y)
    estimate_effort_dt = clf.predict(pd.DataFrame(test_X).T)

    clf = linear_model.LinearRegression()
    clf = clf.fit(train_X, train_y)
    estimate_effort_ols = clf.predict(pd.DataFrame(test_X).T)

    clf = svm.SVR()
    clf = clf.fit(train_X, train_y)
    estimate_effort_svr = clf.predict(pd.DataFrame(test_X).T)

    print({'actual': test_y, 'dt': estimate_effort_dt, 'ols': estimate_effort_ols, 'svr': estimate_effort_svr})

    err_dt = np.abs(estimate_effort_dt - test_y)
    err_ols = np.abs(estimate_effort_ols - test_y)
    err_svr = np.abs(estimate_effort_svr - test_y)

    print({'dt err': err_dt, 'ols err': err_ols, 'svr err': err_svr})

{'actual': 6.1, 'dt': array([8.]), 'ols': array([-3.87556928]), 'svr': array([9.82936682])}
{'dt err': array([1.9]), 'ols err': array([9.97556928]), 'svr err': array([3.72936682])}


## Main Main anology

In [37]:
    k = 3
    path='albrecht.xlsx'
    effort_label='Effort'
    size_label='AdjFP'
    categorical_label = []
    group_label = []
    to_drop = ['id','FPAdj','RawFP']

    train_X, train_y, test_X, test_y = load_data(path, effort_label, to_drop)
    rank = calculate_nn(train_X, test_X, categorical_label)

    estimate_effort_uavg = uavg(rank, train_y, k)
    estimate_effort_irwm = irwm(rank, train_y, k)
    estimate_effort_lsa = lsa(rank, train_y, k, train_X, test_X, size_label)

    print({'actual': test_y, 'uavg': estimate_effort_uavg, 'irwm': estimate_effort_irwm, 'lsa': estimate_effort_lsa,
           'rtm': estimate_effort_rtm})

    err_uavg = np.abs(estimate_effort_uavg - test_y)
    err_irwm = np.abs(estimate_effort_irwm - test_y)
    err_lsa = np.abs(estimate_effort_lsa - test_y)


{'actual': 6.1, 'uavg': 7.366666666666667, 'irwm': 8.35, 'lsa': 8.144699694752541, 'rtm': 2878.1471786356806}
