# Imports

In [1]:
import csv
import sys, getopt, os
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn import tree
from learning_union_ktss_master.HierarchicalClusteringKTSS import learn_unions_ktss
from learning_union_ktss_master.KTestable import calculateEIFTC
import pandas as pd
import numpy as np
import random
import datetime
import warnings
warnings.filterwarnings('ignore')


# Methods

In [2]:
def readPredatasetFromFile(f):
    """Read a trace file and return its lines without trailing newlines.

    Parameters
    ----------
    f : path of the text file to read (anything accepted by ``open``).

    Returns
    -------
    list of str
        One entry per line of the file, with trailing newline characters
        removed. Other whitespace is left untouched.
    """
    # A context manager closes the handle deterministically; the previous
    # bare ``open(f)`` left one descriptor open per file read.
    with open(f) as traceFile:
        return [line.rstrip('\n') for line in traceFile]

def readPredatasetFromDirectory(path):
    """Yield ``(fileName, [trace])`` for every line of every file in ``path``.

    ``fileName`` is the directory entry's name up to its first ``".txt"``.
    Each line is wrapped in a one-element list, so every yielded record
    carries exactly one trace.
    """
    for entry in os.listdir(path):
        fileName = entry.split(".txt")[0]
        for trace in readPredatasetFromFile(path + '/' + entry):
            yield fileName, [trace]

def removeExtraSpaces(dataset):
    """Normalise the spacing of every trace in ``dataset``.

    Leading and trailing whitespace is stripped, and runs of spaces between
    tokens are collapsed to a single space. The list is modified in place
    and the same list object is returned.
    """
    for idx, trace in enumerate(dataset):
        tokens = [token for token in trace.strip().split(' ') if token]
        dataset[idx] = ' '.join(tokens)
    return dataset

def attachShortTraceToAfter(dataset,k_window):
    """Merge traces shorter than ``k_window`` tokens into the trace after them.

    A short trace is prepended (separated by one space) to the next entry of
    ``dataset``, which is modified in place. A short trace in last position
    has nothing to merge into and is dropped.

    Returns a new list holding only the traces with at least ``k_window``
    space-separated tokens, after merging.
    """
    kept = []
    lastIndex = len(dataset) - 1
    for pos in range(len(dataset)):
        trace = dataset[pos]
        if len(trace.split(' ')) >= k_window:
            kept.append(trace)
        elif pos < lastIndex:
            # Carry the short trace over to its successor.
            dataset[pos + 1] = trace + ' ' + dataset[pos + 1]
    return kept

def removeShortTrace(dataset,k_window):
    """Return the traces of ``dataset`` with at least ``k_window`` tokens.

    Tokens are obtained by splitting on single spaces. ``dataset`` itself is
    left untouched; a new list is returned.
    """
    return [trace for trace in dataset if len(trace.split(' ')) >= k_window]

def process(dataset,k_window,isTrain):
    """Clean a list of traces and deal with those shorter than ``k_window``.

    Spacing is normalised first. For training data (``isTrain`` true) short
    traces are merged into the following trace; otherwise they are removed.
    Spacing is normalised once more on the result, which is returned.
    """
    cleaned = removeExtraSpaces(dataset)
    handleShortTraces = attachShortTraceToAfter if isTrain else removeShortTrace
    return removeExtraSpaces(handleShortTraces(cleaned, k_window))

def calculateEIFTC(X,k):
    """Compute the k-testable description (alphabet, I, F, T, C) of traces.

    NOTE(review): this definition shadows the ``calculateEIFTC`` imported
    from ``learning_union_ktss_master.KTestable`` at the top of the notebook.

    Parameters
    ----------
    X : iterable of str
        Traces whose symbols are separated by single spaces.
    k : int
        Window size.

    Returns
    -------
    tuple ``(sigma, I, F, T, C)``
        sigma : list of the distinct symbols used in ``X`` (arbitrary order).
        I : key view of the (k-1)-symbol prefixes of traces with at least
            k-1 symbols.
        F : key view of the (k-1)-symbol suffixes of those same traces;
            left empty when ``k <= 1``.
        T : key view of every window of k consecutive symbols.
        C : key view of the traces that have fewer than k symbols.
        Key views keep first-seen order and support set operations.
    """
    kminus1 = k - 1

    alphabet = set()
    I = {}
    F = {}
    T = {}
    C = {}

    # One pass over the traces: each trace is split once, and the alphabet is
    # accumulated in a set instead of being rebuilt from a list every time.
    for x in X:
        xs = x.split(' ')
        alphabet.update(xs)

        if len(xs) >= kminus1:
            I[" ".join(xs[:kminus1])] = None
            # With k == 1 the slice xs[-0:] would be the whole trace, so
            # suffixes are only collected for k > 1.
            if k > 1:
                F[" ".join(xs[-kminus1:])] = None

        if len(xs) < k:
            C[" ".join(xs)] = None
        else:
            for start in range(len(xs) - k + 1):
                T[" ".join(xs[start:start + k])] = None

    return list(alphabet), I.keys(), F.keys(), T.keys(), C.keys()

def getDeltasFromKtssArr(testRow, trainRow):
    """Compare the k-TSS description of one trace with a reference model.

    Parameters
    ----------
    testRow : object with attributes ``T`` (segments, supporting set
        difference against a key view) and ``E`` (iterable of symbols).
    trainRow : object with attributes ``T`` (mapping keyed by segment) and
        ``E`` (iterable of symbols).

    Returns
    -------
    tuple ``(deltaT2, deltaT1, deltaE, similarityE)``
        deltaT2 : fraction of ``testRow.T`` segments missing from ``trainRow.T``.
        deltaT1 : fraction of ``trainRow.T`` segments missing from ``testRow.T``.
        deltaE : fraction of test symbols missing from the train alphabet.
        similarityE : Jaccard similarity of the two alphabets.
        Each value is 0 when its denominator would be 0.
    """
    # Only the four returned metrics are computed. The earlier version also
    # derived deltaI, deltaF and similarityT on every call and discarded them.
    T2 = testRow.T
    T1 = trainRow.T.keys()
    E2 = set(testRow.E)
    E1 = set(trainRow.E)

    deltaT2 = len(T2 - T1) / len(T2) if len(T2) else 0
    deltaT1 = len(T1 - T2) / len(T1) if len(T1) else 0
    deltaE = len(E2 - E1) / len(E2) if E2 else 0

    unionE = E2 | E1
    similarityE = len(E2 & E1) / len(unionE) if unionE else 0

    return deltaT2, deltaT1, deltaE, similarityE

def getDeltas(testRow, trainRow, k):
    """Compare a test row's k-TSS sets with the model stored in a train row.

    Parameters
    ----------
    testRow : mapping with keys ``'T'`` (segments, supporting set difference
        against a key view) and ``'E'`` (iterable of symbols).
    trainRow : mapping whose ``'ktssArr'`` entry exposes ``T`` (mapping keyed
        by segment) and ``E`` (iterable of symbols).
    k : unused; kept so existing callers keep working.

    Returns
    -------
    tuple ``(deltaT2, deltaT1, deltaE, similarityE)``
        deltaT2 : fraction of test segments missing from the train model.
        deltaT1 : fraction of train segments missing from the test row.
        deltaE : fraction of test symbols missing from the train alphabet.
        similarityE : Jaccard similarity of the two alphabets.
        Each value is 0 when its denominator would be 0.
    """
    # Only the four returned metrics are computed. The earlier version also
    # derived deltaI, deltaF and similarityT on every call and discarded them.
    trainKtss = trainRow['ktssArr']
    T2 = testRow['T']
    T1 = trainKtss.T.keys()
    E2 = set(testRow['E'])
    E1 = set(trainKtss.E)

    deltaT2 = len(T2 - T1) / len(T2) if len(T2) else 0
    deltaT1 = len(T1 - T2) / len(T1) if len(T1) else 0
    deltaE = len(E2 - E1) / len(E2) if E2 else 0

    unionE = E2 | E1
    similarityE = len(E2 & E1) / len(unionE) if unionE else 0

    return deltaT2, deltaT1, deltaE, similarityE

def getCategorizedFileNames(row, trainDf, k_window):
    """Predict the training file that best matches ``row``.

    The deltas between ``row`` and every row of ``trainDf`` (via
    ``getDeltas``) become a four-feature training set labelled with the
    train file names. A one-vs-rest linear SVM is fitted on it and asked to
    classify the ideal point: no missing segments or symbols, and full
    alphabet similarity.

    Returns the predicted file name, or ``None`` if the classifier returns
    no prediction.
    """
    deltas = trainDf[['filename']].copy()
    deltas['T2'], deltas['T1'], deltas['E'],  deltas['E_Similarity']= zip(*trainDf.apply(lambda x: getDeltas(row, x, k_window), axis=1))
    X = deltas.to_numpy()

    # Column 0 holds the label (filename); the remaining four are features.
    X_train = X[:, 1:]
    Y_train = X[:, 0]
    clf = OneVsRestClassifier(SVC(random_state = 0, kernel = 'linear'))
    clf = clf.fit(X_train, Y_train)

    # The model is trained on exactly four features, so the query is a single
    # four-value point. The previous ``* len(windows)`` referenced a name
    # that is not defined anywhere in this notebook (NameError).
    y = clf.predict([[0.0, 0.0, 0.0, 1.0]])

    # Explicit length check rather than relying on NumPy array truthiness.
    return y[0] if len(y) else None

In [3]:
def getDeltasFromKtssArrByFrequency(testRow, trainRow):
    """Frequency-weighted comparison of two k-TSS descriptions.

    Parameters
    ----------
    testRow, trainRow : objects with attributes ``T`` (mapping from segment
        to a numeric count) and ``E`` (iterable of symbols).

    Returns
    -------
    tuple ``(deltaT2, deltaT1, deltaE, similarityE)``
        deltaT2 : share of the test segment counts that exceeds the train
            count of the same segment.
        deltaT1 : the same measure with the roles swapped.
        deltaE : fraction of test symbols missing from the train alphabet.
        similarityE : Jaccard similarity of the two alphabets.
        Each value is 0 when its denominator would be 0.
    """
    # Only the four returned metrics are computed. The earlier version also
    # derived deltaI, deltaF and similarityT on every call and discarded them.
    T2 = testRow.T
    T1 = trainRow.T
    E2 = set(testRow.E)
    E1 = set(trainRow.E)

    totalT2 = sum(T2.values())
    if totalT2 == 0:
        deltaT2 = 0
    else:
        # Only positive surpluses count; segments the other side has more of
        # contribute nothing.
        surplusT2 = [count - T1.get(segment, 0) for segment, count in T2.items()]
        deltaT2 = sum(v for v in surplusT2 if v > 0) / totalT2

    totalT1 = sum(T1.values())
    if totalT1 == 0:
        deltaT1 = 0
    else:
        surplusT1 = [count - T2.get(segment, 0) for segment, count in T1.items()]
        deltaT1 = sum(v for v in surplusT1 if v > 0) / totalT1

    deltaE = len(E2 - E1) / len(E2) if E2 else 0

    unionE = E2 | E1
    similarityE = len(E2 & E1) / len(unionE) if unionE else 0

    return deltaT2, deltaT1, deltaE, similarityE

# Fill initial parameters

In [4]:
# Root folder of the fold under evaluation; it is also passed to ktssTest
# as the output directory further down.
maindir = 'OneFold'
# Each file in these folders is read line by line, one trace per line.
traindir = maindir + '/Train' 
testdir = maindir + '/Test'
# Window size k used by preprocessing and by every k-TSS computation below.
k_window = 3

# Load data from input files

### Prepare train data

In [5]:
# One record per line of every training file; 'preProcessedData' is a
# one-element list holding that line.
trainDataset = pd.DataFrame.from_records([{ 'filename': name, 'preProcessedData': data } for name, data in readPredatasetFromDirectory(traindir)])
# Discard records whose line is empty.
trainDataset = trainDataset[trainDataset['preProcessedData'].map(lambda d: len(d[0])) > 0]
# Clean each trace. Every list holds a single trace, so a trace shorter than
# k_window has no successor to merge into and comes back as an empty list.
trainDataset['processed'] = trainDataset['preProcessedData'].apply(lambda x: process(x, k_window, True))
# Drop the records that preprocessing emptied.
trainDataset = trainDataset[trainDataset['processed'].map(lambda d: len(d)) > 0]
# 1-based running number of each trace within its file.
trainDataset['traceId'] = trainDataset.groupby(['filename']).cumcount()+1

# Number of distinct training files.
APP_COUNT = len(trainDataset['filename'].value_counts())

In [6]:
# One k-TSS model per individual trace (row).
trainDataset['ktssArr'] = trainDataset['processed'].apply(lambda x: learn_unions_ktss(x, 1,k_window,0,0,None)[1][0])

# One k-TSS model per file, learned from all traces of that file. It is
# learned once per distinct filename and then looked up for each row; the
# previous version re-filtered the frame and re-learned the same model for
# every single row.
appKtssByFilename = {
    name: list(learn_unions_ktss([item for sublist in group.tolist() for item in sublist], 1,k_window,0,0,None)[1])[0]
    for name, group in trainDataset.groupby('filename', sort=False)['processed']
}
trainDataset['appktssArr'] = trainDataset['filename'].apply(lambda name: appKtssByFilename[name])

trainDataset.head()

Unnamed: 0,filename,preProcessedData,processed,traceId,ktssArr,appktssArr
0,com.microsoft.skydrive_all,[S119 S6004 S2930 S2930 S2930 S6009 S6009 S600...,[S119 S6004 S2930 S2930 S2930 S6009 S6009 S600...,1,"['S119', 'S137', 'S205', 'S22', 'S25', 'S29', ...","['S104', 'S119', 'S135', 'S136', 'S137', 'S166..."
1,com.microsoft.skydrive_all,[S8 S3005 S3953 S29 S2930 S2930 S6028 S8 S8 S2...,[S8 S3005 S3953 S29 S2930 S2930 S6028 S8 S8 S2...,2,"['S29', 'S2930', 'S2997', 'S3005', 'S3056', 'S...","['S104', 'S119', 'S135', 'S136', 'S137', 'S166..."
2,com.microsoft.skydrive_all,[S6030 S8 S6036 S8 S8 S4934 S4934 S4934 S4934 ...,[S6030 S8 S6036 S8 S8 S4934 S4934 S4934 S4934 ...,3,"['S104', 'S137', 'S2997', 'S3005', 'S3049', 'S...","['S104', 'S119', 'S135', 'S136', 'S137', 'S166..."
3,com.microsoft.skydrive_all,[S6030 S8 S607 S8 S8 S6045 S6045 S8 S6045 S17 ...,[S6030 S8 S607 S8 S8 S6045 S6045 S8 S6045 S17 ...,4,"['S136', 'S137', 'S17', 'S28', 'S2997', 'S31',...","['S104', 'S119', 'S135', 'S136', 'S137', 'S166..."
4,com.microsoft.skydrive_all,[S8 S8 S8 S331 S5 S8 S2918 S5 S17 S2918 S95 S1...,[S8 S8 S8 S331 S5 S8 S2918 S5 S17 S2918 S95 S1...,5,"['S17', 'S2918', 'S2969', 'S2997', 'S3049', 'S...","['S104', 'S119', 'S135', 'S136', 'S137', 'S166..."


# Create Learning Model

In [7]:
# Work on a copy of the train dataset, keeping only the columns needed here.
from sklearn.linear_model import LogisticRegression
dfTrain = trainDataset.copy()
dfTrain = dfTrain[['filename', 'ktssArr', 'appktssArr','traceId']]
# cpTrain is the reference side of the comparison: one per-file model per
# file. 'appktssArr' is the same for every row of a file, so keeping the
# rows with traceId == 1 keeps one row per file that has such a row.
cpTrain = dfTrain.copy().rename(columns={'filename': 'compareTo', 'appktssArr': 'ktssArr_compareTo'})
cpTrain = cpTrain[cpTrain['traceId'] == 1]
dfTrain = dfTrain.drop(columns=['appktssArr']) 

cpTrain = cpTrain.drop(columns=['traceId','ktssArr'])


# Constant join key: merging on it pairs every trace with every reference
# model (a cross join).
dfTrain['key'] = 0
cpTrain['key'] = 0

dfCrossed = dfTrain.merge(cpTrain, on='key', how='outer')

dfCrossed = dfCrossed.drop(columns=['key'])
# Four distance/similarity features per (trace, reference model) pair.
dfCrossed['deltaT2'], dfCrossed['deltaT1'], dfCrossed['deltaE'], dfCrossed['similarityE'] = zip(*dfCrossed.apply(lambda x: getDeltasFromKtssArr(x['ktssArr'], x['ktssArr_compareTo']), axis=1))
# The model objects are no longer needed once the features exist.
dfCrossed = dfCrossed.drop(columns=['ktssArr', 'ktssArr_compareTo' ])

In [8]:

# Reshape to one row per (filename, traceId) with one column per
# (metric, reference file) pair.
dff=dfCrossed.pivot_table(index=['filename','traceId'], columns=['compareTo'], values=['deltaT2', 'deltaT1', 'deltaE', 'similarityE'])
# Flatten the two-level column index into names like '<metric>_<compareTo>'.
dff.columns = ['_'.join(col) for col in dff.columns]
# Turn 'filename' into a regular first column; traceId remains the index.
dff.reset_index(level=0, inplace=True)


In [9]:
# Show the training feature table built above.
dff

Unnamed: 0_level_0,filename,deltaE_com.PregnancyCalendar_all,deltaE_com.SkyDivers.petals3d_all,deltaE_com.Slack_all,deltaE_com.cleanmaster.security_all,deltaE_com.cmplay.tiles2_all,deltaE_com.codecomputerlove.higherlowergame_all,deltaE_com.com2us.smon.normal.freefull.google.kr.android.common_all,deltaE_com.commsource.beautyplus_all,deltaE_com.contextlogic.wish_all,...,similarityE_emoji.keyboard.emoticonkeyboard_all,similarityE_fm.player_all,similarityE_jp.naver.linecamera.android_all,similarityE_mobi.infolife.ezweather_all,similarityE_mobi.mgeek.TunnyBrowser_all,similarityE_mrigapps.andriod.fuelcons_all,similarityE_music.bassbooster.equalizer_all,similarityE_net.zedge.android_all,similarityE_org.mozilla.firefox_all,similarityE_vsin.t16_funny_photo_all
traceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,com.PregnancyCalendar_all,0.000000,0.300000,0.700000,0.600000,0.500000,0.600000,0.500000,0.600000,0.700000,...,0.035088,0.024242,0.039216,0.018018,0.035088,0.035398,0.035088,0.017241,0.033333,0.037736
2,com.PregnancyCalendar_all,0.000000,0.312500,0.750000,0.687500,0.562500,0.625000,0.812500,0.625000,0.812500,...,0.050847,0.017442,0.072727,0.017094,0.033333,0.051282,0.083333,0.033333,0.031746,0.054545
3,com.PregnancyCalendar_all,0.000000,0.333333,0.619048,0.619048,0.714286,0.571429,0.666667,0.857143,0.809524,...,0.066116,0.034483,0.049180,0.024793,0.032000,0.066667,0.147541,0.048780,0.015038,0.100000
4,com.PregnancyCalendar_all,0.000000,0.450000,0.600000,0.750000,0.650000,0.650000,0.700000,0.800000,0.750000,...,0.057851,0.022857,0.032787,0.016529,0.032258,0.067227,0.112903,0.040650,0.022901,0.071429
5,com.PregnancyCalendar_all,0.000000,0.466667,0.666667,0.733333,0.533333,0.600000,0.600000,0.733333,0.666667,...,0.051282,0.017544,0.035714,0.017241,0.033613,0.051724,0.103448,0.051282,0.023810,0.084906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,vsin.t16_funny_photo_all,0.555556,0.277778,0.555556,0.388889,0.500000,0.555556,0.500000,0.833333,0.666667,...,0.058824,0.017241,0.070175,0.043103,0.041322,0.050420,0.116667,0.067797,0.031250,0.180000
7,vsin.t16_funny_photo_all,0.650000,0.350000,0.600000,0.600000,0.500000,0.600000,0.750000,0.650000,0.650000,...,0.057851,0.028736,0.145455,0.051282,0.057851,0.040984,0.095238,0.057851,0.055118,0.200000
8,vsin.t16_funny_photo_all,0.681818,0.454545,0.681818,0.727273,0.681818,0.545455,0.772727,0.818182,0.727273,...,0.083333,0.028409,0.101695,0.033058,0.040000,0.032000,0.075758,0.048387,0.038168,0.220000
9,vsin.t16_funny_photo_all,0.560000,0.480000,0.760000,0.680000,0.640000,0.680000,0.600000,0.760000,0.840000,...,0.072581,0.045455,0.079365,0.032258,0.031008,0.039370,0.138462,0.047244,0.029630,0.250000


In [10]:
# Column 0 of the array is the filename (the class label); the remaining
# columns are the features. NOTE: the name X is reassigned further down for
# the test set.
X = dff.to_numpy()
X_train = X[:,1:]
Y_train = X[:, 0]
X 

array([['com.PregnancyCalendar_all', 0.0, 0.3, ..., 0.017241379310344827,
        0.03333333333333333, 0.03773584905660377],
       ['com.PregnancyCalendar_all', 0.0, 0.3125, ...,
        0.03333333333333333, 0.031746031746031744, 0.05454545454545454],
       ['com.PregnancyCalendar_all', 0.0, 0.3333333333333333, ...,
        0.04878048780487805, 0.015037593984962405, 0.1],
       ...,
       ['vsin.t16_funny_photo_all', 0.6818181818181818,
        0.45454545454545453, ..., 0.04838709677419355,
        0.03816793893129771, 0.22],
       ['vsin.t16_funny_photo_all', 0.56, 0.48, ...,
        0.047244094488188976, 0.02962962962962963, 0.25],
       ['vsin.t16_funny_photo_all', 0.6875, 0.5625, ...,
        0.04201680672268908, 0.04, 0.16]], dtype=object)

# Prepare test data

In [11]:
# One record per line of every test file, each line wrapped in a list.
testDataset = pd.DataFrame.from_records([{ 'filename': name, 'preProcessedData': data } for name, data in readPredatasetFromDirectory(testdir)])
# Clean each trace; with isTrain=False traces shorter than k_window are removed.
testDataset['processed'] = testDataset['preProcessedData'].apply(lambda x: process(x, k_window, False))
# Drop the records that preprocessing emptied.
testDataset = testDataset[testDataset['processed'].map(lambda d: len(d)) > 0]
# One k-TSS model per test trace.
testDataset['ktssArr'] = testDataset['processed'].apply(lambda x: learn_unions_ktss(x, 1,k_window,0,0,None)[1][0])
# 1-based running number of each trace within its file.
testDataset['traceId'] = testDataset.groupby(['filename']).cumcount()+1

# testDataset

# Extract features for test dataset

In [12]:
# copy test dataset
df1 = testDataset.copy()
df1 = df1[['filename', 'ktssArr','traceId']]
# cp = df1.copy().rename(columns={'filename': 'compareTo', 'ktssArr': 'ktssArr_compareTo'})
df1['key'] = 0
# cp['key'] = 0
dfCrossed_test = df1.merge(cpTrain, on='key', how='outer')
dfCrossed_test = dfCrossed_test.drop(columns=['key'])
dfCrossed_test['deltaT2'], dfCrossed_test['deltaT1'], dfCrossed_test['deltaE'], dfCrossed_test['similarityE'] = zip(*dfCrossed_test.apply(lambda x: getDeltasFromKtssArr(x['ktssArr'], x['ktssArr_compareTo']), axis=1))
dfCrossed_test = dfCrossed_test.drop(columns=['ktssArr', 'ktssArr_compareTo' ])
dff_test=dfCrossed_test.pivot_table(index=['filename','traceId'], columns=['compareTo'], values=['deltaT2', 'deltaT1', 'deltaE', 'similarityE'])
dff_test.columns = ['_'.join(col) for col in dff_test.columns]
dff_test.reset_index(level=0, inplace=True)


In [13]:
# Show the test feature table built above.
dff_test

Unnamed: 0_level_0,filename,deltaE_com.PregnancyCalendar_all,deltaE_com.SkyDivers.petals3d_all,deltaE_com.Slack_all,deltaE_com.cleanmaster.security_all,deltaE_com.cmplay.tiles2_all,deltaE_com.codecomputerlove.higherlowergame_all,deltaE_com.com2us.smon.normal.freefull.google.kr.android.common_all,deltaE_com.commsource.beautyplus_all,deltaE_com.contextlogic.wish_all,...,similarityE_emoji.keyboard.emoticonkeyboard_all,similarityE_fm.player_all,similarityE_jp.naver.linecamera.android_all,similarityE_mobi.infolife.ezweather_all,similarityE_mobi.mgeek.TunnyBrowser_all,similarityE_mrigapps.andriod.fuelcons_all,similarityE_music.bassbooster.equalizer_all,similarityE_net.zedge.android_all,similarityE_org.mozilla.firefox_all,similarityE_vsin.t16_funny_photo_all
traceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,com.PregnancyCalendar_all,0.272727,0.454545,0.727273,0.727273,0.363636,0.636364,0.727273,0.727273,0.727273,...,0.025862,0.017964,0.038462,0.017857,0.034783,0.035088,0.034483,0.025862,0.033058,0.027778
2,com.PregnancyCalendar_all,0.000000,0.000000,0.250000,0.500000,0.500000,0.250000,0.750000,0.500000,0.750000,...,0.037037,0.006173,0.021739,0.019048,0.018182,0.009091,0.060000,0.037037,0.008547,0.040000
3,com.PregnancyCalendar_all,0.125000,0.437500,0.562500,0.625000,0.687500,0.562500,0.625000,0.750000,0.687500,...,0.059829,0.017442,0.053571,0.017094,0.042017,0.042373,0.083333,0.033333,0.031746,0.064220
4,com.PregnancyCalendar_all,0.277778,0.555556,0.777778,0.833333,0.666667,0.722222,0.888889,0.833333,0.777778,...,0.032787,0.017241,0.051724,0.008333,0.016129,0.050420,0.046875,0.016129,0.023256,0.063063
5,com.PregnancyCalendar_all,0.250000,0.500000,0.750000,0.875000,0.625000,0.750000,0.750000,0.875000,0.750000,...,0.059829,0.005747,0.035088,0.017094,0.033333,0.088496,0.101695,0.033333,0.023622,0.054545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6,vsin.t16_funny_photo_all,0.647059,0.352941,0.647059,0.647059,0.588235,0.588235,0.588235,0.764706,0.647059,...,0.041667,0.011494,0.090909,0.025641,0.041667,0.033333,0.081967,0.033058,0.031496,0.158416
7,vsin.t16_funny_photo_all,0.631579,0.315789,0.631579,0.684211,0.578947,0.526316,0.789474,0.842105,0.631579,...,0.040984,0.028902,0.050847,0.042735,0.032520,0.041322,0.096774,0.058333,0.039062,0.166667
8,vsin.t16_funny_photo_all,0.736842,0.473684,0.631579,0.578947,0.526316,0.631579,0.684211,0.789474,0.789474,...,0.049587,0.034884,0.087719,0.042735,0.040984,0.016129,0.114754,0.049587,0.039062,0.190000
9,vsin.t16_funny_photo_all,0.555556,0.555556,0.666667,0.555556,0.444444,0.611111,0.722222,0.777778,0.666667,...,0.032787,0.035088,0.070175,0.043103,0.024390,0.024590,0.063492,0.041322,0.039370,0.145631


In [14]:
# Column 0 of the array is the filename (the true label); the remaining
# columns are the features. NOTE: this overwrites the X that held the
# training array.
X = dff_test.to_numpy()
X_test= X[:,1:]
Y_test = X[:, 0]
 

# Define Models

In [15]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsRegressor

def models(xtrain, xtest, ytrain, ytest):
    """Fit five classifiers on the training data and return them.

    ``xtest`` and ``ytest`` are accepted but not used.

    Returns, in this order: logistic regression, decision tree, random
    forest, one-vs-rest SVM and gradient boosting, each already fitted on
    ``(xtrain, ytrain)``.
    """
    # Estimators are built and fitted one after the other, in the same
    # order as the returned tuple.
    logisticRegression = LogisticRegression(random_state = 0).fit(xtrain, ytrain)
    decisionTree = tree.DecisionTreeClassifier().fit(xtrain, ytrain)
    randomForest = RandomForestClassifier(max_depth = 8, random_state = 42).fit(xtrain, ytrain)
    oneVsRestSvm = OneVsRestClassifier(SVC()).fit(xtrain, ytrain)
    gradientBoosting = GradientBoostingClassifier(random_state = 0).fit(xtrain, ytrain)

    return logisticRegression, decisionTree, randomForest, oneVsRestSvm, gradientBoosting


In [16]:
# Fit all five classifiers on the training features.
# NOTE: this rebinds the name `svm`, which the imports cell bound to the
# sklearn.svm module, to the fitted one-vs-rest SVM classifier.
lr, dt, rf, svm, gb = models(X_train, X_test, Y_train, Y_test)

# Compare Models

In [17]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
def _scoreModel(label, model):
    """Evaluate one fitted classifier on the test set.

    Predicts on ``X_test`` once and derives every metric from that single
    prediction. The previous version called ``predict`` four times per model.
    'score' is the accuracy, which is what a classifier's ``score`` reports.
    """
    predicted = model.predict(X_test)
    return {
        'model': label,
        'score': accuracy_score(Y_test, predicted),
        'recall': recall_score(Y_test, predicted, average='weighted'),
        'precision': precision_score(Y_test, predicted, average='weighted'),
        'f1': f1_score(Y_test, predicted, average='weighted'),
    }

df_scores = pd.DataFrame([
    _scoreModel(label, model)
    for label, model in [
        ('logistic regression', lr),
        ('decision tree', dt),
        ('random forest', rf),
        ('SVM', svm),
        ('Gradient boosting classifier', gb),
    ]
])


## Distance-function (k-TSS) baseline and its classification metrics

In [18]:
def ktssTrain(traindir,k_window):
    """Learn one k-TSS model per file found in ``traindir``.

    Parameters
    ----------
    traindir : directory whose files each hold one trace per line.
    k_window : window size used for preprocessing and learning.

    Returns
    -------
    tuple ``(ktssArr, filenameArr, predatasetLengths, datasetLengths)``
        ktssArr : dict mapping a file name to the model learned from it.
            Files whose traces are all removed by preprocessing have no entry.
        filenameArr : list of every file name seen (the part before ".txt").
        predatasetLengths : dict mapping a file name to its raw line count.
        datasetLengths : dict mapping a file name to a one-element list
            holding the trace count after preprocessing.
    """
    predatasetLengths = {}
    datasetLengths = {}
    filenameArr = []
    ktssArr = {}

    for file in os.listdir(traindir):
        filename = file.split(".txt")[0]
        filenameArr.append(filename)

        # Close each file as soon as it has been read.
        with open(traindir + '/' + file) as traceFile:
            predataset = [line.rstrip('\n') for line in traceFile]
        predatasetLengths[filename] = len(predataset)

        dataset = preprocess(traindir, predataset, filename, k_window, True)
        datasetLengths[filename] = [len(dataset)]

        if len(dataset) > 0:
            # The backslash is escaped explicitly: "\p" is not a valid escape
            # sequence. The resulting string is unchanged.
            labels, dict_ktss = learn_unions_ktss(dataset, 1, k_window, 0, 0, "output\\plot.pdf")

            # When several models are returned, the last one iterated wins.
            for ktss in dict_ktss:
                ktssArr[filename] = ktss

    return ktssArr,filenameArr,predatasetLengths,datasetLengths


def preprocess(PATH,dataset,filename,k_window,isTrain):
    """Clean a list of traces and deal with those shorter than ``k_window``.

    This is the same pipeline as ``process``: normalise spacing, then merge
    short traces into their successor (``isTrain`` true) or remove them,
    then normalise spacing again. It now delegates to ``process`` instead of
    repeating those steps.

    ``PATH`` and ``filename`` are not used; they are kept so existing callers
    keep working.
    """
    return process(dataset, k_window, isTrain)





def ktssTest(ktssArr,filenameArr,predatasetLengths,datasetLengths,testdir,k_window,outputdir):
    """Classify the test traces with the distance function and score the result.

    For every name in ``filenameArr`` that has a ``<name>.txt`` file in
    ``testdir``, the traces are preprocessed and classified by ``categorize``.
    Per-file recall, precision and F1 plus the overall accuracy are appended
    to ``<outputdir>/all_k_results.txt``.

    Parameters
    ----------
    ktssArr : dict mapping a file name to its learned model.
    filenameArr : list of training file names. It is no longer modified.
    predatasetLengths, datasetLengths : per-file counts from ``ktssTrain``.
    testdir : directory holding the test files.
    k_window : window size.
    outputdir : directory for the result and conflict files (created if
        missing).

    Returns
    -------
    tuple ``(df, avgAcc)``
        df : one row per tested file with columns
            appName, recall, pr, f1, acc, tr, newTr, tests.
        avgAcc : overall accuracy rounded to two decimals; 0.0 when there is
            no test trace at all.
    """
    os.makedirs(outputdir, exist_ok=True)

    # Files without a test counterpart are skipped. The previous version
    # removed them from filenameArr while iterating over it, which silently
    # skipped the entry following each removal.
    testArr = {}
    for filename in filenameArr:
        testPath = testdir + '/' + filename + '.txt'
        if not os.path.isfile(testPath):
            continue
        with open(testPath) as testFile:
            predatasetTest = [line.rstrip('\n') for line in testFile]
        testArr[filename] = preprocess(testdir, predatasetTest, filename, k_window, False)

    classes = list(testArr)
    y_test = []
    y_pred = []
    dists = []
    for filename, traces in testArr.items():
        y_test.extend([filename] * len(traces))
        y_pred.extend(categorize(ktssArr, traces, filename, k_window, outputdir, dists))

    # Pin the label order so that row/column x of the matrix is classes[x].
    # Without `labels`, confusion_matrix orders labels by sorted value, which
    # need not match the directory listing order. Predicted labels that are
    # not tested classes go last so they still count in the row sums.
    labels = classes + sorted(set(y_pred) - set(classes))
    if y_test:
        cm = confusion_matrix(y_test, y_pred, labels=labels)
        accuracy = accuracy_score(y_test, y_pred)
    else:
        cm = np.zeros((len(labels), len(labels)), dtype=int)
        accuracy = 0.0
    avgAcc = round(accuracy, 2)

    result = []
    reportLines = []
    for x, filename in enumerate(classes):
        predictedCount = cm[:, x].sum()
        actualCount = cm[x, :].sum()
        precision = float(cm[x, x] / predictedCount) if predictedCount else 0.0
        recall = float(cm[x, x] / actualCount) if actualCount else 0.0
        if (precision + recall) == 0:
            F1 = 0.0
        else:
            F1 = 2 * (precision * recall) / (precision + recall)

        avgPr = round(precision, 2)
        avgRecalls = round(recall, 2)
        avgF1s = round(F1, 2)

        reportLines.append(filename+", "+str(avgRecalls)+', '+str(avgPr)+', '+str(avgF1s)+', '+str(avgAcc)+', '+str(predatasetLengths[filename])+', '+str(datasetLengths[filename])+"\n")
        result.append([filename,avgRecalls,avgPr,avgF1s,avgAcc,predatasetLengths[filename],datasetLengths[filename][0],len(testArr[filename])])

    # Built once, after the loop, so it exists even when nothing was tested.
    df = pd.DataFrame(data=result,columns=['appName','recall','pr','f1','acc','tr','newTr','tests'])

    with open(outputdir+"/all_k_results.txt", 'a') as resultfile:
        resultfile.write(str(cm)+"\n")
        resultfile.writelines(reportLines)
        resultfile.write("-----------------------------------------------------------------------------------------------------\n")

    with open(outputdir+'/conflicts.txt', 'a') as conflictfile:
        conflictfile.write("-----------------------------------------------------------------------------------------------------\n")

    return df, avgAcc
def categorize(ktssArr,X_test,filenamet,k_window, outputdir,dists):
    """Assign each trace of ``X_test`` to the closest learned model.

    The distance between a trace and a model is
    ``100 * int(99 * a) + int(99 * b)``, where ``a`` is the fraction of the
    trace's k-segments absent from the model and ``b`` is the fraction of
    the model's k-segments absent from the trace. A fraction with an empty
    denominator counts as 0. Ties are broken at random and logged to
    ``<outputdir>/conflicts.txt``.

    Parameters
    ----------
    ktssArr : dict mapping a file name to a model exposing ``T``.
    X_test : list of traces that all come from file ``filenamet``.
    filenamet : true file name of the traces.
    k_window : window size.
    outputdir : directory of the conflict log.
    dists : list that receives one diagnostic dict per trace, with keys
        file, as, distance, distance_itself, interval and point. The last
        three are ``None`` when ``filenamet`` has no model in ``ktssArr``.

    Returns
    -------
    list with one predicted file name per trace.

    Raises
    ------
    ValueError
        If ``ktssArr`` is empty while there are traces to classify.
    """
    y_pred = []

    # The context manager closes the log even if classification fails.
    with open(outputdir+'/conflicts.txt', 'a') as outfile:
        for trace in X_test:
            _, _, _, T2, _ = calculateEIFTC([trace], k_window)

            distances = {}
            for filename in ktssArr:
                T1 = ktssArr[filename].T
                # Guard the divisions: an empty segment set contributes 0
                # instead of raising ZeroDivisionError.
                missingInModel = int(99 * len(T2-T1)/len(T2)) if len(T2) else 0
                missingInTrace = int(99 * len(T1-T2)/len(T1)) if len(T1) else 0
                distances[filename] = 100 * missingInModel + missingInTrace

            if not distances:
                raise ValueError("categorize: no trained model to compare against")

            minval = min(distances.values())
            categorizedFileList = [name for name, value in distances.items() if value == minval]
            if len(categorizedFileList) > 1:
                outfile.write(str(k_window)+" "+filenamet+" "+str(minval)+" Oh! Multiple categories Founded: "+str(categorizedFileList)+"\n")
                categorizedFile = categorizedFileList[random.randrange(0, len(categorizedFileList))]
            else:
                categorizedFile = categorizedFileList[0]
            y_pred.append(categorizedFile)

            # Rank of the true file among all models, closest first.
            ranked = sorted(distances, key=distances.get)
            if filenamet in distances:
                distanceItself = distances[filenamet]
                interval = distanceItself - minval
                point = ranked.index(filenamet)
            else:
                # The true file has no trained model, so there is nothing to
                # compare; the previous version raised IndexError here.
                distanceItself = interval = point = None
            dists.append({'file':filenamet ,'as': categorizedFile, 'distance': minval,'distance_itself':distanceItself,'interval':interval,'point':point})

    return y_pred

In [19]:
# Learn one k-TSS model per training file, then classify the test traces
# with the distance function. maindir doubles as the output directory for
# all_k_results.txt and conflicts.txt.
ktssArr, filenameArr, predatasetLengths, datasetLengths = ktssTrain(traindir, k_window)
df, avgAcc = ktssTest(ktssArr, filenameArr, predatasetLengths, datasetLengths, testdir, k_window, maindir)

In [20]:
# Summary row for the distance-function baseline: overall accuracy plus the
# unweighted mean of the per-file recall, precision and F1.
distance_fun={'model':'distance_function','score':avgAcc,'recall':df['recall'].mean(),'precision':df['pr'].mean(),'f1':df['f1'].mean()}
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Dropping any earlier 'distance_function' row first keeps the cell
# idempotent when it is re-run.
df_scores = pd.concat(
    [df_scores[df_scores['model'] != distance_fun['model']], pd.DataFrame([distance_fun])],
    ignore_index=True,
)

In [21]:
# Show the score table: five classifiers plus the distance-function row.
df_scores

Unnamed: 0,model,score,recall,precision,f1
0,logistic regression,0.956957,0.956957,0.961263,0.956757
1,decision tree,0.344344,0.344344,0.609747,0.389973
2,random forest,0.702703,0.702703,0.931331,0.759878
3,SVM,0.967968,0.967968,0.969986,0.96637
4,Gradient boosting classifier,0.535536,0.535536,0.881718,0.609729
5,distance_function,0.86,0.857,0.883,0.8555
