In [76]:
import numpy as np
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [77]:
# Load the input CSV into a DataFrame. The helper functions in this notebook
# read its 'ID_INPUT', 'VALUE' and 'Unnamed: 0' columns.
df = pd.read_csv('dataUser2.csv')

In [78]:
def get_clean_data(df, input_id=4, index_col='Unnamed: 0'):
    """Keep only the rows of one input type and renumber them from zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'ID_INPUT' column.
    input_id : scalar, default 4
        Value of 'ID_INPUT' whose rows are kept.
    index_col : str, default 'Unnamed: 0'
        Column to discard from the result. It is skipped silently when the
        frame does not contain it.

    Returns
    -------
    pandas.DataFrame
        The matching rows, without ``index_col``, with a fresh 0..k-1 index.
    """
    selected = df[df['ID_INPUT'] == input_id]
    # errors='ignore': a frame that lacks the column should still be usable
    # instead of failing with a KeyError.
    return selected.drop(columns=[index_col], errors='ignore').reset_index(drop=True)

In [79]:
def get_unique_states(X):
    """Return the distinct values of ``X`` as a sorted NumPy array."""
    distinct_states = np.unique(X)
    return distinct_states

In [80]:
def get_conditional_prob(df):
    """Relative frequency of each 'VALUE' among the rows with ID_INPUT == 4.

    Each value's count is divided by the total number of rows that have
    ID_INPUT == 4. The result is a pandas Series indexed by value.
    """
    subset = df[df['ID_INPUT'] == 4]
    counts = subset['VALUE'].value_counts()
    return counts / len(subset)

In [81]:
def get_all_pairs(df):
    """Return every (current, next) pair of consecutive 'VALUE' entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'VALUE' column; row order defines the sequence.

    Returns
    -------
    list of tuple
        ``[(v0, v1), (v1, v2), ...]``; empty when the frame has fewer than
        two rows.
    """
    # Pull the column out once: positional row lookups inside a loop build a
    # whole row Series for every access and are far slower.
    values = df['VALUE'].tolist()
    return list(zip(values, values[1:]))

In [82]:
def split_train_test(pairs, test_size=0.3, random_state=None):
    """Randomly split (current, next) pairs into training and test sets.

    Parameters
    ----------
    pairs : sequence of tuple
        Items whose element 0 is the current state and element 1 the next.
    test_size : float, default 0.3
        Fraction of pairs held out for testing.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. With ``None`` the shuffle,
        and therefore any metric computed downstream, differs between runs;
        pass an integer for reproducible results.

    Returns
    -------
    list
        ``[X_train, y_train, X_test, y_test]`` (note the order).
    """
    X = [pair[0] for pair in pairs]
    y = [pair[1] for pair in pairs]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return [X_train, y_train, X_test, y_test]

In [83]:
def get_pair_frequency(X, y):
    """Count how many times each (X[i], y[i]) combination occurs.

    Returns a ``defaultdict(int)`` keyed by the (current, next) tuple.
    """
    counts = defaultdict(int)
    for position, current in enumerate(X):
        counts[(current, y[position])] += 1
    return counts

In [84]:
def get_transitional_probability(pair_freq, X):
    """Convert pair counts into transition probabilities.

    Parameters
    ----------
    pair_freq : mapping
        Maps a (current, next) tuple to the number of times it was seen.
    X : sequence
        The "current" states the counts were taken from; the number of times
        a state appears here is the denominator for its outgoing pairs.

    Returns
    -------
    collections.defaultdict
        Maps each (current, next) tuple to ``count / occurrences(current)``.

    Raises
    ------
    ZeroDivisionError
        If a pair's current state never appears in ``X``.
    """
    # Tally every source state once instead of rescanning X for each pair.
    source_totals = defaultdict(int)
    for state in X:
        source_totals[state] += 1

    transitional_prob = defaultdict(int)
    for pair, count in pair_freq.items():
        transitional_prob[pair] = count / source_totals[pair[0]]
    return transitional_prob

In [85]:
def get_transitional_matrix(trans_prob, X):
    """Lay the transition probabilities out as a square DataFrame.

    Rows are labelled with the current state and columns with the next
    state, both taken from the unique values of ``X``. A (current, next)
    combination that is not a key of ``trans_prob`` gets 0.
    """
    states = get_unique_states(X)
    rows = [
        [
            trans_prob[(source, target)] if (source, target) in trans_prob else 0
            for target in states
        ]
        for source in states
    ]
    return pd.DataFrame(rows, index=states, columns=states)

In [86]:
def get_n_next_app(n, matrix, app, default_app='chrome.exe'):
    """Return the ``n`` most probable successors of ``app``.

    Parameters
    ----------
    n : int
        Number of candidates to return.
    matrix : pandas.DataFrame
        Transition matrix whose row labels are current states and whose
        column labels are next states.
    app : hashable
        The current state to look up.
    default_app : hashable, default 'chrome.exe'
        Fallback prediction used when ``app`` has no row in ``matrix``.

    Returns
    -------
    pandas.Index or list
        Column labels of the ``n`` largest entries in ``app``'s row (ties
        keep column order), or ``[default_app]`` when ``app`` is unknown.
    """
    if app not in matrix.index:
        return [default_app]
    # Read the single row directly rather than transposing the whole matrix
    # on every call.
    return matrix.loc[app].nlargest(n).index

In [87]:
def get_accuracy(X, y, matrix, n):
    """Fraction of samples whose true next value is among the top-``n`` guesses.

    For each position the candidates come from ``get_n_next_app`` applied to
    ``X[i]``; the sample counts as a hit when ``y[i]`` is one of them.
    """
    hit_count = 0
    for position, current_app in enumerate(X):
        candidates = get_n_next_app(n, matrix, current_app)
        hit_count += int(y[position] in candidates)
    return hit_count / len(X)
        

In [92]:
def predict_HMM(df, n):
    """Build a transition matrix from ``df`` and score top-``n`` prediction.

    The frame is cleaned, consecutive 'VALUE' entries are paired, the pairs
    are split into train and test sets, a transition matrix is estimated
    from the training pairs, and the hit rate is measured on the test pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame accepted by ``get_clean_data``.
    n : int
        Number of candidate successors considered a hit.

    Returns
    -------
    list
        ``[transitional_matrix, accuracy]``.
    """
    df = get_clean_data(df)
    all_pairs = get_all_pairs(df)

    X_tr, y_tr, X_test, y_test = split_train_test(all_pairs)
    pair_freq = get_pair_frequency(X_tr, y_tr)
    transitional_prob = get_transitional_probability(pair_freq, X_tr)
    transitional_matrix = get_transitional_matrix(transitional_prob, X_tr)

    accuracy = get_accuracy(X_test, y_test, transitional_matrix, n)
    # Return the matrix built in this call. The previous `matrix` name was
    # never assigned here, so it either raised NameError or picked up a stale
    # notebook-level variable.
    return [transitional_matrix, accuracy]

In [104]:
# Run the full pipeline on `df`; a test pair counts as correct when its true
# next value is among the 4 top-ranked candidates.
matrix, accuracy = predict_HMM(df, 4)

In [105]:
# Display the hit rate returned by predict_HMM. The train/test split is
# unseeded, so this number changes from run to run.
accuracy

0.7628865979381443

In [101]:
# Display the matrix returned by predict_HMM.
matrix

Unnamed: 0,Acrobat.exe,ApplicationFrameHost.exe,CredentialUIBroker.exe,DB Browser for SQLite.exe,Docker Desktop.exe,LockApp.exe,Messenger.exe,MoNotificationUx.exe,OneDrive.exe,PickerHost.exe,...,Zoom.exe,chrome.exe,cmd.exe,conhost.exe,devenv.exe,explorer.exe,msedge.exe,msiexec.exe,msteams.exe,python-3.11.1-amd64.exe
Acrobat.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ApplicationFrameHost.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,...,0.045455,0.454545,0.045455,0.0,0.045455,0.090909,0.090909,0.045455,0.0,0.0
CredentialUIBroker.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB Browser for SQLite.exe,0.0,0.0,0.0,0.04,0.0,0.0,0.02,0.0,0.0,0.0,...,0.02,0.22,0.0,0.0,0.0,0.7,0.0,0.0,0.0,0.0
Docker Desktop.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
LockApp.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Messenger.exe,0.0,0.029851,0.0,0.014925,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.567164,0.0,0.0,0.014925,0.149254,0.029851,0.0,0.0,0.0
MoNotificationUx.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
OneDrive.exe,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PickerHost.exe,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Write the matrix, including its row labels, as comma-separated text to 'output.txt'.
matrix.to_csv('output.txt', index=True)

In [74]:
# Write the matrix, including its row labels, to 'output.csv'.
matrix.to_csv('output.csv', index=True)
    