# Setup

In [1]:
import os
import json
import random
import sqlite3

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from collections import defaultdict

import matplotlib as plt

# Data Loading

In [3]:
# Load the CSV for user 1; later cells read its ID_INPUT and VALUE columns.
df1 = pd.read_csv('dataUser1.csv')

In [17]:
# Load the CSV for user 2; it is processed with the same functions as user 1's data.
df2 = pd.read_csv('dataUser2.csv') 

# Data Preprocessing for HMM's Transition Matrix

In [4]:
def get_clean_data(df):
    """Keep only the rows that record an executable name (ID_INPUT == 4).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least an 'ID_INPUT' column.

    Returns
    -------
    pandas.DataFrame
        The matching rows with a fresh 0..n-1 index and without the
        'Unnamed: 0' column, if that column was present.
    """
    exe_rows = df[df['ID_INPUT'] == 4]
    # 'Unnamed: 0' only exists when the CSV was written with its index;
    # errors='ignore' keeps the function usable on files saved without it.
    return exe_rows.drop(columns=['Unnamed: 0'], errors='ignore').reset_index(drop=True)

def get_all_pairs(df):
    """Build (current, next) pairs from consecutive rows of the 'VALUE' column.

    The first pair is ('S0', first_value), where 'S0' is a start-of-sequence
    delimiter. An empty frame yields an empty list instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'VALUE' column holds the executables in order.

    Returns
    -------
    list of tuple
        One pair per row: [('S0', v0), (v0, v1), ..., (v[n-2], v[n-1])].
    """
    values = df['VALUE'].tolist()
    if not values:
        return []
    # Shift the sequence by one so each value is paired with its predecessor;
    # this replaces a per-row df.iloc lookup with a single pass.
    previous = ['S0'] + values[:-1]
    return list(zip(previous, values))

def split_train_test(pairs, state):
    """Split the (current, next) pairs into train and test sets (80/20).

    Parameters
    ----------
    pairs : sequence of tuple
        Each element is (current exe, next exe).
    state : int
        Passed to train_test_split as random_state.

    Returns
    -------
    list
        [X_train, y_train, X_test, y_test]
    """
    current_exes = [pair[0] for pair in pairs]  # the "current" exe of each pair
    next_exes = [pair[1] for pair in pairs]     # the exe that follows it
    parts = train_test_split(current_exes, next_exes, test_size=0.2, random_state=state)
    # train_test_split returns (X_train, X_test, y_train, y_test); reorder for callers.
    return [parts[0], parts[2], parts[1], parts[3]]

# Transition Matrix
- Firstly, we find the frequencies of all pairs of exe files via the function *get_pair_frequency*
- Using such information, we can find the transition probability

In [5]:
def get_pair_frequency(X, y):
    """Count how often each (X[i], y[i]) pair occurs.

    Returns a defaultdict(int) keyed by the (current, next) tuple.
    """
    counts = defaultdict(int)
    for position in range(len(X)):
        counts[(X[position], y[position])] += 1
    return counts

def get_transition_probability(pair_freq, X):
    """Get the transition probability, for ex: from chrome.exe --> cmd.exe,
       P(cmd.exe | chrome.exe) = P(chrome.exe, cmd.exe) / P(chrome.exe)
                               = (# chrome.exe and cmd.exe) / (# all occurrences of chrome.exe)
                               = (pair occurrences) / (# all occurrences of chrome.exe)

    Parameters
    ----------
    pair_freq : mapping
        (current, next) tuple -> number of occurrences.
    X : iterable
        The "current" executables the pairs were counted from.

    Returns
    -------
    defaultdict
        (current, next) tuple -> conditional probability of `next` given `current`.

    Raises
    ------
    ZeroDivisionError
        If a pair's first element never occurs in X.
    """
    # Count each source executable once, instead of rescanning X for every pair.
    source_counts = defaultdict(int)
    for exe in X:
        source_counts[exe] += 1

    transition_prob = defaultdict(int)
    for pair, freq in pair_freq.items():
        transition_prob[pair] = freq / source_counts.get(pair[0], 0)
    return transition_prob

- Now, we create the transition matrix

In [6]:
def get_unique_states(X):
    """Return the distinct executables in X (the "states"), as given by np.unique."""
    unique_states = np.unique(X)
    return unique_states
    
def get_transition_matrix(trans_prob, X):
    """Arrange the transition probabilities as a square DataFrame.

    Rows are the "current" executable and columns the "next" one; both axes
    use the unique states of X. Pairs missing from trans_prob get 0.
    """
    states = get_unique_states(X)
    rows = [
        [trans_prob[(src, dst)] if (src, dst) in trans_prob else 0 for dst in states]
        for src in states
    ]
    return pd.DataFrame(rows, index=states, columns=states)

# HMM Model + Accuracy

In [7]:
def get_n_next_app(n, matrix, app, default_app='chrome.exe'):
    """Return the n most probable next apps given that the previous app is `app`.

    Parameters
    ----------
    n : int
        Number of candidates to return.
    matrix : pandas.DataFrame
        Transition matrix; rows are the current app, columns the next app.
    app : str
        The current (immediately previous) app.
    default_app : str, optional
        Fallback prediction used when `app` has no row in `matrix`.

    Returns
    -------
    pandas.Index or list
        Labels of the n largest entries in `app`'s row (ties keep the first
        occurrence), or [default_app] when `app` is not in the matrix.
    """
    if app not in matrix.index:
        return [default_app]  # unseen app: fall back to the default prediction
    # Select the row directly rather than transposing the whole matrix per call.
    return matrix.loc[app].nlargest(n).index

def get_accuracy(X, y, matrix, n):
    """Top-n hit rate: the fraction of positions i where y[i] is among the
    n apps predicted from X[i]."""
    hits = 0
    total = 0
    for i in range(len(X)):
        candidates = get_n_next_app(n, matrix, X[i])
        if y[i] in candidates:
            hits += 1
        total += 1
    return hits / total

def predict_HMM(df, n, rand_state):
    """Run the full pipeline: clean, pair, split, fit the transition matrix, score.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data for one user.
    n : int
        Number of candidate next apps considered a hit in the accuracy.
    rand_state : int
        Seed for the train/test split.

    Returns
    -------
    list
        [transition_matrix, accuracy]
    """
    exe_rows = get_clean_data(df)
    pairs = get_all_pairs(exe_rows)
    X_train, y_train, X_test, y_test = split_train_test(pairs, rand_state)

    # Fit on the training pairs only.
    frequencies = get_pair_frequency(X_train, y_train)
    probabilities = get_transition_probability(frequencies, X_train)
    transition_matrix = get_transition_matrix(probabilities, X_train)

    # Score on the held-out pairs.
    accuracy = get_accuracy(X_test, y_test, transition_matrix, n)
    return [transition_matrix, accuracy]

def save_to_file(matrix, outfile_name, outfile_ext = 'csv'):
    """Write a matrix to '<outfile_name>.<outfile_ext>' in CSV format, index included.

    Parameters
    ----------
    matrix : pandas.DataFrame
        The matrix to save.
    outfile_name : str
        Output path without the extension; may include directories.
    outfile_ext : str, optional
        File extension (the content is CSV regardless of the extension).
    """
    out = outfile_name + '.' + outfile_ext
    # to_csv does not create missing directories, so make sure the target
    # folder exists; otherwise a fresh checkout fails on the first save.
    parent = os.path.dirname(out)
    if parent:
        os.makedirs(parent, exist_ok=True)
    matrix.to_csv(out, index=True)
    

# Sample Runs of Transition Matrix and Accuracy

### For user 1
- Note: 
    - The below accuracies were obtained by running this notebook on Jan 28, 2023. 
    - Data from user 1 were collected up to Jan 27, 2023. 
    - If you rerun the code on a different or more recent dataset, the results will vary.

In [59]:
# Top-1 accuracy for user 1 (split seed 20); the transition matrix is also saved to disk.
matrix1, accuracy1 = predict_HMM(df1, n=1, rand_state=20)
save_to_file(matrix1, "outputs/HMM/transition_mt_user1_top1app", outfile_ext = 'txt')
print(accuracy1)

0.4921190893169877


In [9]:
# Top-2 accuracy for user 1 (same split seed, so the train/test split is the same).
matrix1, accuracy1 = predict_HMM(df1, n=2, rand_state=20)
print(accuracy1)

0.6742556917688266


In [10]:
# Top-5 accuracy for user 1.
matrix1, accuracy1 = predict_HMM(df1, n=5, rand_state=20)
print(accuracy1)

0.8774080560420315


In [11]:
# Top-10 accuracy for user 1.
matrix1, accuracy1 = predict_HMM(df1, n=10, rand_state=20)
print(accuracy1)

0.9632224168126094


In [60]:
# Top-15 accuracy for user 1; the transition matrix is also saved to disk.
matrix1, accuracy1 = predict_HMM(df1, n=15, rand_state=20)
save_to_file(matrix1, "outputs/HMM/transition_mt_user1_top15apps", outfile_ext='txt')
print(accuracy1)

0.9807355516637478


### For user 2
- Note:
    - The below accuracies were obtained by running this notebook on Jan 28, 2023. 
    - Data from user 2 were collected up to Jan 27, 2023. 
    - If you rerun the code on a different or more recent dataset, the results will vary.

In [77]:
# Top-1 accuracy for user 2 (split seed 18); the transition matrix is also saved to disk.
matrix2, accuracy2 = predict_HMM(df2, n=1, rand_state=18)
save_to_file(matrix2, "outputs/HMM/transition_mt_user2_top1app", outfile_ext = 'txt')
print(accuracy2)

0.36423841059602646


In [72]:
# Top-2 accuracy for user 2.
matrix2, accuracy2 = predict_HMM(df2, n=2, rand_state=18)
print(accuracy2)

0.5761589403973509


In [74]:
# Top-5 accuracy for user 2.
matrix2, accuracy2 = predict_HMM(df2, n=5, rand_state=18)
print(accuracy2)

0.804635761589404


In [75]:
# Top-10 accuracy for user 2.
matrix2, accuracy2 = predict_HMM(df2, n=10, rand_state=18)
print(accuracy2)

0.9238410596026491


In [78]:
# Top-15 accuracy for user 2; the transition matrix is also saved to disk.
matrix2, accuracy2 = predict_HMM(df2, n=15, rand_state=18)
save_to_file(matrix2, "outputs/HMM/transition_mt_user2_top15apps", outfile_ext='txt')
print(accuracy2)

0.9536423841059603


# Extra Credit: Emission Matrix

# Data Preprocessing for Emission Matrix

In [34]:
def get_clean_data_for_tabs(df):
    """Shorten Google Chrome window titles and attach the executable column.

    Rows with ID_INPUT == 3 (window/tab titles) are kept; the VALUEs of rows
    with ID_INPUT == 4 (executables) are attached as a new 'exes' column,
    matched by position. Titles equal to "Missing String." or null become
    "File Explorer".

    Titles containing "Google Chrome" / "google chrome" are split on '-' and
    only the trailing segments are kept, then stripped:
      * 1 segment      -> unchanged
      * 2 to 4 segments -> last 2 (e.g. 'Process and EDA - Jupyter Notebook -
        Google Chrome' -> 'Jupyter Notebook - Google Chrome')
      * 5+ segments    -> last 3
    This drops the leading, page-specific part of the title (presumably where
    PII would appear). Non-Chrome titles are left untouched.

    NOTE(review): 'exes' is aligned by row position, so this assumes the
    ID_INPUT == 3 and ID_INPUT == 4 rows come in matching order and number.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with 'ID_INPUT' and 'VALUE' columns.

    Returns
    -------
    pandas.DataFrame
        The ID_INPUT == 3 rows (index reset, old index kept as a column) with
        the cleaned 'VALUE' and the added 'exes' column.
    """
    exes = df[df['ID_INPUT'] == 4]['VALUE'].reset_index()['VALUE']
    df = df[df['ID_INPUT'] == 3].reset_index()
    titles = df['VALUE'].apply(
        lambda r: "File Explorer" if (r == "Missing String." or pd.isnull(r)) else r
    ).tolist()

    def _shorten_chrome_title(title):
        """Keep only the trailing '-'-separated segments of a Chrome title."""
        parts = title.split('-')
        if len(parts) == 1:
            return title.strip()
        keep = 2 if len(parts) <= 4 else 3
        return "-".join(parts[-keep:]).strip()

    # Single pass over the titles; only Chrome titles are rewritten.
    cleaned = [
        _shorten_chrome_title(title)
        if ("Google Chrome" in title) or ("google chrome" in title)
        else title
        for title in titles
    ]

    return df.assign(VALUE = cleaned, exes = exes)

In [20]:
def get_clean_data_for_emission(df, date = "2023-01-19", activate_date = False):
    """Extract the aligned executable and app series used for the emission matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'MEASUREMENT_TIME', 'exes' and 'VALUE' columns.
    date : str, optional
        Day to keep, compared against the first 10 characters of
        MEASUREMENT_TIME (e.g. "2023-01-19"). Only used when activate_date
        is True.
    activate_date : bool, optional
        When True, keep only the rows of `date`; when False, use every row.

    Returns
    -------
    tuple of pandas.Series
        (executables, apps), each re-indexed from 0.
    """
    # The first 10 characters of the timestamp string are taken as the date.
    df = df.assign(date = df["MEASUREMENT_TIME"].astype(str).str[:10])

    # Use the caller's arguments; they were previously replaced by hard-coded
    # defaults, so the date filter could never be switched on.
    selected = df[df["date"] == date] if activate_date else df

    executables = selected['exes'].reset_index(drop=True)
    apps = selected['VALUE'].reset_index(drop=True)
    return (executables, apps)

In [38]:
# For user 1: clean the tab titles, then extract the aligned executable/app series.
# With activate_date=False no date filter is applied, so all rows are used.
df1_processed = get_clean_data_for_tabs(df1)
executables1, apps1 = get_clean_data_for_emission(df1_processed, date = "2023-01-19", activate_date = False)

# For user 2: same preprocessing as for user 1.
df2_processed = get_clean_data_for_tabs(df2)
executables2, apps2 = get_clean_data_for_emission(df2_processed, date = "2023-01-19", activate_date = False)

# Emission Matrix

For example:
- Emission_probability = { “chrome.exe”: {“google doc”: P(“google doc” | “chrome.exe”), “google drive”: P(“google drive” | “chrome.exe”), …} }
- where P(“google doc” | “chrome.exe”) = P(“google doc”, “chrome.exe”) / P(“chrome.exe”)

In [46]:
def find_exe_prob(executables, exe_name):
    """Marginal probability of an executable.

    Ex: P(chrome.exe) = (# entries equal to chrome.exe) / (# all entries)
    """
    is_match = executables == exe_name
    total = len(executables)
    return sum(is_match) / total

def find_joint_prob(executables, apps, from_exe, to_app):
    """Joint probability of an executable being followed by an app.

    Ex: P(A, B) = (# positions i where executables[i] == A and apps[i + 1] == B)
                  / (# all entries)

    NOTE(review): the app is matched one position AFTER the executable
    (index + 1), not at the same position — confirm this offset reflects the
    order in which the two signals are recorded.
    """
    exe_positions = np.where(executables == from_exe)[0]
    app_positions = np.where(apps == to_app)[0]
    # Positions where `to_app` appears right after `from_exe`.
    following = np.intersect1d(exe_positions + 1, app_positions)
    return following.size / len(executables)

def find_emission_prob(executables, apps, from_exe, to_app):
    """Emission probability of an app given an executable.

    P(to_app | from_exe) = P(from_exe, to_app) / P(from_exe)
    """
    joint = find_joint_prob(executables, apps, from_exe, to_app)
    marginal = find_exe_prob(executables, from_exe)
    return joint / marginal

def emission_dict(executables, apps):
    """Build the nested dict {exe: {app: P(app | exe)}} over every unique
    executable and every unique app."""
    exe_names = executables.unique()
    app_names = apps.unique()
    return {
        exe: {
            app: find_emission_prob(executables, apps, exe, app)
            for app in app_names
        }
        for exe in exe_names
    }

def emission_mt(executables, apps):
    """Compute the emission probabilities as both a dict and a matrix.

    Returns (emission_prob, emission_matrix): the nested dict from
    emission_dict and a DataFrame with one row per executable and one column
    per app.
    """
    probs = emission_dict(executables, apps)
    # from_dict puts the executables in the columns; transpose to get them as rows.
    by_exe = pd.DataFrame.from_dict(probs).T
    return (probs, by_exe)


# Sample Runs of Emission Matrix

### For user 1
- Note:
    - The below emission matrix was obtained by running this notebook on Jan 28, 2023. 
    - Data from user 1 were collected up to Jan 27, 2023. 
    - If you rerun the code on a different or more recent dataset, the results will vary.

In [79]:
# Emission probabilities for user 1, as a dict and as a matrix; the matrix is saved to disk.
emission_prob1, emission_matrix1 = emission_mt(executables1, apps1)
save_to_file(emission_matrix1, "outputs/HMM/emission_mt_user1", outfile_ext = 'txt')

In [50]:
# Display user 1's emission matrix (rows: executables, columns: window/tab titles).
emission_matrix1

Unnamed: 0,esrv.exe,Downloads,DSC180A_HW_Week4 - Microsoft Visual Studio,Movies & TV,File Explorer,2023 (CCG DCA UCSD-HDSI Capstone) | Microsoft Teams - Google Chrome,Volume Control,Search,Task Scheduler,Create Task,...,patch-1 - Google Chrome,How to Fix Git Error: You need to resolve your current index first - Google Chrome,● HMM.ipynb - System-Usage-Analysis - Visual Studio Code,GitHub,GitHub Docs - Google Chrome,CSE 110 Quiz 2 Flashcards | Quizlet - Google Chrome,Sticky Notes,Usage-Analysis · GitHub - Google Chrome,HMM - Copy.ipynb - Dsc180b - Visual Studio Code,Oh no! An error occurred! Please restart the sign in process from the editor. Forbidden(Parrot Os) · Issue #534 · VSCodium/vscodium - Google Chrome
VsDebugConsole.exe,0.164634,0.012195,0.036585,0.012195,0.182927,0.0,0.006098,0.0,0.012195,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
explorer.exe,0.018135,0.0,0.023316,0.020725,0.049223,0.002591,0.067358,0.012953,0.002591,0.0,...,0.0,0.0,0.0,0.0,0.0,0.005181,0.002591,0.0,0.0,0.0
devenv.exe,0.05036,0.0,0.05036,0.007194,0.100719,0.0,0.0,0.007194,0.021583,0.007194,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ApplicationFrameHost.exe,0.0,0.071429,0.02381,0.0,0.095238,0.02381,0.02381,0.095238,0.071429,0.166667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrome.exe,0.088207,0.013423,0.024928,0.000959,0.212848,0.0,0.06232,0.016299,0.000959,0.0,...,0.0,0.0,0.005753,0.0,0.0,0.0,0.0,0.0,0.002876,0.0
ShellExperienceHost.exe,0.025862,0.0,0.0,0.008621,0.025862,0.0,0.0,0.008621,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SearchApp.exe,0.0,0.0,0.0,0.058824,0.411765,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,0.0
Unable To Open Process,0.166667,0.0,0.083333,0.5,0.083333,0.0,0.0,0.041667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SnippingTool.exe,0.0,0.0,0.013514,0.006757,0.121622,0.006757,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DB Browser for SQLite.exe,0.009091,0.0,0.018182,0.0,0.054545,0.0,0.009091,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### For user 2
- Note:
    - The below emission matrix was obtained by running this notebook on Jan 28, 2023. 
    - Data from user 2 were collected up to Jan 27, 2023. 
    - If you rerun the code on a different or more recent dataset, the results will vary.

In [80]:
# Emission probabilities for user 2, as a dict and as a matrix; the matrix is saved to disk.
emission_prob2, emission_matrix2 = emission_mt(executables2, apps2)
save_to_file(emission_matrix2, "outputs/HMM/emission_mt_user2", outfile_ext = 'txt')

In [48]:
# Display user 2's emission matrix (rows: executables, columns: window/tab titles).
emission_matrix2

Unnamed: 0,esrv.exe,Foreground - Microsoft Visual Studio,Google Docs - Google Chrome,Messenger,sdk,Public -- 2022-2023 (CCG DCA UCSD-HDSI Capstone) | Microsoft Teams,Search,Administrator: Command Prompt,pip documentation v22.3.1 - Google Chrome,File Explorer,...,output - Notepad,output - Excel,Output Dataframe as txt - Google Chrome,cid=eKhV6MoJF3PMndK3BU4OsCb%2FryKtS0Lh9%2FEKwvGnZLI%3D&code_challenge=VKRS5vPD5b%2BCCJE%2B8f0Kcrufu55zd%2BIJ96BhVeC%2F1PU%3D - Google Chrome,cid=eKhV6MoJF3PMndK3BU4OsCb%2FryKtS0Lh9%2FEKwvGnZLI%3D&code_challenge=jjg8PtwtSyshg1MIfoK7pu8BIhwtDnPia1E6pVB0qQU%3D - Google Chrome,branch-thy - Google Chrome,Analysis - Google Chrome,Untitled - Notepad,Labrinth - Where The Wild Things,Labrinth - Miracle
VsDebugConsole.exe,0.220339,0.254237,0.025424,0.016949,0.008475,0.016949,0.042373,0.008475,0.0,0.084746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
devenv.exe,0.132353,0.0,0.0,0.014706,0.0,0.029412,0.014706,0.0,0.0,0.220588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chrome.exe,0.0575,0.0425,0.0,0.155,0.0,0.0,0.0375,0.045,0.0,0.1525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Messenger.exe,0.017391,0.008696,0.008696,0.0,0.0,0.0,0.0,0.008696,0.0,0.156522,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
explorer.exe,0.134831,0.011236,0.0,0.082397,0.0,0.003745,0.029963,0.003745,0.0,0.011236,...,0.007491,0.011236,0.003745,0.0,0.0,0.0,0.0,0.003745,0.003745,0.0
Teams.exe,0.075758,0.166667,0.015152,0.166667,0.0,0.0,0.0,0.0,0.0,0.181818,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
SearchHost.exe,0.060606,0.030303,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.212121,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Unable To Open Process,0.021739,0.0,0.0,0.0,0.0,0.0,0.021739,0.086957,0.043478,0.021739,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
python-3.11.1-amd64.exe,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ShellExperienceHost.exe,0.0,0.0,0.0,0.461538,0.0,0.0,0.0,0.0,0.0,0.153846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
