In [1]:
import numpy as np
from hmmlearn import hmm
import matplotlib.pyplot as plt
import pandas as pd
import json
import sklearn
from tqdm import tqdm
import pickle
from pyhhmm.gaussian import GaussianHMM
from pyhhmm.multinomial import MultinomialHMM
from pyhhmm.heterogeneous import HeterogeneousHMM
import pyhhmm.utils as hu
import logging
logging.basicConfig(level=logging.INFO)


# Data Load

In [5]:
logs_path = r'../data/logs_7-13_hussein.csv'
restricted_logs_path = r'../data/restricted_7-13_hussein.csv'
extented_model_path = r'../data/generated_data/extended_states_7-13.pkl'

In [6]:
df_logs = pd.read_csv (logs_path)
# convert json into dict
df_logs.PropertiesJson = [json.loads(df_logs.PropertiesJson[i]) for i in range(len(df_logs.PropertiesJson))]
df_logs.MeasurementsJson = [json.loads(df_logs.MeasurementsJson[i]) for i in range(len(df_logs.MeasurementsJson))]
# mark new sessions based on timestamp difference of more than 30 minutes
timestamps = df_logs.TimeGenerated.to_numpy()
timestamps_datetime = [pd.to_datetime(timestamps[i]) for i in range(len(timestamps))]
#  difference between timestamps
timestamps_diff = [timestamps_datetime[i+1] - timestamps_datetime[i] for i in range(len(timestamps_datetime)-1)]
#  convert to minutes
timestamps_diff_min = [timestamps_diff[i].total_seconds()/60 for i in range(len(timestamps_diff))]
# check if consecutive differences are bigger than 30 minutes 
timestamps_diff_min_consecutive = [1] + [1 if timestamps_diff_min[i-1] > 30 else 0 for i in range(1,len(timestamps_diff_min)+1)] 
# create array where if timestamps_diff_min_consecutive is 1 you add 1 to the previous value
new_session_counter = [0] * len(timestamps_diff_min_consecutive)
for i in range(1, len(timestamps_diff_min_consecutive)):
    new_session_counter[i] = new_session_counter[i-1] + timestamps_diff_min_consecutive[i]


df_logs['newSession'] = timestamps_diff_min_consecutive
df_logs['SessionID'] = new_session_counter
df_logs.head()


Unnamed: 0,TimeGenerated,TrackingId,CompletionId,Name,PropertiesJson,MeasurementsJson,SessionId,UserId,newSession,SessionID
0,2022-06-06T21:28:25.007Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYpSBgkSgGU5U1KS8R14KDrKUpG,copilot/ghostText.shown,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'compCharLen': 218.0, 'confidence': 0.6814250...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...,1,0
1,2022-06-06T21:28:28.391Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYpSBgkSgGU5U1KS8R14KDrKUpG,copilot/ghostText.accepted,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'compCharLen': 227.0, 'confidence': 0.6814250...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...,0,0
2,2022-06-06T21:28:32.617Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYxVR6f21oGQK3JlcMxmAsxvZ9m,copilot/ghostText.shown,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'compCharLen': 59.0, 'confidence': 0.82290775...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...,0,0
3,2022-06-06T21:28:34.047Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYxVR6f21oGQK3JlcMxmAsxvZ9m,copilot/ghostText.rejected,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'compCharLen': 59.0, 'confidence': 0.82290775...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...,0,0
4,2022-06-06T21:28:34.05Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYzV048YVHP9214bqg9lgOUn4uw,copilot/ghostText.shown,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'compCharLen': 42.0, 'confidence': 0.87763204...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...,0,0


In [7]:
df_logs_restr = pd.read_csv(restricted_logs_path)
df_logs_restr.PropertiesJson = [json.loads(df_logs_restr.PropertiesJson[i]) for i in range(len(df_logs_restr.PropertiesJson))]
df_logs_restr.MeasurementsJson = [json.loads(df_logs_restr.MeasurementsJson[i]) for i in range(len(df_logs_restr.MeasurementsJson))]
df_logs_restr.head()

Unnamed: 0,TimeGenerated,TrackingId,CompletionId,Name,PropertiesJson,MeasurementsJson,SessionId,UserId
0,2022-06-06T21:28:24.278Z,d41177694f19740b031bca875fe75c35,,copilot/engine.prompt,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'promptCharLen': 52.0, 'timeSinceIssuedMs': 2...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...
1,2022-06-06T21:28:25Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYpSBgkSgGU5U1KS8R14KDrKUpG,copilot/engine.completion,{'VSCode.ABExp.Features': 'livesharecontinuous...,{'timeSinceIssuedMs': 2.0},f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...
2,2022-06-06T21:28:25.003Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYpSBgkSgGU5U1KS8R14KDrKUpG,copilot/engine.completion,{'VSCode.ABExp.Features': 'livesharecontinuous...,{'timeSinceIssuedMs': 1.0},f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...
3,2022-06-06T21:28:25.004Z,d41177694f19740b031bca875fe75c35,cmpl-5GEYpSBgkSgGU5U1KS8R14KDrKUpG,copilot/engine.completion,{'VSCode.ABExp.Features': 'livesharecontinuous...,{'timeSinceIssuedMs': 0.0},f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...
4,2022-06-06T21:28:28.785Z,d41177694f19740b031bca875fe75c35,,copilot/engine.prompt,{'VSCode.ABExp.Features': 'livesharecontinuous...,"{'promptCharLen': 270.0, 'timeSinceIssuedMs': ...",f89b2779-df37-46e7-a71e-6c28968064ea1654550891557,078a6302949265084294652f07e8300f77b3dfa6784901...


#  Observable State  Model

![dag {
bb="0,0,1,1"
"Browse Suggestions" [pos="0.840,0.616"]
"Editing Suggestion" [latent,pos="0.177,0.099"]
"Ghost Text Appears" [pos="0.656,0.176"]
"User Accepts Ghost Text" [outcome,pos="0.491,0.684"]
"User Before Action" [pos="0.644,0.393"]
"User Rejects Ghost Text" [outcome,pos="0.669,0.705"]
"User Typing/Paused" [pos="0.183,0.501"]
"Writing New Code" [latent,pos="0.326,0.194"]
Paused [pos="0.289,0.323"]
Typing [pos="0.131,0.309"]
"Browse Suggestions" -> "Ghost Text Appears"
"Editing Suggestion" -> "Ghost Text Appears"
"Ghost Text Appears" -> "User Before Action"
"User Accepts Ghost Text" -> "User Typing/Paused"
"User Before Action" -> "Browse Suggestions"
"User Before Action" -> "User Accepts Ghost Text"
"User Before Action" -> "User Rejects Ghost Text"
"User Rejects Ghost Text" -> "User Typing/Paused"
"User Typing/Paused" -> Paused
"User Typing/Paused" -> Typing
"Writing New Code" -> "Ghost Text Appears"
Paused -> "Ghost Text Appears"
Typing -> "Editing Suggestion"
Typing -> "Writing New Code"
}
](images/state_diagram_1.png)  


In [8]:
# create observable state diagram
column_names = ["TimeGenerated", "CompletionId", "UserId", "SessionId","StateName", "HiddenState","TimeSpentInState","CurrentSuggestion", "CurrentPrompt", "Measurements"]
unique_session_ids = np.unique(df_logs.SessionID) # or SessionId  
df_observations_states = []
import copy
for id_unique in tqdm(unique_session_ids):
    # split on SessionID
    df_task = df_logs[df_logs.SessionID == id_unique]
    df_names = df_task.Name.to_numpy()
    df_states = pd.DataFrame(columns= column_names)
    in_Shown = False
    last_completion_id = 0 
    # iterate through data frame
    last_timestamp = -1
    for index, row in df_task.iterrows():
        # ignore stillInCodes for now
        name = row['Name']
        completion_id = row['CompletionId']
        if name == 'copilot/ghostText.stillInCode':
            continue
        choiceIndex = row.PropertiesJson['choiceIndex']
        df_logs_restr_completion = df_logs_restr[df_logs_restr.CompletionId == completion_id]
        completion = ""
        prompt = ""
        # iterate through restricted data frame to get prompt and completion
        for index_restr, row_restr in df_logs_restr_completion.iterrows():
            if 'completionTextJson' in row_restr.PropertiesJson and 'choiceIndex' in row_restr.PropertiesJson:
                if row_restr.PropertiesJson['choiceIndex'] == choiceIndex:
                    completion = row_restr.PropertiesJson['completionTextJson']
            if 'hypotheticalPromptJson' in row_restr.PropertiesJson:
                prompt = row_restr.PropertiesJson['hypotheticalPromptJson']

        # find prompt in completion 
        index_restr = df_logs_restr_completion.index[0]
        df_prompt_compl = df_logs_restr.loc[index_restr-1]
        if 'promptJson' in df_prompt_compl.PropertiesJson:
            prompt = df_prompt_compl.PropertiesJson['promptJson']

        user_id = row['UserId']
        session_id = row['SessionId']


        if last_timestamp != -1:
            # difference
            time_difference =  pd.to_datetime(row['TimeGenerated']) - pd.to_datetime(last_timestamp) 
            time_difference_sec = time_difference.total_seconds()
        else:
            last_timestamp = row['TimeGenerated']
            time_difference_sec = 0 
        

        state_dict = {  'TimeGenerated': row['TimeGenerated'],
                        'CompletionId': completion_id,
                        'UserId': user_id,
                        'SessionId': session_id,
                        'StateName': "TBD",
                        'HiddenState': "TBD",
                        'TimeSpentInState': time_difference_sec,
                        'CurrentSuggestion': completion,
                        'CurrentPrompt': prompt,
                        'Measurements': [row['MeasurementsJson']]} # might add more properties later
        
        # just keep track if in shown state
        if name != 'copilot/ghostText.shown' and name != 'copilot/ghostText.shownFromCache':
            in_Shown = False
            last_completion_id = completion_id

        # all states tracking
        if name == 'copilot/ghostText.shown' or name == 'copilot/ghostText.shownFromCache':
            # if previous state was also shown
            if in_Shown:
                # if we are in shown and the completion id is the same as the last one, user is browsing suggestions
                if last_completion_id == completion_id:
                    new_state = copy.deepcopy(state_dict)
                    new_state['StateName'] = 'Browsing'
                    new_state['HiddenState'] = 'UserBeforeAction'

                    df_states = pd.concat([df_states, pd.DataFrame(new_state)])

                # if we were previously in shown, and now we have a new shown, then we previously rejected the shown
                # typed, and then got another suggestion
                else:
                    new_state = copy.deepcopy(state_dict)
                    new_state['StateName'] = 'Shown' # hidden rejected
                    new_state['HiddenState'] = 'UserTypingOrPaused'

                    df_states = pd.concat([df_states, pd.DataFrame(new_state)])
            else:
                # if previous was not shown, then user was typing/paused
                new_state = copy.deepcopy(state_dict)
                new_state['StateName'] = 'Shown'
                new_state['HiddenState'] = 'UserTypingOrPaused'
                
                df_states = pd.concat([df_states, pd.DataFrame(new_state)])          
            last_completion_id = completion_id
            in_Shown = True

        elif name == 'copilot/ghostText.accepted':
            # before accepting, user was thinking
            new_state = copy.deepcopy(state_dict)
            new_state['StateName'] = 'Accepted'
            new_state['HiddenState'] = 'UserBeforeAction'
            df_states = pd.concat([df_states, pd.DataFrame(new_state)])
        elif name == 'copilot/ghostText.rejected':
            new_state = copy.deepcopy(state_dict)
            new_state['StateName'] = 'Rejected'
            new_state['HiddenState'] = 'UserBeforeAction'
            df_states = pd.concat([df_states, pd.DataFrame(new_state)])
        last_timestamp = row['TimeGenerated']

    df_observations_states.append(df_states)
    
# decipher paused vs typing
for session_id in range(len(df_observations_states)):
    # iterate through dataframe and modify statename
    for index in range(1, len(df_observations_states[session_id])):
        if df_observations_states[session_id].iloc[index]['StateName'] == 'Shown':
            if df_observations_states[session_id].iloc[index-1]['StateName'] == 'Accepted':
                code_lenght = df_observations_states[session_id].iloc[index]['Measurements']['documentLength']
                index2 = index -1
                code_lenght2 = df_observations_states[session_id].iloc[index2]['Measurements']['documentLength']
                suggestion_lenght = df_observations_states[session_id].iloc[index2]['Measurements']['compCharLen']
                if abs(code_lenght2 + suggestion_lenght - code_lenght)<=3:
                    df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('HiddenState')]= 'UserPaused'
                    break
                else:
                    df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('HiddenState')]= 'UserTyping'
                    break

# create observable state diagram
column_names = ["TimeGenerated", "CompletionId", "UserId", "SessionId","StateName", "HiddenState", "TimeSpentInState","CurrentSuggestion", "CurrentPrompt", "Measurements"]
unique_session_ids = np.unique(df_logs.SessionID) # or SessionId  

index_session = 0
for id_unique in tqdm(unique_session_ids):
    # split on SessionID
    df_task = df_logs[df_logs.SessionID == id_unique]        
    # get all completion ids for df_task
    completion_ids = df_task.CompletionId.to_numpy()
    completion_ids = np.unique(completion_ids)
    for completion_id in completion_ids:
        df_completion = df_task[df_task.CompletionId == completion_id]
        # iterate through df_completion
        last_timeout = 0
        for index, row in df_completion.iterrows():
            name = row['Name']
            completion_id = row['CompletionId']
            if name != 'copilot/ghostText.stillInCode':
                continue
            timeout_time = row['MeasurementsJson']['timeout']
            # Only do it for small timeouts, otherwise too imprecise
            if timeout_time  > 30:
                continue
            timestamp = row['TimeGenerated']
            got_edited = False
            # TO-DO: EDITING THRESHOLD FIX
            if row['MeasurementsJson']['relativeLexEditDistance'] > 0.15:
                got_edited = True
            end_time = pd.to_datetime(timestamp)
            # add to start time the timeout time
            start_time = end_time - pd.Timedelta(seconds=timeout_time) + pd.Timedelta(seconds=last_timeout)
            # iterate through df_observations_states[index_session]
            for index2 in range(len(df_observations_states[index_session])):
                if df_observations_states[index_session].iloc[index2]['HiddenState'] == 'UserTyping':
                    difference_start = pd.to_datetime(df_observations_states[index_session].iloc[index2]['TimeGenerated']) - start_time
                    difference_end = pd.to_datetime(df_observations_states[index_session].iloc[index2]['TimeGenerated']) - end_time
                    difference_start = difference_start.total_seconds()
                    difference_end = difference_end.total_seconds()
                    if  difference_start >= 0 and difference_end <= 0:
                        df_observations_states[index_session].iloc[index2,df_observations_states[index_session].columns.get_loc('HiddenState')] = 'EditingSuggestions'

            # if suggestion got edited, assume suggestion won't get edited again, so stop assigning attribution for this suggestion. 
            if got_edited:
                break
            last_timeout = timeout_time
    index_session += 1

# add editDistance
for session_id in range(len(df_observations_states)):
    EditPercentage = []
    # iterate through dataframe and modify statename
    for index in range(len(df_observations_states[session_id])):
        # get completion id
        completion_id = df_observations_states[session_id].iloc[index]['CompletionId']
        # get all logs in df_logs that have matching completion_id
        df_task_completion = df_logs[df_logs.CompletionId == completion_id]
        df_task_completion  = df_task_completion[df_task_completion.Name == 'copilot/ghostText.stillInCode']
        edit_distance = {'charEditDistance': -1, 'lexEditDistance': -1, 'stillInCodeHeuristic': -1, 'relativeLexEditDistance': 1}
        # iterate through df_task_completion
        for index2 in range(len(df_task_completion)):
            # get measurements
            edit_distance['charEditDistance'] = df_task_completion.iloc[index2]['MeasurementsJson']['charEditDistance']
            edit_distance['lexEditDistance'] = df_task_completion.iloc[index2]['MeasurementsJson']['lexEditDistance']
            edit_distance['stillInCodeHeuristic'] = df_task_completion.iloc[index2]['MeasurementsJson']['stillInCodeHeuristic']
            edit_distance['relativeLexEditDistance'] = df_task_completion.iloc[index2]['MeasurementsJson']['relativeLexEditDistance']
            if df_task_completion.iloc[index2]['MeasurementsJson']['timeout'] == 600.0:
                break

        EditPercentage.append(edit_distance)
    df_observations_states[session_id]['EditPercentage'] = EditPercentage

# Simple heuristic for prompt crafting, check if last prompt was a comment, user looked at suggestion and  accepted or browsed
import nltk
for session_id in range(len(df_observations_states)):
    # iterate through dataframe and modify statename
    for index in range(len(df_observations_states[session_id])):
        if df_observations_states[session_id].iloc[index]['HiddenState'] in ['EditingSuggestions', 'UserTyping']:
            code_lenght = df_observations_states[session_id].iloc[index]['Measurements']['documentLength']
            # do a backward search to find the previous typing state
            prompt = df_observations_states[session_id].iloc[index]['CurrentPrompt']
            # split prompt on //
            prompt_split = prompt.split('//')
            # get the last part of the prompt
            prompt_last = prompt_split[-1]
            # check if prompt_last contains #
            is_a_comment = False
            if '#' in prompt_last:
                is_a_comment = True
                #df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('StateName')]= 'PromptCrafting'
            try:
                prev_prompt = df_observations_states[session_id].iloc[index-1]['CurrentPrompt']
            except:
                continue
            #prev_prompt_dist = nltk.edit_distance(prompt, prev_prompt)
            if prev_prompt == prompt:
                continue
            user_spent_time_looking = df_observations_states[session_id].iloc[index]['TimeSpentInState']
            # in two states
            try:
                user_action = df_observations_states[session_id].iloc[index+3]["StateName"]
            except:
                user_action = 'Undefined'
            # edit the time 
            if is_a_comment and user_spent_time_looking > 0.3 and user_action in ['Accepted', 'Browsing']:
                df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('HiddenState')] = 'PromptCrafting'



100%|██████████| 63/63 [00:25<00:00,  2.52it/s]
100%|██████████| 63/63 [01:08<00:00,  1.08s/it]


In [11]:
print([df_observations_states[7].Measurements.iloc[i]['documentLength'] for i in range(len(df_observations_states[6]))])

[41.0, 41.0, 41.0, 41.0, 83.0, 83.0, 88.0, 88.0, 284.0, 284.0, 311.0, 311.0, 300.0, 300.0, 27.0, 27.0, 27.0, 37.0, 37.0, 37.0, 27.0, 27.0, 288.0, 288.0, 461.0, 461.0, 510.0, 510.0, 560.0, 560.0]


In [None]:
for i in range(len(df_observations_states[6])):

In [92]:
pickle.dump(df_observations_states, open(extented_model_path, 'wb'))

## Unobservable States: User Before Action

![dag {
bb="0,0,1,1"
"Browse Suggestions" [pos="0.874,0.794"]
"Deferring thought" [latent,pos="0.851,0.401"]
"Editing Suggestion" [latent,pos="0.177,0.099"]
"Ghost Text Appears" [pos="0.656,0.176"]
"Thinking about suggestion" [latent,pos="0.838,0.263"]
"User Accepts Ghost Text" [outcome,pos="0.434,0.742"]
"User Before Action" [pos="0.608,0.433"]
"User Rejects Ghost Text" [outcome,pos="0.642,0.805"]
"User Typing/Paused" [pos="0.184,0.632"]
"Writing New Code" [latent,pos="0.326,0.194"]
"no thought" [latent,pos="0.848,0.567"]
Paused [pos="0.318,0.405"]
Typing [pos="0.124,0.405"]
"Browse Suggestions" -> "Ghost Text Appears"
"Deferring thought" <-> "User Before Action"
"Editing Suggestion" -> "Ghost Text Appears"
"Ghost Text Appears" -> "User Before Action"
"Thinking about suggestion" <-> "User Before Action"
"User Accepts Ghost Text" -> "User Typing/Paused"
"User Before Action" -> "Browse Suggestions" [pos="0.715,0.611"]
"User Before Action" -> "User Accepts Ghost Text"
"User Before Action" -> "User Rejects Ghost Text"
"User Before Action" <-> "no thought"
"User Rejects Ghost Text" -> "User Typing/Paused"
"User Typing/Paused" -> Paused
"User Typing/Paused" -> Typing
"Writing New Code" -> "Ghost Text Appears"
Paused -> "Ghost Text Appears"
Typing -> "Editing Suggestion"
Typing -> "Writing New Code"
}
](images/dagitty-model.png)

We will decode the User Before Action state. 
Our strategy will be to define heuristics for decoding for a couple of states. 

In [9]:
# we will define one step heuristics first
# we define heuristics as a function of a span of logs, or a single entry
class Heuristics:
    def __init__(self, observed_states = None, unobserved_decoding = None):
        self.observed_states = 'UserBeforeAction'
        self.unobserved_decoding = ['NoThought', 'UserThinkingSuggestion', 'DeferringThought']

    def label_state(self, logs_session, index):
        # function documentation input or output
        # logs_session: dataframe of logs for a single session
        # index: index of the current state
        # return: label of the state, or -1 if no heuristic applies

        if logs_session.iloc[index]['HiddenState'] not in self.observed_states :
             return -1
        # call all functions in this class 
        # if any of the functions return a label, return that label
        for function in dir(self):
            if function.startswith('heuristic'):
                label = getattr(self, function)(logs_session, index)
                if label != -1:
                    return label
        # if no heuristic applies, return -1
        return -1
        
    
    def heuristic01_uba(self, logs_session, index):
        try:
            row = logs_session.iloc[index]
            #next_row = logs_session.iloc[index+1]
            # no thought: single line rejected 
            if row["TimeSpentInState"] <= 0.01 and \
            row["Measurements"]["compCharLen"] <= 20 and \
            row["Measurements"]["numLines"] == 1 and \
            row["StateName"] in ['Rejected','Browsing']:
                return self.unobserved_decoding[0]
            else:
                return -1
        except:
            return -1 
    def heuristic02_uba(self, logs_session, index):
        try:
            row = logs_session.iloc[index]
            #next_row = logs_session.iloc[index+1]
            # no thought: multi line accepted in little time and not edited later
            if row["TimeSpentInState"] <= 0.3 and \
            row["Measurements"]["numLines"] > 2 and \
            row["StateName"] == 'Accepted' and \
            row['EditPercentage']['relativeLexEditDistance'] <= 0.1:          
                return self.unobserved_decoding[0]
            else:
                return -1
        except:
            return -1 

    
    def heuristic11_uba(self, logs_session, index):
        # thinking about suggestion, significant time spent looking, and suggestion was not edited in the future and accepted.
        try:
            row = logs_session.iloc[index]
            next_row = logs_session.iloc[index+1]   
            if row["TimeSpentInState"] >= 5 and \
                row["StateName"] == 'Accepted' and \
                row['EditPercentage']['relativeLexEditDistance']  <= 0.1:          
                return self.unobserved_decoding[1]
            else:
                return -1
        except:
            return -1 
    def heuristic12_uba(self, logs_session, index):
        # thinking about suggestion, significant time spent looking, then rejected, and then typed next state
        row = logs_session.iloc[index]
        try:
            next_row = logs_session.iloc[index+1] 
            #next_next_row = logs_session.iloc[index+2]
            if row["TimeSpentInState"] >= 5 and \
                row["StateName"] == 'Rejected' and \
                next_row["HiddenState"] in ['EditingSuggestions', 'UserTyping']:
                return self.unobserved_decoding[1]
            else:
                return -1
        except:
            return -1 
    def heuristic21_uba(self, logs_session, index):
        # defer thought for later: 2 or more previous actions were accepts, all single line, all accepts
        ''' 
        Sample log behavior
        UserBeforeAction', 'Accepted', 'UserPaused', 'Shown',
       'UserBeforeAction', 'Accepted', 'UserPaused', 'Shown',
       'UserBeforeAction', 'Accepted', 'UserPaused', 'Shown',
        ''' 
        try:
            prev_rows_states = logs_session.iloc[index-4:index].StateName.to_numpy() # -3 and -7 should be accepts
            prev_rows_hidden_states = logs_session.iloc[index-4:index].HiddenState.to_numpy() # -3 and -7 should be accepts

            row = logs_session.iloc[index]
            next_row = logs_session.iloc[index+1]
            prev_state_condition =  ['Accepted', 'Shown', 'Accepted', 'Shown']
            prev_hidden_state_condition =  ['UserBeforeAction', 'UserPaused', 'UserBeforeAction', 'UserPaused']
            prev_hidden_state_check = True
            prev_state_check = True
            # enumerate through prev_rows_states
            counter = 0
            for i in range(len(prev_state_condition)):
                if prev_rows_states[i] != prev_state_condition[counter]:
                    prev_state_check = False
                    break
                if prev_hidden_state_condition[i] == 'UserPaused' and prev_rows_hidden_states[i] != prev_hidden_state_condition[counter]:
                    prev_hidden_state_check = False
                    break
                counter += 1
            if row["TimeSpentInState"] <= 1 and \
            row["Measurements"]["numLines"] == 1 and \
            row["StateName"] == 'Accepted' and \
            prev_state_check and prev_hidden_state_check:
                return self.unobserved_decoding[2]
            else:
                return -1
        except:
            return -1

    def heuristic22_uba(self, logs_session, index):
        # defer thought for later: 1 big multi line suggestion that got edited signifcantly later
        try:
            row = logs_session.iloc[index]
            #next_row = logs_session.iloc[index+1]
            # no thought: multi line accepted in little time and not edited later
            if row["TimeSpentInState"] <= 0.3 and \
            row["Measurements"]["numLines"] > 2 and \
            row["StateName"] == 'Accepted' and \
            row['EditPercentage']['relativeLexEditDistance'] > 0.3:          
                return self.unobserved_decoding[2]
            else:
                return -1
        except:
            return -1
            
# Apply Heuristics to the dataframe
heuristics_uba = Heuristics()
heuristic_coverage = 0
heuristic_could_apply = 0
for session_id in range(len(df_observations_states)):
    # iterate through dataframe and modify statename
    for index in range(len(df_observations_states[session_id])):
        heuristic_label = heuristics_uba.label_state(df_observations_states[session_id], index)
        if heuristic_label != -1:
            df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('HiddenState')] = heuristic_label
            heuristic_coverage += 1
        # check if state name is userbeforeaction
        if df_observations_states[session_id].iloc[index]['HiddenState'] == 'UserBeforeAction':
            heuristic_could_apply += 1
print('Heuristic coverage: ', heuristic_coverage/heuristic_could_apply*100)
# 15 percent coverage

Heuristic coverage:  24.818366753850622


# HMM model

## Learning

In [275]:
observed_states = 'UserBeforeAction'
unobserved_decoding = ['NoThought', 'UserThinkingSuggestion', 'DeferringThought']
state_encoding_dict = {
    'Accepted': 1, # first action after UBA
    'Rejected': 0,
    'Browsing':  2,
    'EditingSuggestions': 1,
    'UserTyping': 1, # second action after UBA
    'PromptCrafting': 1,
    'Shown' : 2,
    'UserPaused': 0,
    'UserTypingOrPaused': 0
}
heuristic_encoding_dict = {
    'NoThought': 0,
    'UserThinkingSuggestion': 1,
    'DeferringThought': 2
}
observation_sequences = []
observation_feature_names = [ 'charLen', 'confidence', 'numLines',  'timeSpentInState', 'relativeLexEditDistance', 'next action', 'next-next action', 'heuristic']

def get_observation_sequence(logs_session):
    session_observations = []
    for index in range(len(logs_session)):
        row = logs_session.iloc[index]
        if logs_session.iloc[index]['StateName'] == observed_states or \
           logs_session.iloc[index]['StateName'] in unobserved_decoding:
            # collect all features for the observation
            observation = []
            # add next two states
            state_names = []
            try:
                state_names.append(logs_session.iloc[index+1]['StateName'])
                state_names.append(logs_session.iloc[index+2]['StateName'])
            except:
                state_names = []
                state_names.append('NAN')
                state_names.append('NAN')
            # encode the state names according to state_encoding_dict
            state_names_encoded = []
            for state_name in state_names:
                try:
                    state_names_encoded.append(state_encoding_dict[state_name])
                except:
                    state_names_encoded.append(0)
            # add suggestion features
            try:
                suggestion_features = [row['Measurements']['compCharLen'],
                                    row['Measurements']['confidence'],
                                    row['Measurements']['numLines']]
            except:
                suggestion_features = [0, 0, 0]
            time_spent_in_state = row['TimeSpentInState']
            edit_distance = max(row['EditPercentage']['relativeLexEditDistance'],0)
            # add heuristic
            heuristic = np.nan
            if row['StateName'] in unobserved_decoding:
                heuristic =  heuristic_encoding_dict[row['StateName']]
            observation.extend(suggestion_features)
            observation.append(time_spent_in_state)
            observation.append(edit_distance)
            observation.extend(state_names_encoded)
            observation.append(heuristic)
            # make observation into numeric np array
            observation = np.array(observation, dtype=np.float32)
        
            session_observations.append(observation)
    return session_observations

for session_id in range(len(df_observations_states)):
    # iterate through dataframe and modify statename
    session_observations = get_observation_sequence(df_observations_states[session_id])
    if len(session_observations) > 1:
        observation_sequences.append(np.array(session_observations, dtype=np.float32))

In [None]:
# instantiate a HeterogeneousHMM object
my_hmm = HeterogeneousHMM(
        n_states=3,
        n_g_emissions=5,
        n_d_emissions=3,
        n_d_features=[3, 3, 3],
        covariance_type='full',
        nr_no_train_de=1,
        verbose=True
    )

my_hmm.B =  [1/3*np.ones((3,3)), 1/3*np.ones((3,3)),  0.8*np.eye(3) + 0.1*np.ones((3,3))]

# train the model to estimate the parameters
my_hmm, log_likelihood = my_hmm.train(
    observation_sequences, n_init=1, n_iter=100, conv_thresh=0.001, conv_iter=5, plot_log_likelihood=True,
)

# print model parameters
hu.pretty_print_hmm(my_hmm, hmm_type='Heterogeneous')

In [312]:
# decode a training sequences
index_session = 31
logL, state_seq = my_hmm.decode([observation_sequences[index_session]], algorithm='viterbi')
state_names = ['DeferringThought', 'UserThinkingSuggestion', 'NoThought']
hu.plot_decode(
    observation_sequences[index_session], 
    observation_feature_names, 
    state_seq[0], 
    discrete_columns=[ 'next action',' next-next action'], 
    #time_stamps=df[df['seq_no'] == 0]['date'],
    figsize=(15, 10),
    state_names = state_names,
    filename = 'plots_hmm.pdf'
)

In [313]:
# now we transform back the df_observation_states from the deocding
for session_id in range(len(df_observations_states)):
    # iterate through dataframe and modify statename
    session_observations = get_observation_sequence(df_observations_states[session_id])
    if len(session_observations) < 1:
        continue
    # decode session
    logL, state_seq = my_hmm.decode([session_observations], algorithm='viterbi')
    # transform back the state_seq to the original state names
    state_seq_names = [state_names[i] for i in state_seq[0]]
    # iterate through dataframe
    counter = 0
    for index in range(len(df_observations_states[session_id])):
        row = df_observations_states[session_id].iloc[index]
        if row['StateName'] == observed_states or \
           row['StateName'] in unobserved_decoding:
            df_observations_states[session_id].iloc[index,df_observations_states[session_id].columns.get_loc('StateName')] = state_seq_names[counter]
            counter += 1

In [278]:
pickle.dump(df_observations_states, open(extented_model_path, 'wb'))

In [266]:
# https://huggingface.co/microsoft/codebert-base
# can get code embeddings cheaply
#from sentence_transformers import SentenceTransformer
#sentences = ["This is an example sentence", "Each sentence is converted"]

#model = SentenceTransformer('flax-sentence-embeddings/st-codesearch-distilroberta-base')
#embeddings = model.encode(sentences)
#print(embeddings.size)


## extract for video

In [314]:
extented_model_path = 'data/generated_data/df_obs_states_7-03.pkl'

In [None]:
df_observations_states[37]

In [315]:
#item = 'mean = sum(array) / len(array)'
item = "import numpy as np"
#item = "model"

array = df_observations_states[37].CurrentSuggestion.to_numpy()
# check if item is substring of array element
first_index = 0
for i in range(len(array)):
    if item in array[i]:
        print(array[i])
        first_index = i
        break
print(first_index)


"import numpy as np"
0


In [316]:
df_video_states =df_observations_states[37][first_index:]


In [317]:
pickle.dump(df_video_states, open(extented_model_path, 'wb'))

## Visualization

In [None]:
# encode each unique string in state_names as a number
# import labelencoder
from sklearn import preprocessing
encoder = preprocessing.LabelEncoder()
encoder.fit(state_names)
encoded_state_names = encoder.transform(state_names)


In [None]:
encoded_state_names

In [None]:
model = hmm.MultinomialHMM(8, verbose=True)
model.fit(encoded_state_names.reshape(1, -1))
model.score(encoded_state_names.reshape(1, -1))
model.decode(encoded_state_names.reshape(1, -1))


In [None]:
from hmmviz import TransGraph
import pandas as pd
import matplotlib.pyplot as plt


T = pd.crosstab(
    pd.Series(state_names[:-1], name='Today'),
    pd.Series(state_names[1:], name='Tomorrow'),
    normalize=0
)


graph = TransGraph(T)

# looks best on square figures/axes
fig = plt.figure(figsize=(20, 20))

graph.draw()
plt.savefig('hmm_graph.pdf')
plt.show()
# save plot as pdf


In [None]:
from hmmviz import TransGraph
import pandas as pd
import matplotlib.pyplot as plt


T = pd.crosstab(
    pd.Series(state_names[:-1], name='Today'),
    pd.Series(state_names[1:], name='Tomorrow'),
    normalize=0
)


graph = TransGraph(T)

# looks best on square figures/axes
fig = plt.figure(figsize=(20, 20))

graph.draw()
plt.savefig('hmm_graph.pdf')
plt.show()
# save plot as pdf
