In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from termcolor import *

In [2]:
def preprocess(df):
    # Drop temp_R, R_precision and matching_score_pred columns
    df = df.drop(columns=['temp_R', 'R_precision', 'matching_score_pred'])

    # Remove first 4 characters of token column in dataframe (removes sos/ from token string)
    df['token'] = df['token'].str[4:]

    # Replace every '/' with ' ' in token column
    df['token'] = df['token'].str.replace('/', ' ')

    # Remove all words that end with a '_' in token column
    df['token'] = df['token'].str.replace(r'\w*_', '', regex=True).str.strip()

    # Remove everything after and including eos (probably stands for end of sentence) in token column
    df['token'] = df['token'].str.split('eos').str[0]

    return df

# --- Load data ---
# Load altered.log and original.log, split with ; into dataframes
original = pd.read_csv('original.log', sep=';')
altered = pd.read_csv('altered.log', sep=';')

# --- Preprocess data ---
original = preprocess(original)
altered = preprocess(altered)

# Add suffix to columns
original = original.add_suffix('_original')
altered = altered.add_suffix('_altered')

# Fuse the two dataframes together
fused = pd.concat([original, altered], axis=1)

# Save fused dataframe to csv
fused.to_csv('fused.csv', index=False)

In [3]:
fused[['token_original', 'token_altered']]

Unnamed: 0,token_original,token_altered
0,a man jump and bring both arm above his head a...,a man step forward hold an object to his head ...
1,a man hold his arm out in front of him in a sh...,a man crouch and punch the air with his left h...
2,a person hold their hand in front of them turn...,a person run forward and to his left and jump ...
3,a person clasp his hand in front of him then s...,person walk quickly down a short incline They ...
4,a person reach forward with the right hand and...,the man is march like a soldier He do so by li...
...,...,...
4635,pace back and forth from right to left,a person take several step in a somewhat rando...
4636,a man sit down before rub his leg,a person face forward take a large step to the...
4637,a person slowly walk without move forward,a person sit down and talk with their hand The...
4638,person is walk on uneven terrain,move forward and take some thing go back after...


In [4]:
altered

Unnamed: 0,temp_match_altered,token_altered
0,3.061639,a man step forward hold an object to his head ...
1,2.776492,a man crouch and punch the air with his left h...
2,1.808012,a person run forward and to his left and jump ...
3,3.298793,person walk quickly down a short incline They ...
4,2.006741,the man is march like a soldier He do so by li...
...,...,...
4635,8.384559,a person take several step in a somewhat rando...
4636,3.714846,a person face forward take a large step to the...
4637,2.007957,a person sit down and talk with their hand The...
4638,5.977048,move forward and take some thing go back after...


In [8]:
# Old version based on actually joining

# def preprocess(df):
#     # Drop temp_R, R_precision and matching_score_pred columns
#     df = df.drop(columns=['temp_R', 'R_precision', 'matching_score_pred'])

#     # Remove first 4 characters of token column in dataframe (removes sos/ from token string)
#     df['token'] = df['token'].str[4:]

#     # Replace every '/' with ' ' in token column
#     df['token'] = df['token'].str.replace('/', ' ')

#     # Remove all words that end with a '_' in token column
#     df['token'] = df['token'].str.replace(r'\w*_', '', regex=True).str.strip()

#     # Remove everything after and including eos (probably stands for end of sentence) in token column
#     df['token'] = df['token'].str.split('eos').str[0]

#     return df

# # --- Load data ---
# # Load altered.log and original.log, split with ; into dataframes
# original = pd.read_csv('original.log', sep=';')
# altered = pd.read_csv('altered.log', sep=';')

# # --- Preprocess data ---
# original = preprocess(original)
# altered = preprocess(altered)

# # Add suffix to columns
# original = original.add_suffix('_original')
# altered = altered.add_suffix('_altered')

# altered['token_original'] = altered['token_altered'].str.split('They do').str[0]

# # Merge based on token_original, note that this is not accurate as the token_original is not unique in the test dataset
# fused = pd.merge(original, altered, how='inner', on='token_original')

# # Shuffle column order
# fused = fused[['temp_match_original', 'temp_match_altered', 'token_original', 'token_altered']]

# # Save fused dataframe to csv
# fused.to_csv('fused_alternative.csv', index=False)

In [5]:
# --- Analysis ---
fused = pd.read_csv('fused_alternative.csv')

print(f"Mean Org.: {fused['temp_match_original'].mean()}")
print(f"Mean Alt.: {fused['temp_match_altered'].mean()}")

improvements = fused[fused['temp_match_altered'] < fused['temp_match_original']]
degradations = fused[fused['temp_match_altered'] > fused['temp_match_original']]

# Figure out how many times the altered temp_match is higher than the original and vice-versa
print(f"Altered worse than original: {len(improvements)}")
print(f"Original worse than altered: {len(degradations)}")
print('\n')

improvements['diff'] = improvements['temp_match_original'] - improvements['temp_match_altered']
degradations['diff'] = degradations['temp_match_altered'] - degradations['temp_match_original']

# Sort by diff
improvements = improvements.sort_values(by='diff', ascending=False)
degradations = degradations.sort_values(by='diff', ascending=False)

cprint('Top 10 improvements', 'green', attrs=['bold'])
for i in range(10):
    print(colored("Original token: ", 'blue') + improvements['token_original'].iloc[i])
    try:
        print(colored("Added info: ", 'blue') + "They do" + improvements['token_altered'].iloc[i].split('They do')[1])
    except:
        print(colored("Added info: ", 'blue') + "-")
    print('\n')

Mean Org.: 2.7129750189418695
Mean Alt.: 3.1791826616736327
Altered worse than original: 1992
Original worse than altered: 3272


[1m[32mTop 10 improvements[0m
[34mOriginal token: [0mstickman right hand wave five time with full motion from elbow to hand extension 
[34mAdded info: [0mThey do so by repeatedly bend 


[34mOriginal token: [0mthe person throw some thing forward 
[34mAdded info: [0mThey do so by extend and release their arm forward 


[34mOriginal token: [0ma figure move arm both from the center to the outward and back to the center four stroke 
[34mAdded info: [0mThey do 


[34mOriginal token: [0ma man go from have his hand up ready to fight with left foot forward to kneel on his right 
[34mAdded info: [0m-


[34mOriginal token: [0mperson stand with both foot firmly plant on the ground upper body move very slightly 
[34mAdded info: [0mThey do so by subtly 


[34mOriginal token: [0ma person walksclockwose to almost complete a cycle 
[34mAdded info: 

In [6]:
cprint('Top 10 degradations', 'red', attrs=['bold'])
for i in range(10):
    print(colored("Original token: ", 'blue') + degradations['token_original'].iloc[i])
    try:
        print(colored("Added info: ", 'blue') + "They do" + degradations['token_altered'].iloc[i].split('They do')[1])
    except:
        print(colored("Added info: ", 'blue') + "-")
    print('\n')

[1m[31mTop 10 degradations[0m
[34mOriginal token: [0ma person who is stand with his hand by his side jog counterclockwise in ever widen circle 
[34mAdded info: [0mThey do so 


[34mOriginal token: [0ma person walk slightly to the side then walk back in a slightly exaggerated manner 
[34mAdded info: [0mThey do so by sway 


[34mOriginal token: [0ma person walk diagonally and raise arm in a t pose and seem to be balance on a wide beam 
[34mAdded info: [0m-


[34mOriginal token: [0ma person pace back and forth 
[34mAdded info: [0mThey do so by alternatively shift their weight from one foot to the other 


[34mOriginal token: [0ma person walk diagonally and raise arm in a t pose and seem to be balance on a wide beam 
[34mAdded info: [0m-


[34mOriginal token: [0ma person stand on one foot hold their left hand up while move their right foot in a side to 
[34mAdded info: [0m-


[34mOriginal token: [0man individual take a long slow drink of something 
[34mAdded i