# Is there a correlation between random string names like "ifugeiwug" and being part of an extended case?

In [1]:
import pandas as pd
import numpy as np

import math
import pickle
import sys

In [2]:
# Add the parent directory (relative to the current working directory) to the import
# path so the `utils` import below can resolve. Presumably the project's shared
# helper module lives one level above this notebook -- confirm.
sys.path.append('../')
import utils

In [3]:
# Load the four objects returned by the project helper. Judging by the names they are
# the vertex, edge, core-case and extended-case samples -- confirm against
# utils.load_for_jupyter_raw. Only `v_sample` is used in the cells below.
v_sample, e_sample, core_sample, ext_sample = utils.load_for_jupyter_raw()

Dataset already downloaded. Loading it from file system
LOADING DATA: 1.08 s


In [4]:
# value_counts() skips missing values by default, so its index holds each distinct
# non-null name exactly once, ordered from most to least frequent.
name_frequencies = v_sample['Name'].value_counts()
unique_names = name_frequencies.index

In [12]:
# Lower-case every distinct name so that later lookups are case-insensitive.
names = []
for raw_name in unique_names:
    names.append(raw_name.lower())

In [13]:
accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

# Position of each accepted character; used to index rows and columns of the
# transition matrix.
pos = {char: idx for idx, char in enumerate(accepted_chars)}


def ngram(n, l):
    """Yield every window of n consecutive characters of l, after normalization.

    Characters that `normalize` drops are removed before the windows are built,
    so a window can span characters that were not adjacent in the raw input.
    Nothing is yielded when fewer than n characters survive normalization.
    """
    chars = normalize(l)
    window_count = len(chars) - n + 1
    yield from (''.join(chars[i:i + n]) for i in range(window_count))


def normalize(line):
    """Return the lower-cased characters of line that appear in accepted_chars.

    Everything else (punctuation, digits, infrequent symbols, ...) is dropped,
    which keeps the model relatively small.
    """
    kept = []
    for raw_char in line:
        lowered = raw_char.lower()
        if lowered in accepted_chars:
            kept.append(lowered)
    return kept

In [14]:
def avg_transition_prob(l, log_prob_mat):
    """Return the geometric-mean transition probability of l under log_prob_mat.

    The log probabilities of every consecutive character pair (bigram) of the
    normalized input are averaged and the mean is exponentiated back into a
    probability.

    Note: when l yields no bigram at all (fewer than two accepted characters),
    the result is exp(0) == 1.0.
    """
    total_log_prob = 0.0
    pair_count = 0
    for pair_count, (first, second) in enumerate(ngram(2, l), start=1):
        total_log_prob += log_prob_mat[pos[first]][pos[second]]
    # Divide by at least 1 so an input without transitions does not divide by zero;
    # exponentiating converts the mean log probability back into a probability.
    return math.exp(total_log_prob / max(pair_count, 1))

In [15]:
# Load the pre-trained model. The context manager guarantees the file handle is
# closed even if unpickling fails.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so only
# load a gib_model.pki that comes from a trusted source.
with open('./gib_model.pki', 'rb') as model_file:
    model_data = pickle.load(model_file)

# Both values are the same for every name, so look them up once instead of per iteration.
model_mat = model_data['mat']
threshold = model_data['thresh']

# A name counts as human readable when its average transition probability is
# strictly above the model's threshold.
names_readable = {
    name: avg_transition_prob(name, model_mat) > threshold
    for name in names
}

In [16]:
# One row per lower-cased name: orient='index' turns the dict keys into the index,
# and the single column holds the model's readable / not-readable verdict.
df = pd.DataFrame.from_dict(orient='index', data=names_readable, columns=['human_readable'])

In [103]:
df.shape

(142871, 1)

In [102]:
df.value_counts()

human_readable
False             104666
True               38205
dtype: int64

In [24]:
def hasNumbers(s):
    """Return True if at least one character of s is a digit (per str.isdigit)."""
    for char in s:
        if char.isdigit():
            return True
    return False

In [61]:
# The index of `df` holds the lower-cased names, so this picks every name the
# model scored as not human readable.
rejected_mask = df['human_readable'].eq(False)
rejected_names = df.index[rejected_mask]

In [72]:
# Keep only the rejected names that contain no digit.
# Filtering directly avoids the NaN-placeholder + dropna() detour and the `np.NaN`
# alias, which no longer exists in NumPy 2.0.
rejected_names_no_numbers = [x for x in rejected_names if not hasNumbers(x)]
# `rej_df` is kept so anything that still refers to it keeps working.
rej_df = pd.DataFrame(rejected_names_no_numbers, columns=['name'])
rejected_names_no_numbers = rej_df['name'].values

In [73]:
rejected_names_no_numbers

array(['li jie', 'mr. isaac elbaz', 'liu jie', ..., 'ljmgjhwtvf',
       'rezqbjqdsb', 'ykbhygaobo'], dtype=object)

In [93]:
import re

# Any whitespace character marks a multi-word name.
pattern = r"\s"

# Digit-free rejected names that contain whitespace are treated as real names the
# model rejected by mistake. re.search stops at the first match, and filtering
# directly avoids the `np.NaN` alias, which no longer exists in NumPy 2.0.
final_rejected = [x for x in rejected_names_no_numbers if re.search(pattern, x)]
# `rej_df2` is kept so anything that still refers to it keeps working.
rej_df2 = pd.DataFrame(final_rejected, columns=['name'])
wrongly_rejected = rej_df2['name'].values

In [97]:
# Names (index labels of `df`) that the model scored as human readable.
readable = df.index[df['human_readable'].eq(True)]

In [98]:
# Final readable set = names rescued from the rejected pile + names the model accepted.
# The accepted names are re-derived from `df` instead of reading the previous value of
# `readable`, so re-running this cell cannot prepend `wrongly_rejected` a second time.
model_accepted = df.index[df['human_readable'].eq(True)]
readable = np.concatenate((wrongly_rejected, model_accepted))

In [99]:
len(readable)

38905

In [100]:
readable

array(['li jie', 'mr. isaac elbaz', 'liu jie', ..., 'drt lease limited',
       'rajeev singh', 'mok ka cheong, lawrence'], dtype=object)

In [106]:
# Every distinct name that is neither model-accepted nor rescued as a multi-word
# name counts as not human-readable. (The previous wording, "not humanly
# unreadable", was a double negative that stated the opposite.)
print(f"I found {df.shape[0]} different names. \n Out of these, {df.shape[0]-len(readable)} are not human-readable.")

I found 142871 different names. 
 Out of these, 103966 are not humanly unreadable.


In [109]:
v_sample['Name'].values

array([nan, nan, nan, ..., 'izrJE4sDpr', 'TKQfFZ3fkk', 'Tl8KrwulsB'],
      dtype=object)

In [115]:
# Build the lookup once as a set: `in` on a NumPy array rescans the whole array for
# every row, whereas a set lookup is constant time.
readable_lookup = set(readable)

# Flag each node whose lower-cased name is in the readable set. Missing names (NaN)
# are never readable; checking for str first stops NaN from being converted to the
# literal string 'nan' and matched against a real name.
v_sample['Is readable'] = [
    isinstance(name, str) and name.lower() in readable_lookup
    for name in v_sample['Name'].values
]

In [119]:
v_sample.head()

Unnamed: 0_level_0,Label,Revenue Size Flag,Account ID String,Address,Person or Organisation,Name,Income Size Flag,CoreCaseGraphID,ExtendedCaseGraphID,testingFlag,Is readable
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1502000,Account,mid_high,RvIOFQqK0E,,,,,,,,False
1502001,Account,low,cSnM0hVDsm,,,,,,,,False
1502002,Account,low,WAQWpZi4AD,,,,,2492.0,,0.0,False
1502003,Account,mid_high,n5J9mBTeZc,,,,,,,,False
1502004,Account,low,qxlAEuUm7P,,,,,,,,False


In [118]:
v_sample['Is readable'].value_counts()

False    275842
True      43534
Name: Is readable, dtype: int64

In [125]:
# Split the nodes into two frames according to the boolean 'Is readable' flag.
is_readable = v_sample['Is readable']
readable_items = v_sample.loc[is_readable.eq(True)]
unreadable_items = v_sample.loc[is_readable.eq(False)]

In [131]:
v_sample['ExtendedCaseGraphID'].notna().sum()

32438

In [126]:
readable_items['ExtendedCaseGraphID'].notna().sum()

7239

In [129]:
unreadable_items['ExtendedCaseGraphID'].notna().sum()

25199

In [127]:
# Share of readable-name nodes that belong to an extended case.
# Computed from the data instead of typing in counts copied from earlier outputs,
# so the figure stays correct whenever the data or the readability rule changes.
readable_items['ExtendedCaseGraphID'].notna().mean()

0.1662838241374558

In [130]:
# Share of nodes without a readable name that belong to an extended case.
# Computed from the data instead of typing in counts copied from earlier outputs,
# so the figure stays correct whenever the data or the readability rule changes.
unreadable_items['ExtendedCaseGraphID'].notna().mean()

0.09135302093227282

In [132]:
# Of all nodes that belong to an extended case, the fraction with a readable name.
# Derived from the frames rather than hard-coded counts so it cannot go stale.
readable_items['ExtendedCaseGraphID'].notna().sum() / v_sample['ExtendedCaseGraphID'].notna().sum()

0.22316419014735803

In [133]:
# Of all nodes that belong to an extended case, the fraction without a readable name.
# Derived from the frames rather than hard-coded counts so it cannot go stale.
unreadable_items['ExtendedCaseGraphID'].notna().sum() / v_sample['ExtendedCaseGraphID'].notna().sum()

0.776835809852642

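### Conclusion: 16.6% of nodes with a readable name are part of an extended case, while only 9.1% of nodes without a readable name are.

Note: the second group is not made up only of random-looking strings. It also contains every node that has no name at all, because a missing name is flagged as not readable.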

### I was expecting the opposite. This result suggests that the random strings probably originated from a protection or encryption table.

### Nonetheless, this could be a useful feature to add to the dataset via one-hot encoding: "name is readable"