In [1]:
%%time

import os
import re
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn.functional as F

#Deep Learning Libraries
import spacy
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering,pipeline

import nltk
# nltk.data.path.append('/home/mca_fix/share/nltk_data/') 
nltk.data.path.append('/home/mcafixmlpython/lib/nltk_data/') 
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()


# nlp = spacy.load("/home/mca_fix/share/en_core_web_md-3.0.0/en_core_web_md/en_core_web_md-3.0.0")
nlp = spacy.load("/home/mcafixmlpython/lib/en_core_web_md-3.0.0/en_core_web_md/en_core_web_md-3.0.0")

# Loading BERT-BASE-NLI-MEAN-TOKENS for similarity

# simi_PATH = '/home/mca_fix/share/bert-base-nli-mean-tokens/'
simi_PATH = '/home/mcafixmlpython/lib/bert-base-nli-mean-tokens/'

simi_tokenizer=AutoTokenizer.from_pretrained(simi_PATH)
simi_model=AutoModel.from_pretrained(simi_PATH,local_files_only=True)

import warnings
warnings.filterwarnings('ignore')

Some weights of the model checkpoint at /home/mcafixmlpython/lib/bert-base-nli-mean-tokens/ were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CPU times: user 8.72 s, sys: 2.96 s, total: 11.7 s
Wall time: 9.61 s


In [2]:
%%time

def similar(sent):
    tokens={'input_ids':[],'attention_mask':[]}
    for sentence in sent:
        new_tokens=simi_tokenizer.encode_plus(sentence,max_length=128,truncation=True,padding='max_length',return_tensors='pt')
        tokens['input_ids'].append(new_tokens['input_ids'][0])
        tokens['attention_mask'].append(new_tokens['attention_mask'][0])
    #reformat list of tensors into single tensor
    tokens['input_ids']=torch.stack(tokens['input_ids'])
    tokens['attention_mask']=torch.stack(tokens['attention_mask'])
    #processing tokens
    outputs=simi_model(**tokens)
    outputs.keys()
    embeddings=outputs.last_hidden_state
    attention_mask=tokens['attention_mask']
    mask=attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_embeddings=embeddings * mask
    summed = torch.sum(masked_embeddings,1)
    summed_mask=torch.clamp(mask.sum(1),min=1e-9)
    mean_pooled=summed/summed_mask
    from sklearn.metrics.pairwise import cosine_similarity
    mean_pooled=mean_pooled.detach().numpy()
    x=cosine_similarity([mean_pooled[0]],mean_pooled[1:])
    return x[0][0]

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 8.34 µs


In [3]:
from nltk.corpus import stopwords

def remove_stopwords(input_text):
    #print("in remove_stopwords\n",input_text)
       
    stopwords_list = stopwords.words('english')
    newStopWords = ['citi']
    stopwords_list.extend(newStopWords)
        
    # Some words which might indicate a certain sentiment are kept via a whitelist
    #whitelist = ["n't", "not", "no"]
      
    whitelist = ["n't"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 2]          
    return " ".join(clean_words)

### `Activity ground truth.xlsx part 1`

In [4]:
%%time 

df1 = pd.read_excel('Activity ground truth.xlsx', engine='openpyxl')
df1 = df1[["Activity Instance Id", "Activity AU Description", "Frequency of the activity"]]
df1['Frequency of the activity'] = df1['Frequency of the activity'].str.strip().str.lower()
df1.reset_index(drop=True, inplace=True)
df1.dropna(axis=0, inplace=True)

print(df1.shape)
df1.head()

(101, 3)
CPU times: user 6.8 s, sys: 25.6 ms, total: 6.83 s
Wall time: 6.83 s


Unnamed: 0,Activity Instance Id,Activity AU Description,Frequency of the activity
0,490399.0,Deploy/Implement technology solution Manage fu...,missing
1,509099.0,Follow appropriate Client Authentication proce...,missing
2,553070.0,Conduct / Review Monitoring Activities Risk As...,missing
3,576552.0,Citishare maintains a process to ensure that t...,missing
4,576628.0,"Citishare manages technical, ownership, and re...",missing


### `Activity ground truth.xlsx part 2`

In [5]:
%%time 

df2 = pd.read_excel('Activity Ground Truth Part 2.xlsx', engine='openpyxl')
df2 = df2[["Activity Instance Id", "Activity AU Description", "Frequency of the activity"]]
df2['Frequency of the activity'] = df2['Frequency of the activity'].str.strip().str.lower()
df2.reset_index(drop=True, inplace=True)
df2.dropna(axis=0, inplace=True)

print(df2.shape)
df2.head()

(150, 3)
CPU times: user 4.99 s, sys: 8.24 ms, total: 5 s
Wall time: 4.99 s


Unnamed: 0,Activity Instance Id,Activity AU Description,Frequency of the activity
0,1006483.0,"On a monthly basis, the Sales Practices Invest...",monthly basis
1,1008421.0,Global Network Risk Management Alternative Con...,quarterly basis
2,1008549.0,AML Governance : Population Identification and...,quarterly
3,1010460.0,All Citi businesses assess their Information S...,missing
4,1010779.0,"In order to facilitate member settlement, on a...",daily basis


### `Combine df1 and df2 into one df`

In [6]:
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
print(df.shape)
df.head()

(251, 3)


Unnamed: 0,Activity Instance Id,Activity AU Description,Frequency of the activity
0,490399.0,Deploy/Implement technology solution Manage fu...,missing
1,509099.0,Follow appropriate Client Authentication proce...,missing
2,553070.0,Conduct / Review Monitoring Activities Risk As...,missing
3,576552.0,Citishare maintains a process to ensure that t...,missing
4,576628.0,"Citishare manages technical, ownership, and re...",missing


In [7]:
when_frequency_lst = ["daily", "weekly", 
#                       "biweekly", "realtime", "event driven", # *** from control#
                      "monthly", "quarterly", "yearly", "semiannual", "semiannually",
                      "annual", # *** from activity# "ongoing", "needed", 
                      "annually", 
                      # outliers # 
                      "each month", "every month", "calendar month",  
                      # "needed basis", "required basis",
                      "ad hoc", "adhoc"]

from typing import Tuple

def activity_when(desc: str) -> Tuple[str, str, int, int]:
    """
    
    
    """
    
    # Text preprocessing
    desc = str(desc)
#     desc = re.sub('[^\w\s]', '', desc.strip().lower())
    desc = desc.lower()
    doc = nlp(desc)
    
    activity_when_answer = None
    
    
    ########################### semi/bi, semi-annually, bi-weekly... ###########################
    semi_bi = ["semi", "bi"]
    for word in when_frequency_lst:
        
        for semi_bi_word in semi_bi:
            
            if f"""{semi_bi_word}-{word}""" in desc:
        
                activity_when_answer = f"""{semi_bi_word}-{word}"""
        
    
    
    ########################### annual and quarterly basis; adhoc or weekly basis... ########################### 
    for token in doc:
        
        # Exclude last three tokens 
        try:
        
            if (token.text in when_frequency_lst) and (doc[token.i+1].pos_ == "CCONJ") and (doc[token.i+2].text in when_frequency_lst) and (doc[token.i+3].text in ["basis", "cadence"]):
                
                span = " ".join([doc[token.i].text, doc[token.i+1].text, doc[token.i+2].text, doc[token.i+3].text])

                if any(word in when_frequency_lst for word in span.split()) and (activity_when_answer is None):
  
                    activity_when_answer = span
        
        except:
            
            pass
    
    
    
    ########################### on ... basis... ########################### 
    for token in doc:
        
        # Exclude the last token 
        try:
            
            if (token.text in when_frequency_lst) and (doc[token.i+1].text in ["basis", "cadence"]):
                
                # ... as ... basis (monthly as needed basis)
                if doc[token.i-2].text in when_frequency_lst:
                    
                    span = " ".join([doc[token.i-2].text, doc[token.i-1].text, token.text, doc[token.i+1].text])
                
                # on a/an ... basis ..
                elif (doc[token.i-2].text == "on") and ((doc[token.i-1].text == "a") or (doc[token.i-1].text == "an")):
                    
                    span = " ".join([doc[token.i-2].text, doc[token.i-1].text, token.text, doc[token.i+1].text])
                
                # on ... basis ...
                elif doc[token.i-1].text == "on":
                    
                    span = " ".join([doc[token.i-1].text, token.text, doc[token.i+1].text])
                
                else:
                    
                    span = " ".join([token.text, doc[token.i+1].text])

                if any(word in when_frequency_lst for word in span.split()) and (activity_when_answer is None):

                        activity_when_answer = span
        
        except:
            
            pass
            
              
    ########################### monthly, quarterly, annually, semi-annually... ###########################       
    for token in doc:
        
        # Exclude the last token 
        try:
        
            if ((token.dep_ == "amod") and (token.pos_ == "ADJ")) or ((token.dep_ == "advmod") and (token.pos_ == "ADV")):
                
                span = token.text

                if any(word in when_frequency_lst for word in span.split()) and (activity_when_answer is None):

                    activity_when_answer = span
        
        except:
            
            pass
                
                
                
    ########################### other outliers... ###########################
    if activity_when_answer is None:
    
        for word in when_frequency_lst:
            
            if word in desc:
                
                activity_when_answer = word
    
    
    
    ########################### confidence ###########################
    if activity_when_answer is None:
        
        confidence = "Low"
        
    else:
        
        confidence = "High"
        
        
        
    ########################### starting index, ending index ###########################
    if activity_when_answer is None:
        
        start, end = None, None
    
    else:
        
        start = desc.find(activity_when_answer)
        
        if start == -1:
            
            start, end = None, None
        
        else:
            
            end = start + len(activity_when_answer)
        
    
    
    return activity_when_answer, confidence, start, end

In [8]:
%%time

# df["when_ans_freq"] = df["Activity AU Description"].apply(func=activity_when)
df[["when_ans_freq", "confidence", "start", "end"]] = df["Activity AU Description"].\
    apply(func=lambda row: activity_when(row)[0:4]).to_list()
df["found"] = (~df["when_ans_freq"].isna())
df['review_priority'] = np.where((df['found'] == False), "High Priority Review because of Missing When Frequency", "No Need to Review")      
df['missing_or_not'] = np.where((df['found'] == False), "Missing", "Not Missing")
df["when_ans_freq"] = df["when_ans_freq"].apply(func=lambda row : "missing" if row is None else row)
df["similarity"] = df.apply(func=lambda row: similar(sent=[row["Frequency of the activity"], row["when_ans_freq"]]), 
                            axis=1)

CPU times: user 6min 51s, sys: 33.3 s, total: 7min 24s
Wall time: 1min 7s


### `W exists (Actual)`

In [9]:
print(len(df[df["Frequency of the activity"] != "missing"]))
df[df["Frequency of the activity"] != "missing"].head()

180


Unnamed: 0,Activity Instance Id,Activity AU Description,Frequency of the activity,when_ans_freq,confidence,start,end,found,review_priority,missing_or_not,similarity
8,650028.0,On a daily basis Call Recording and Monitoring...,on a daily basis,on a daily basis,High,0.0,16.0,True,No Need to Review,Not Missing,1.0
45,463558.0,"Semi-Annually, Risk - Regulatory Submission CC...",semi-annually & monthly,semi-annually,High,0.0,13.0,True,No Need to Review,Not Missing,0.912689
46,463560.0,"Basel III Reporting. On a monthly basis, CRS B...",on a monthly basis,on a monthly basis,High,21.0,39.0,True,No Need to Review,Not Missing,1.0
47,463562.0,"Risk - FDIC Quarterly Submission - ""The quarte...",quarterly,quarterly,High,12.0,21.0,True,No Need to Review,Not Missing,1.0
48,463570.0,Risk - Credit Policy - MIS - On a monthly basi...,monthly,on a monthly basis,High,29.0,47.0,True,No Need to Review,Not Missing,0.959758


### `W Exists - Model Agrees with Actual`

In [10]:
tp = df[ (df["Frequency of the activity"] != "missing") & (df["when_ans_freq"] != "missing") ]
tp = tp[["Frequency of the activity", "when_ans_freq", "similarity"]]
tp = tp.loc[tp["similarity"] >= 0.80]
print(tp.shape)
tp

(178, 3)


Unnamed: 0,Frequency of the activity,when_ans_freq,similarity
8,on a daily basis,on a daily basis,1.000000
45,semi-annually & monthly,semi-annually,0.912689
46,on a monthly basis,on a monthly basis,1.000000
47,quarterly,quarterly,1.000000
48,monthly,on a monthly basis,0.959758
...,...,...,...
244,daily,on a daily basis,0.964258
245,monthly,on a monthly basis,0.959758
247,semi-annually,semi-annually,1.000000
248,monthly,monthly,1.000000


### `W does exist - Model value does not match`

In [11]:
fn = df[(df["Frequency of the activity"] != "missing") & 
         (df["when_ans_freq"] == "missing") ]
fn = fn[["Frequency of the activity", "when_ans_freq", "similarity"]]
print(fn.shape)
fn

(2, 3)


Unnamed: 0,Frequency of the activity,when_ans_freq,similarity
89,mi,missing,0.491215
171,as needed basis,missing,0.443429


### `W Does not Exist (Actual)`

In [12]:
print(len(df[df["Frequency of the activity"] == "missing"]))
df[df["Frequency of the activity"] != "missing"].head()

71


Unnamed: 0,Activity Instance Id,Activity AU Description,Frequency of the activity,when_ans_freq,confidence,start,end,found,review_priority,missing_or_not,similarity
8,650028.0,On a daily basis Call Recording and Monitoring...,on a daily basis,on a daily basis,High,0.0,16.0,True,No Need to Review,Not Missing,1.0
45,463558.0,"Semi-Annually, Risk - Regulatory Submission CC...",semi-annually & monthly,semi-annually,High,0.0,13.0,True,No Need to Review,Not Missing,0.912689
46,463560.0,"Basel III Reporting. On a monthly basis, CRS B...",on a monthly basis,on a monthly basis,High,21.0,39.0,True,No Need to Review,Not Missing,1.0
47,463562.0,"Risk - FDIC Quarterly Submission - ""The quarte...",quarterly,quarterly,High,12.0,21.0,True,No Need to Review,Not Missing,1.0
48,463570.0,Risk - Credit Policy - MIS - On a monthly basi...,monthly,on a monthly basis,High,29.0,47.0,True,No Need to Review,Not Missing,0.959758


### `W Does not exist - Model Agrees w/t Actual`

In [13]:
tn = df[(df["Frequency of the activity"] == "missing") & (df["when_ans_freq"] == "missing") ]
tn = tn[["Frequency of the activity", "when_ans_freq", "similarity"]]
print(tn.shape)
tn

(62, 3)


Unnamed: 0,Frequency of the activity,when_ans_freq,similarity
0,missing,missing,1.0
1,missing,missing,1.0
2,missing,missing,1.0
4,missing,missing,1.0
5,missing,missing,1.0
...,...,...,...
164,missing,missing,1.0
166,missing,missing,1.0
168,missing,missing,1.0
201,missing,missing,1.0


### `W does not exist – Model think it does`

In [14]:
fp = df[(df["similarity"] < 0.80) & (df["when_ans_freq"] != "missing")].loc[:, ["Frequency of the activity", "when_ans_freq", "similarity"]]
print(fp.shape)
fp

(9, 3)


Unnamed: 0,Frequency of the activity,when_ans_freq,similarity
3,missing,annual,0.366015
10,missing,each month,0.284828
21,missing,adhoc,0.49706
51,missing,monthly,0.340274
80,missing,annually,0.298647
99,missing,monthly,0.340274
169,missing,semi-annually,0.489803
243,missing,quarterly,0.392856
246,missing,daily,0.288967


### `Accuracy`

In [15]:
(178 + 62) / 251

0.9561752988047809