## Label conversation chunks & build features

In [None]:
# import global modules
import os
import re
import sys
import time
import json
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from yaml import safe_load
import google.oauth2.credentials
from google.cloud import bigquery
from IPython.core.display import HTML
from IPython.core.display import display
from IPython.display import clear_output

# Set global vars
# The project root is everything in the working directory that precedes
# the first 'notebooks' path component.
pth_project = Path(os.getcwd().split('notebooks')[0])
pth_data = pth_project / 'data'
pth_util_data = pth_project / 'core' / 'utils' / 'data'
pth_queries = pth_project / 'core' / 'utils' / 'queries' / 'common'
pth_creds = pth_project / 'conf' / 'local' / 'project_config.yaml'

# Make the project's `core` package importable from this notebook
sys.path.insert(0, str(pth_project))

# Load the project configuration and the movers parameters. The files are
# opened through context managers so the handles are closed once parsed.
with pth_creds.open() as f_config:
    d_config = safe_load(f_config)
with (pth_project / 'core' / 'parameters' / 'movers.yaml').open() as f_params:
    d_params = safe_load(f_params)

# import local modules
from core.utils.gcp import connect_bq_services, connect_storage_services
from core.etl.movers.extract import extract_all_model_data
from core.etl.load import load_examples_to_datahub
from core.etl.extract import extract_bq_data
from core.etl.text.transform import sub_tokens, get_match_regex, is_match_regex
from core.etl.movers.transform.features import process_conv, extract_convs_features
from core.models.movers import MoveDeepClf

# Open the BigQuery and Cloud Storage clients on the configured GCP project
gcp_project_name = d_config['gcp-project-name']
bq_client = connect_bq_services(gcp_project_name)
storage_client = connect_storage_services(gcp_project_name)

In [None]:
%load_ext autoreload
%autoreload 2

#### 1. Extract data

##### A. Extract raw data

In [None]:
# Load the extracted sentences
pth_sentences = pth_data / 'extract' / 'sentences.csv'
df_sentences = pd.read_csv(pth_sentences, index_col=None)

# Order sentences by conversation, then by timestamp, and number them in
# that order so every sentence gets a unique chunk_id
df_ori_sentences = df_sentences.sort_values(by=['call_convrstn_id', 'sntnce_ts'])
df_ori_sentences = df_ori_sentences.assign(chunk_id=np.arange(len(df_sentences)))

# Load the model resources (regex families, tags, stopwords, intent detector)
d_regex, l_tags, d_stopwords, intent_detector = extract_all_model_data(
    pth_util_data,
    split_regex=True,
    **d_params['model'],
)

#### 2. Preliminary sizing 

In [None]:
# Collect, for every END_USER sentence matching a "movers" regex, the matches
# of all regex families.

# Regex family (key of d_regex) -> name of the output column
d_family_columns = {
    'cancel': 'cancel_match',
    'holiday': 'holiday_match',
    'expression': 'expression_match',
    'things': 'things_match',
    'negation': 'negation_match',
}
l_match_columns = [
    'call_convrstn_id', 'chunk_id', 'text', 'mover_match', *d_family_columns.values()
]

# Filter on the speaker role first so no regex is evaluated on sentences
# that would be discarded anyway.
is_end_user = df_ori_sentences['sntnce_partcpnt_role'] == 'END_USER'

l_matches = []
for _, row in df_ori_sentences.loc[is_end_user].iterrows():
    l_move_match = get_match_regex(row['sntnce'], d_regex['movers'])

    # Only sentences with at least one "movers" match are kept
    if not l_move_match:
        continue

    d_match = {
        'call_convrstn_id': row['call_convrstn_id'],
        'chunk_id': row['chunk_id'],
        'text': row['sntnce'],
        'mover_match': ';'.join(l_move_match),
    }
    for family, column in d_family_columns.items():
        d_match[column] = ';'.join(get_match_regex(row['sntnce'], d_regex[family]))
    l_matches.append(d_match)

# format & save
# Explicit columns keep the frame well-formed (and mergeable on
# 'call_convrstn_id') even when no sentence matched.
df_movers = pd.DataFrame(l_matches, columns=l_match_columns)
df_movers.to_excel(pth_data / 'adhoc' / 'movers_match.xlsx', index=False)

In [None]:
# Attach the account number and the call date to every matched sentence
l_conv_columns = ['call_convrstn_id', 'bus_bacct_num', 'call_convrstn_date']
df_conversations = pd.read_csv(pth_data / 'extract' / 'conversations.csv', index_col=None)
df_conversations = df_conversations.loc[:, l_conv_columns]
df_movers = df_movers.merge(df_conversations, how='left', on='call_convrstn_id')

# Per call date: number of distinct conversations and of distinct non-null
# account numbers among the matched sentences
l_info = [
    {
        'date': date,
        'cnt_unique_conv_id': len(df_day['call_convrstn_id'].unique()),
        'cnt_unique_ban': len(df_day['bus_bacct_num'].dropna().unique()),
    }
    for date, df_day in df_movers.groupby('call_convrstn_date')
]

df_info = pd.DataFrame(l_info)
df_info.to_excel(pth_data / 'adhoc' / 'movers_info.xlsx', index=False)

#### 3. Extract examples

##### A. Get BQ data

In [None]:
# Push the examples to the datahub in batches of `batch_rate` rows and print
# the row count of the target table after each batch.
# NOTE(review): `df_examples` is only assigned in the "B. Export as excel
# file" cell further down; it must already be in scope when this cell runs.
batch_rate = 1000

# Ceil division: no trailing empty batch when the number of rows is an exact
# multiple of batch_rate, and no batch at all for an empty frame.
n_pass = (len(df_examples) + batch_rate - 1) // batch_rate

for i in range(n_pass):
    df_sub = df_examples.iloc[i * batch_rate: (i + 1) * batch_rate]

    # The project configuration is loaded as `d_config` at the top of the
    # notebook (`d_project_config` is never defined).
    # NOTE(review): assumes that config exposes a 'dataset' key - confirm.
    load_examples_to_datahub(
        bq_client, df_sub, pth_queries, d_config['gcp-project-name'],
        d_config['dataset'], d_params['labelling']['table_name']
    )
    time.sleep(1)

    # NOTE(review): this count targets `examples_contract_end`, whereas the
    # export cell reads `examples_mover` - confirm this is the table the
    # batches are loaded into.
    count = extract_bq_data(
        bq_client, 
        '''SELECT count(*) 
        from `divg-pgspeech-pr-b8a291.divg_pgspeech_pr_dataset.examples_contract_end`'''
    )
    print(f'Count of rows is {count.iloc[0, 0]}')
    print(f'{i}-{i * batch_rate}-{(i+1) * batch_rate}')

##### B. Export as excel file

In [None]:
# Extract data
df_examples = extract_bq_data(
    bq_client, 
    '''SELECT *
        from `divg-pgspeech-pr-b8a291.divg_pgspeech_pr_dataset.examples_mover`
    '''
)

# Strip the ANSI bold markers (ESC[1m / ESC[0m) from the text.
# `regex=True` is required: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, in which case nothing is removed.
# The raw string holds the same pattern characters as before, without the
# invalid `\[` escape of a regular string literal.
df_examples = df_examples.assign(
    text=lambda df: df['text'].str.replace(r'\033\[1m|\033\[0m', '', regex=True)
)

# Save the anonymised file for labelling (chunk id and text only)
df_examples[['chunk_id', 'text']].to_excel(
    pth_data / 'labelling' / 'movers_v0_ano.xlsx', index=False
)

# Save the full reference table and the original sentences
df_examples.to_csv(
    pth_data / 'labelling' / 'movers_v0_ref.csv', index=False, sep=';'
)
df_ori_sentences.to_csv(
    pth_data / 'labelling' / 'movers_v0_sentences.csv', index=False, sep=';'
)

#### 4. Build dataset for training

##### A. Extract annotated data

In [None]:
# Load the annotated training data (';'-separated CSV)
pth_annotations = pth_data.joinpath('labelling', 'training_data_movers.csv')
df_annotations = pd.read_csv(pth_annotations, sep=';', index_col=None)

In [None]:
# Build, from the annotated rows only:
#   - d_labels: {conv_id: {chunk_id (as str): 0 or 1}}, 0 when the label is 'N'
#   - l_sentences: one record per annotated sentence
d_labels = {}
l_sentences = []

df_labelled = df_annotations.loc[df_annotations['label'].notnull()]
for _, row in df_labelled.iterrows():
    conv_id = row['conv_id']
    chunk_id = str(row['chunk_id'])

    # Update label's dict
    label = 1 if row['label'] != 'N' else 0
    d_labels.setdefault(conv_id, {})[chunk_id] = label

    # Update sentences
    l_sentences.append({
        "sntnce": row['sntnce'],
        'call_convrstn_id': conv_id,
        'chunk_id': chunk_id,
        'sntnce_partcpnt_role': "END_USER",
    })

df_sentences = pd.DataFrame(l_sentences)

In [None]:
# Load the model resources (regex families, tags, stopwords, intent detector)
d_regex, l_tags, d_stopwords, intent_detector = extract_all_model_data(
    pth_util_data,
    split_regex=True,
    **d_params['model'],
)

# Flatten the regex of all families into a single list
l_regex = []
for l_family_regex in d_regex.values():
    l_regex.extend(l_family_regex)

##### B. Compute features

In [None]:
# Build the feature dataset from the annotated sentences and their labels
mover_dataset = extract_convs_features(
    df_sentences,
    d_regex,
    intent_detector,
    d_labels=d_labels,
    d_stopwords=d_stopwords,
    l_tags=l_tags,
)

# Persist the dataset as a pickle file
with open(pth_data / 'training' / 'mover_dataset.pkl', 'wb') as f:
    pickle.dump(mover_dataset, f)

In [None]:
# Summarise the training set: total number of labels, split between
# target 0 and any other target, and the shape of the feature matrix
l_all_labels = [v for d in d_labels.values() for v in d.values()]
n_target_0 = sum(1 for v in l_all_labels if v == 0)
n_target_1 = sum(1 for v in l_all_labels if v != 0)

training_data_desc = f"""
    Size of training dataset: {len(l_all_labels)}
    Size of target 0: {n_target_0}
    Size of target 1: {n_target_1}
    Feature dim: {mover_dataset.X.shape}
"""
print(training_data_desc)