<a href="https://colab.research.google.com/github/millie-sky/Python-tutorials/blob/main/matching_intents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install FuzzyWuzzy
!pip install fuzzywuzzy



In [2]:
# For numeric computation with arrays and dataframes
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', None)  # Set to None for unlimited rows

# import the fuzzywuzzy libery for text matching
from fuzzywuzzy import process

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# For connecting to GCP
from google.colab import auth
auth.authenticate_user()
print('Authenticated')



Authenticated


In [3]:
# reading call reasons
query1 = '''
SELECT call_reason_2
FROM `skyuk-uk-ds-csg-prod.Exec_Dashboard.dashboard_final`
group by 1
'''

df1 = pd.io.gbq.read_gbq(query1, project_id= "skyuk-uk-ds-csg-prod", dialect='standard')

In [4]:
df1

Unnamed: 0,call_reason_2
0,Manage Visit
1,Competitor
2,Billing
3,Contract End
4,Confirm Cancellation
...,...
68,Technical Issue
69,Reduce Cost
70,Delivery
71,Billing / Debt


In [5]:
# reading digital intents
query2 = '''
SELECT event_detail FROM `skyuk-uk-ds-csg-prod.CSG_Insight_Test.customer_interaction_hub`
where (event_type like 'Voice%' or event_type like 'Web%' or event_type like 'App%' or event_type  like 'Interactive%')
group by 1
'''
df2 = pd.io.gbq.read_gbq(query2, project_id= "skyuk-uk-customer-pres-prod", dialect='standard')

In [6]:
# Take a look at the dataframe
df2

Unnamed: 0,event_detail
0,SKG_STO_Save_Retention_ASM
1,HELP TV_NSS
2,SKG_FSD_SER_WFH_SkyQ_MultiSkill_Voucher
3,SKG_NCL_SER_WFH_SkyQ_Multiskill_Mobile_FTTP_FOM
4,SKG_DNM_SER_NonWFM
...,...
1284,SKG_DUN_Save_BBCoE_Llama_Inbound
1285,VQ_SCT_DIALLER_CAM_ROI_SALATT
1286,SKG_TSF_SER_HomeMove_CSI
1287,SKG_FSL_ROI_Glass_Save


In [7]:
# turn the intents from a df column into a list
call_reasons = df1['call_reason_2'].dropna().tolist()
digital_intents = df2['event_detail'].dropna().tolist()

In [31]:
# have a look at the first 10 digital intents
digital_intents[:10]

['SKG_STO_Save_Retention_ASM',
 'HELP TV_NSS',
 'SKG_FSD_SER_WFH_SkyQ_MultiSkill_Voucher',
 'SKG_NCL_SER_WFH_SkyQ_Multiskill_Mobile_FTTP_FOM',
 'SKG_DNM_SER_NonWFM',
 'SKG_FSL_Upgrades_Premium_Outbound',
 'SKG_DNM_SER_WFH_Access_SkyQ_Multiskill_TPCease_VIP',
 'SKG_SOF_SER_SkyQ_Multiskill_WIFIMax',
 'SKG_NCL_SER_SkyQ_Multiskill_Mobile_FTTP_FOM',
 'MANAGE_PAYMENT']

In [52]:
### The prefix are not very useful in digital intents, let's try to clean it with regula expression

import re

# Function to clean digital intents
def clean_digital_intent(intent):
    # Remove specific prefixes
    intent = re.sub(r'^[A-Z]{3}_[A-Z]+_', '_', intent)
    # Remove patterns of "_" followed by three capital letters in the middle
    intent = re.sub(r'_[A-Z]{3}_', ' ', intent)
    # Remove patterns of "_" followed by three capital letters in the suffix
    intent = re.sub(r'_[A-Z]{3}\b', '', intent)
    # Replace underscores with spaces
    intent = intent.replace('_', ' ')
    return intent.strip()  # Remove leading and trailing spaces

# store cleaned digital intents
cleaned_digital_intents = [clean_digital_intent(intent) for intent in digital_intents]

In [53]:
# have a look at the first 10 cleaned digital intents
cleaned_digital_intents[:10]

['Save Retention',
 'HELP TV',
 'WFH SkyQ MultiSkill Voucher',
 'WFH SkyQ Multiskill Mobile FTTP',
 'NonWFM',
 'Upgrades Premium Outbound',
 'WFH Access SkyQ Multiskill TPCease',
 'SkyQ Multiskill WIFIMax',
 'SkyQ Multiskill Mobile FTTP',
 'MANAGE PAYMENT']

In [56]:
# Perform fuzzy matching

matches = []

for intent, cleaned_intent in zip(digital_intents, cleaned_digital_intents):
    best_match, score = process.extractOne(cleaned_intent, call_reasons)
    matches.append({"Digital Intent": intent, "Cleaned Digital Intent": cleaned_intent, "Best Call Reason Match": best_match, "Score": score})

# Convert matches to a DataFrame
matched_df = pd.DataFrame(matches)



In [57]:
# Sort matched_df by score from large to small
matched_df_sorted = matched_df.sort_values(by='Score', ascending=False)

In [58]:
matched_df_sorted

Unnamed: 0,Digital Intent,Cleaned Digital Intent,Best Call Reason Match,Score
868,SKG_DNM_SER_Debt,Debt,Debt,100
525,SKG_FSM_SER_Debt,Debt,Debt,100
1113,SKG_DNM_SER_DEBT_VIP,DEBT,Debt,100
461,CANCEL,CANCEL,Cancel,100
903,SKG_FSB_SER_Debt,Debt,Debt,100
...,...,...,...,...
115,SKG_FSD_NOW,,Manage Visit,0
20,SKG_HSSLEE_OTD,,Manage Visit,0
475,SKG_FSB_NOW,,Manage Visit,0
457,SKG_HSSLIV_OTD,,Manage Visit,0


In [61]:
# Select rows where Score < 50
matched_df_sorted[matched_df_sorted['Score'] < 50]

Unnamed: 0,Digital Intent,Cleaned Digital Intent,Best Call Reason Match,Score
126,SKG_LIV_SER_WFH_MOB_HOOD2,WFH HOOD2,Wifi,49
451,SKG_DNM_SER_WFH_HOOD1,WFH HOOD1,Wifi,49
683,SKG_FSY_Sales_Suspense_Outbound,Sales Suspense Outbound,Single Sports,49
892,SKG_DCS_SAL_Stream_Loyalty_Messaging,Stream Loyalty Messaging,Visit related,49
106,SKG_LIV_SER_WFH_HOOD2,WFH HOOD2,Wifi,49
...,...,...,...,...
115,SKG_FSD_NOW,,Manage Visit,0
20,SKG_HSSLEE_OTD,,Manage Visit,0
475,SKG_FSB_NOW,,Manage Visit,0
457,SKG_HSSLIV_OTD,,Manage Visit,0


In [14]:
## save the whole matched results in an excel
matched_df_sorted.to_excel('mached_intent.xlsx')
## tab on the left panel -- go to 'Files' to download this