<a href="https://colab.research.google.com/github/millie-sky/Python-tutorials/blob/main/matching_intents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install FuzzyWuzzy
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
# For numeric computation with arrays and dataframes
import numpy as np
import pandas as pd
# pd.set_option('display.max_rows', None)  # Set to None for unlimited rows

# import the fuzzywuzzy libery for text matching
from fuzzywuzzy import process

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# For connecting to GCP
from google.colab import auth
auth.authenticate_user()
print('Authenticated')



Authenticated


In [5]:
# reading call reasons
query1 = '''
SELECT call_reason_2
FROM `skyuk-uk-ds-csg-prod.Exec_Dashboard.dashboard_final`
group by 1
'''

df1 = pd.io.gbq.read_gbq(query1, project_id= "skyuk-uk-ds-csg-prod", dialect='standard')

In [6]:
df1

Unnamed: 0,call_reason_2
0,Manage Visit
1,Competitor
2,Billing
3,Contract End
4,Confirm Cancellation
...,...
68,Technical Issue
69,Reduce Cost
70,Delivery
71,Billing / Debt


In [7]:
# reading digital intents
query2 = '''
SELECT event_detail FROM `skyuk-uk-ds-csg-prod.CSG_Insight_Test.customer_interaction_hub`
where (event_type like 'Voice%' or event_type like 'Web%' or event_type like 'App%' or event_type  like 'Interactive%')
group by 1
'''
df2 = pd.io.gbq.read_gbq(query2, project_id= "skyuk-uk-customer-pres-prod", dialect='standard')

In [46]:
# Take a look at the dataframe
df2

Unnamed: 0,event_detail
0,SKG_STO_Save_Retention_ASM
1,HELP TV_NSS
2,SKG_FSD_SER_WFH_SkyQ_MultiSkill_Voucher
3,SKG_FSY_Sales_Third Party_Outbound
4,SKG_FSL_Broadband_Recontracting_Outbound
...,...
1284,Direct to SG2_CP_UK_Support
1285,SKG_TSF_SER_Mobile_BU_Billing
1286,Direct to SG2_CP_UK_Core
1287,SKG_NCL_Sell_DORT_Inbound_Home


In [27]:
# turn the intents from a df column into a list
call_reasons = df1['call_reason_2'].dropna().tolist()
digital_intents = df2['event_detail'].dropna().tolist()

In [28]:
# have a look at the first 10 digital intents
digital_intents[:10]

['SKG_STO_Save_Retention_ASM',
 'HELP TV_NSS',
 'SKG_FSD_SER_WFH_SkyQ_MultiSkill_Voucher',
 'SKG_FSY_Sales_Third Party_Outbound',
 'SKG_FSL_Broadband_Recontracting_Outbound',
 'SKG_LIV_SER_HomeMove_BU_Mobile',
 'SKG_DNM_SER_NonWFM',
 'SKG_NCL_SER_WFH_SkyQ_Multiskill_Mobile_FTTP_FOM',
 'SKG_FSL_Upgrades_Premium_Outbound',
 'SKG_DNM_SER_WFH_Access_SkyQ_Multiskill_TPCease_VIP']

In [41]:
### The prefix are not very useful in digital intents, let's try to clean it with regula expression

import re

# Function to clean digital intents
def clean_digital_intent(intent):
    # Remove patterns of three capital letters followed by "_" in the prefixes
    intent = re.sub(r'[A-Z]{3}_', '', intent)
    # Remove suffixes
    intent = re.sub(r'_[A-Z_]+$', '', intent)
    # Replace underscores with spaces
    intent = intent.replace('_', ' ')
    return intent

# store cleaned digital intents
cleaned_digital_intents = [clean_digital_intent(intent) for intent in digital_intents]

In [42]:
# have a look at the first 10 cleaned digital intents
cleaned_digital_intents[:10]

['Save Retention',
 'HELP TV',
 'SkyQ MultiSkill Voucher',
 'Sales Third Party Outbound',
 'Broadband Recontracting Outbound',
 'HomeMove BU Mobile',
 'NonWFM',
 'SkyQ Multiskill Mobile',
 'Upgrades Premium Outbound',
 'Access SkyQ Multiskill TPCease']

In [43]:
# Perform fuzzy matching

matches = []

for intent, cleaned_intent in zip(digital_intents, cleaned_digital_intents):
    best_match, score = process.extractOne(cleaned_intent, call_reasons)
    matches.append({"Digital Intent": intent, "Cleaned Digital Intent": cleaned_intent, "Best Call Reason Match": best_match, "Score": score})

# Convert matches to a DataFrame
matched_df = pd.DataFrame(matches)

In [44]:
matched_df

Unnamed: 0,Digital Intent,Cleaned Digital Intent,Best Call Reason Match,Score
0,SKG_STO_Save_Retention_ASM,Save Retention,Activation,50
1,HELP TV_NSS,HELP TV,Other Technical Issue,51
2,SKG_FSD_SER_WFH_SkyQ_MultiSkill_Voucher,SkyQ MultiSkill Voucher,Sky Id,60
3,SKG_FSY_Sales_Third Party_Outbound,Sales Third Party Outbound,other,54
4,SKG_FSL_Broadband_Recontracting_Outbound,Broadband Recontracting Outbound,Broadband Connection,86
...,...,...,...,...
1283,Direct to SG2_CP_UK_Support,Direct to SG2 CP UK Support,Reduce Cost,50
1284,SKG_TSF_SER_Mobile_BU_Billing,Mobile BU Billing,Billing,90
1285,Direct to SG2_CP_UK_Core,Direct to SG2 CP UK Core,Order,68
1286,SKG_NCL_Sell_DORT_Inbound_Home,Sell DInbound Home,Home Move,86


In [45]:
matched_df.to_excel('mached_intent.xlsx')
## tab on the left panel -- go to 'Files' to download this