In [None]:
import pandas as pd
import numpy as np

Merge usage data for customers who roamed in the USA with the CCAI call dataset

In [None]:
%%bigquery num_usa_roamers_df

-- Count the distinct IMSIs with the 302220 prefix that produced app events
-- under mobile country code 310 or 311 during the study window.
-- The result alias spelling is kept as-is because it is the output column name.
SELECT
    COUNT(DISTINCT imsi_num) AS num_usa_romers
FROM
    `cio-datahub-enterprise-pr-183a.ent_usage_unrated_ott.bq_ott_app_event`
WHERE
    event_dt BETWEEN "2022-03-01" AND "2022-04-20"
    AND imsi_num LIKE '302220%'
    AND mcc_id IN ('310', '311') -- customers who roamed in the US




In [None]:
# Display the result frame produced by the roamer-count query cell.
num_usa_roamers_df

In [None]:
%%bigquery df

-- Purpose: attach contact-centre voice calls to the daily download usage of
-- subscribers who roamed in the US, keeping calls placed on the usage day or
-- within the 5 days after it. The result is stored in the DataFrame `df`.
--
-- NOTE(review): the final result is NOT one row per call. A call is repeated
-- once for every usage day whose 5-day window contains the call date, and
-- again once for every unnested entity joined from `sentiment_score`.

-- CTE 1: one row per (imsi_num, event_dt) with that day's summed download volume.
WITH USA_roamer_usage_data AS (

  SELECT 
    # MIN(event_dt) as first_date,
    # MAX(event_dt) as most_recent_date,
    imsi_num,     
    event_dt,
    --app_category_nm,
    -- Total per IMSI per day; the /1000000.0 presumably converts bytes to MB (TODO confirm source unit).
    SUM(dl_volume_qty/1000000.0) as dl_usage_mb --Get total usage by each imsi
  FROM
    `cio-datahub-enterprise-pr-183a.ent_usage_unrated_ott.bq_ott_app_event`
  WHERE 
    event_dt between "2022-03-01" AND "2022-04-20" 
    AND imsi_num like '302220%'
    AND (mcc_id = '311' OR mcc_id = '310') -- get customers who roamed in the US

  GROUP BY imsi_num, event_dt -- , app_category_nm
),

-- CTE 2: one row per (call, entity) pair, because UNNEST expands the repeated
-- `convrstn_entity` field; a call with several entities yields several scores.
sentiment_score AS (
  SELECT
    call_convrstn_id,
    converstn.entity_sntmnt_scor_qty as sentiment_score
  FROM 
    `roaming-pr-66a1b0.ent_cust_intractn_ccai.bq_voice_call_insights`,  UNNEST(convrstn_entity) as converstn
)

SELECT 
  A.*,
  B.MSISDN,
  B.MOB_BAN AS BAN,
  C.call_convrstn_id,
  C.call_convrstn_dt,
  C.tot_durtn_min_qty, 
  C.convrstn_transcript_txt,
  C.SPEECH_TOPIC,
  C.convrstn_entity,
  D.sentiment_score,
  C.BUSINESS_DOMAIN
FROM USA_roamer_usage_data A  
-- Map each IMSI to its MSISDN / BAN. Although this is a LEFT JOIN, the INNER
-- JOIN below compares B.MSISDN, so usage rows without a mapping cannot match
-- and are dropped from the result.
LEFT JOIN `cto-wln-sa-data-pr-bb5283.customer_personas_features.cust_wls_mnh_mapping_TB` B  
ON A.imsi_num = B.IMSI
INNER JOIN `roaming-pr-66a1b0.ent_cust_intractn_ccai.bq_voice_call_insights` C -- Data from Mar 1 to April 20
-- Keep calls made on the usage day or up to 5 days afterwards.
ON B.MSISDN = C.usr_tel_num AND C.call_convrstn_dt BETWEEN A.event_dt AND DATE_ADD(A.event_dt, INTERVAL 5 DAY)
-- One joined row per entity-level sentiment score of the call (NULL score when the call has none).
LEFT JOIN sentiment_score D
ON C.call_convrstn_id = D.call_convrstn_id
-- Drops call records whose own conversation id is NULL.
WHERE C.call_convrstn_id IS NOT NULL


In [None]:
# Preview the joined rows ordered by subscriber, usage day, then call date.
df.sort_values(
    ['imsi_num', 'event_dt', 'call_convrstn_dt'],
)
    

In [None]:
# Keep the first row encountered for each call id and renumber the index from 0.
df_2 = (
    df
    .drop_duplicates(subset=['call_convrstn_id'], keep='first')
    .reset_index(drop=True)
)

In [None]:
# Inspect the frame that holds one row per call_convrstn_id.
df_2

In [None]:
# Label every de-duplicated call as TECHNICAL or NON-TECHNICAL.
# A call is TECHNICAL when it is in the wireless business domain AND either
# its speech topic or its transcript mentions a technical keyword.
# `na=False` makes a missing topic/transcript count as "no match" explicitly;
# otherwise the NaN produced for a missing topic can suppress a genuine
# transcript match when the two tests are OR-ed together.
is_wireless = df_2['BUSINESS_DOMAIN'].isin(['wireless', 'WIRELESS'])
topic_is_technical = df_2['SPEECH_TOPIC'].str.contains('TECH|REPAIR', regex=True, na=False)
transcript_is_technical = df_2['convrstn_transcript_txt'].str.contains(
    'SLOW|DATA|ISSUES|NETWORK|DROPPING|DROPPED|DROP|CALLS|NO SERVICE|RESTART|NO CALLS',
    regex=True,
    case=False,  # transcript keywords are matched case-insensitively
    na=False,
)

df_2['call_subject'] = 'NON-TECHNICAL'
df_2.loc[is_wireless & (topic_is_technical | transcript_is_technical), 'call_subject'] = 'TECHNICAL'

# Per-subject summary: call count plus average duration, sentiment and usage.
# `nlargest` already returns rows in descending order, so no separate sort is
# needed. The rename is assigned back so `stats_df` keeps the readable headers
# (previously the renamed frame was displayed and then thrown away).
stats_df = (
    df_2.groupby('call_subject')
    .agg({'imsi_num': 'count', 'tot_durtn_min_qty': 'mean', 'sentiment_score': 'mean', 'dl_usage_mb': 'mean'})
    .nlargest(10, 'imsi_num')
    .reset_index()
    .rename(columns={
        'imsi_num': 'number of calls',
        'tot_durtn_min_qty': 'avg_call_duration',
        'sentiment_score': 'avg_sentiment_score',
        'dl_usage_mb': 'avg_dl_usage_mb',
    })
)
stats_df

In [None]:
# Truncate wide text columns (such as transcripts) to 10 characters when frames are displayed.
pd.options.display.max_colwidth = 10

In [None]:
# Wireless calls where BOTH the speech topic and the transcript look technical
# (a stricter filter than the TECHNICAL label, which accepts either signal).
df_2.loc[
    df_2['BUSINESS_DOMAIN'].isin(['wireless', 'WIRELESS'])
    & df_2['SPEECH_TOPIC'].str.contains('TECH|REPAIR', regex=True)
    & df_2['convrstn_transcript_txt'].str.contains(
        'SLOW|DATA|ISSUES|NETWORK|DROPPING|DROPPED|DROP|CALLS|NO SERVICE|RESTART|NO CALLS',
        regex=True,
        case=False,
    )
]

In [None]:
# Write the joined query result to a local pickle file; presumably a cache so
# the BigQuery job does not have to be re-run in a later session.
df.to_pickle('usa_roaming_data.pkl')

In [None]:
# Reload the frame from the local pickle file; this rebinds `df`.
# NOTE(review): unpickling can execute arbitrary code, so only load a file
# that this notebook itself produced.
df = pd.read_pickle('usa_roaming_data.pkl')

Total number of calls between March 1 and April 20, 2022 that have a speech topic

In [None]:
# Number of rows in the joined frame.
df.shape[0]

In [None]:
# Number of distinct calls that carry a speech topic.
# The joined frame can hold several rows for the same call (one per matching
# usage day and per sentiment entity), so counting rows would overstate the
# number of calls; count unique call ids instead.
df.loc[df['SPEECH_TOPIC'].notna(), 'call_convrstn_id'].nunique()

In [None]:
# Distinct speech topics, among wireless-domain rows, whose label contains TECH or REPAIR.
wireless_tech_rows = df[
    df['BUSINESS_DOMAIN'].isin(['wireless', 'WIRELESS'])
    & df['SPEECH_TOPIC'].str.contains('TECH|REPAIR', regex=True)
]
wireless_tech_rows['SPEECH_TOPIC'].unique()

In [None]:
# Default every row to NON-TECHNICAL, then flag wireless rows whose speech
# topic mentions TECH or REPAIR.
df['call_subject'] = 'NON-TECHNICAL'
technical_rows = (
    df['BUSINESS_DOMAIN'].isin(['wireless', 'WIRELESS'])
    & df['SPEECH_TOPIC'].str.contains('TECH|REPAIR', regex=True)
)
df.loc[technical_rows, 'call_subject'] = 'TECHNICAL'

In [None]:
# Pull the single count out of the first row / first column of the query result.
num_usa_roamers_df.to_numpy()[0, 0]

In [None]:
# Summarise calls per call_subject: call count plus average duration,
# sentiment and download usage.
# `df` can contain several rows for the same call, so it is reduced to one row
# per call id first (the same rule used to build `df_2`); otherwise the
# "number of calls" column would count joined rows rather than calls.
# NOTE(review): sentiment_score and dl_usage_mb then come from the first row
# kept for each call - confirm that this is the value wanted per call.
calls_df = df.drop_duplicates('call_convrstn_id')

# `nlargest` already orders by descending count. The rename is part of the
# assignment so `stats_df` keeps the readable headers for the cells that
# display it later.
stats_df = (
    calls_df.groupby('call_subject')
    .agg({'imsi_num': 'count', 'tot_durtn_min_qty': 'mean', 'sentiment_score': 'mean', 'dl_usage_mb': 'mean'})
    .nlargest(10, 'imsi_num')
    .reset_index()
    .rename(columns={
        'imsi_num': 'number of calls',
        'tot_durtn_min_qty': 'avg_call_duration',
        'sentiment_score': 'avg_sentiment_score',
        'dl_usage_mb': 'avg_dl_usage_mb',
    })
)
stats_df

In [None]:
# Headline numbers followed by the current summary table.
total_roamers = num_usa_roamers_df.values[0][0]
# Count distinct call ids: the joined frame can repeat a call on several rows,
# so its length is not the number of calls.
total_calls = df['call_convrstn_id'].nunique()

print('number of USA roamers: ' + str(total_roamers))
print('number of calls to client agents: ' + str(total_calls))

display(stats_df)

In [None]:
# Ten most frequent speech topics with call count and average duration,
# sentiment and download usage.
# `df` can contain several rows for the same call, so it is reduced to one row
# per call id first (the same rule used to build `df_2`); otherwise the
# "number of calls" column would count joined rows rather than calls.
# NOTE(review): sentiment_score and dl_usage_mb then come from the first row
# kept for each call - confirm that this is the value wanted per call.
calls_df = df.drop_duplicates('call_convrstn_id')

# `nlargest` already orders by descending count. The rename is part of the
# assignment so the following cell shows the readable headers.
stats_df = (
    calls_df.groupby('SPEECH_TOPIC')
    .agg({'imsi_num': 'count', 'tot_durtn_min_qty': 'mean', 'sentiment_score': 'mean', 'dl_usage_mb': 'mean'})
    .nlargest(10, 'imsi_num')
    .reset_index()
    .rename(columns={
        'imsi_num': 'number of calls',
        'tot_durtn_min_qty': 'avg_call_duration',
        'sentiment_score': 'avg_sentiment_score',
        'dl_usage_mb': 'avg_dl_usage_mb',
    })
)
stats_df

In [None]:
# Display the most recently computed summary table.
stats_df

In [None]:
# List the distinct business-domain labels present in the data.
df['BUSINESS_DOMAIN'].unique()

In [None]:
# Print every wireless-domain speech topic whose label mentions TECH or REPAIR.
# Topics are cast to str first so a missing topic does not break the `in` test.
wireless_topics = (
    df.loc[df['BUSINESS_DOMAIN'].isin(['wireless', 'WIRELESS']), 'SPEECH_TOPIC']
    .unique()
    .astype(str)
)
for topic in wireless_topics:
    if 'TECH' in topic or 'REPAIR' in topic:
        print(topic)

In [None]:
# Transcripts of the rows whose speech topic is exactly REQUEST_TECH_SUPPORT.
df.loc[df['SPEECH_TOPIC'].eq('REQUEST_TECH_SUPPORT'), 'convrstn_transcript_txt']