In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 100)
import matplotlib.pyplot as plt
from google.cloud import bigquery


In [None]:
%%bigquery mob_base

-- Base extract: rows of the most recent snapshot that pass the filters below, keeping
-- account, subscriber and price-plan attributes. The %%bigquery magic above stores the
-- result in the DataFrame `mob_base`.
SELECT 
    BACCT_BUS_BACCT_NUM AS BAN,
    pi_prod_instnc_resrc_str AS MSISDN,
    pp_bus_pp_catlg_itm_cd AS PRICE_PLAN_CD,
    pp_catlg_itm_nm AS PRICE_PLAN_TXT,
    pp_recur_chrg_amt,
    pp_avail_for_sale_ind AS PLAN_AVAIL_FOR_SALE,
    pp_sls_start_ts AS PRICE_PLAN_START_DATE,
    pp_sls_end_ts AS PRICE_PLAN_END_DATE,
    pp_cust_facing_ind AS PLAN_CAN_BE_SOLD_TO_CUSTOMERS
FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht` 
WHERE 
    prod_instnc_ts = (SELECT MAX(prod_instnc_ts) FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht` )  -- Get most recent date in snapshot table
    AND bacct_brand_id=1 -- 1 For Telus
    AND pi_prod_instnc_typ_cd = 'C'-- Cellular products
    AND bacct_bacct_typ_cd = 'I' -- Consumer
    AND bacct_bacct_subtyp_cd = 'R' -- Account Sub type
    AND bacct_bacct_stat_cd = 'O' -- Billing account open  
    AND pi_prod_instnc_stat_cd = 'A' -- Status of product instance
    AND bacct_billg_mthd_cd ='POST' -- Post Pay customers only
    -- NOTE(review): the disabled filter below would be true for almost every row as written
    -- (a name would have to contain both TABLET and WATCH to be excluded); use AND if it is re-enabled.
    --AND (UPPER(pp_catlg_itm_nm) NOT LIKE '%TABLET%' OR UPPER(pp_catlg_itm_nm) NOT LIKE '%WATCH%')
    -- Exclude plan codes listed in the whsia SOC reference table.
    -- NOTE(review): NOT IN matches nothing if the subquery returns a NULL whsia_soc - confirm the table has none.
    AND pp_bus_pp_catlg_itm_cd NOT IN (SELECT whsia_soc FROM `cto-wln-sa-data-pr-bb5283.ref_table.bq_whsia_soc_codes`)

In [None]:
# Size check of the extracted base: (rows, columns).
mob_base.shape

In [None]:
# Load the SOC code sheet from a CSV given by a relative path (resolved against the working directory).
soc_cd_df = pd.read_csv('soc_codes.csv')

In [None]:
# Row 0 of the raw sheet supplies the column names used from here on;
# the last five columns of the sheet are ignored.
headers = soc_cd_df.iloc[0, :-5]
print(headers)
# Keep the rows from position 2 onwards and apply the new names in one step.
# rename() returns a new frame, so nothing is renamed in place on an iloc slice.
clean_soc_cd_df = soc_cd_df.iloc[2:, :-5].rename(columns=dict(headers))

In [None]:
# Display the cleaned SOC code table (one column per renamed header).
clean_soc_cd_df

In [None]:
%%bigquery data_allowance_df 
-- Load the whole lookup table; it is later joined on pp_bus_pp_catlg_itm_cd
-- and its data_allowance_gb column is used in the summary.
SELECT * 
FROM `cto-wln-sa-data-pr-bb5283.customer_personas_features.price_plan_cd_to_data_allowance`

In [None]:
# Peek at a single row to see the columns of the data-allowance lookup.
data_allowance_df.head(1)

## Check codes in one column do not exist in other columns

In [None]:
# For every ordered pair of different columns, report when a non-missing code of
# column i also appears in column j. NaN cells are removed before comparing, so
# the result does not depend on how many missing cells either column has.
num_cols = clean_soc_cd_df.shape[1]
for i in range(num_cols):
    col_i = clean_soc_cd_df.columns[i]
    print(col_i)
    codes_i = clean_soc_cd_df[col_i].dropna()
    for j in range(num_cols):
        col_j = clean_soc_cd_df.columns[j]
        if col_i == col_j:
            continue
        codes_j = clean_soc_cd_df[col_j].dropna()
        if codes_i.isin(codes_j).any():
            print('Codes in ' + col_i + ' are in ' + col_j)
                
        

In [None]:
# Ad-hoc check that reuses `i` and `j` left over from the loop in the previous cell
# (the last pair iterated): number of entries of column i that are found in column j.
clean_soc_cd_df[clean_soc_cd_df.columns[i]].isin(clean_soc_cd_df[clean_soc_cd_df.columns[j]]).sum()

In [None]:
# Number of missing entries in the Plan_5G column.
clean_soc_cd_df['Plan_5G'].isna().sum()

In [None]:
# Start with every row labelled 'other'; later cells overwrite the label for matching plan codes.
mob_base['price_plan_type']='other'

In [None]:
# Attach the data-allowance lookup to every row. The left join keeps all mob_base
# rows, including those whose plan code has no entry in the lookup.
mob_base = mob_base.merge(
    data_allowance_df,
    how='left',
    left_on='PRICE_PLAN_CD',
    right_on='pp_bus_pp_catlg_itm_cd',
)

In [None]:
# Label each row with the SOC sheet column that lists its plan code.
# Columns are applied in order, so a code present in several columns keeps the last one.
for col in clean_soc_cd_df.columns:
    # Drop NaN cells first so a missing PRICE_PLAN_CD is never matched against them.
    plan_codes = clean_soc_cd_df[col].dropna()
    mob_base.loc[mob_base.PRICE_PLAN_CD.isin(plan_codes), 'price_plan_type'] = col

In [None]:
# Spot-check the first rows after the merge and labelling.
mob_base.head()

In [None]:
# Per plan type: count of non-null MSISDN values plus mean/min/max of the recurring
# charge and of the data allowance. Aggregations are given by name throughout;
# passing the np.mean callable to groupby aggregation is deprecated in recent pandas.
mob_base.groupby('price_plan_type').agg(
    num_customers=pd.NamedAgg(column='MSISDN', aggfunc='count'),
    avg_price=pd.NamedAgg(column='pp_recur_chrg_amt', aggfunc='mean'),
    min_price=pd.NamedAgg(column='pp_recur_chrg_amt', aggfunc='min'),
    max_price=pd.NamedAgg(column='pp_recur_chrg_amt', aggfunc='max'),
    avg_gb_data=pd.NamedAgg(column='data_allowance_gb', aggfunc='mean'),
    min_gb_data=pd.NamedAgg(column='data_allowance_gb', aggfunc='min'),
    max_gb_data=pd.NamedAgg(column='data_allowance_gb', aggfunc='max'),
)

In [None]:
# Distinct plan codes that are still labelled 'other', as a one-column frame.
other_plan_codes = mob_base.loc[mob_base.price_plan_type == 'other', 'PRICE_PLAN_CD'].unique()
other_plans_df = pd.DataFrame(other_plan_codes, columns=['other_plans'])
# Place it next to the SOC sheet columns; rows are aligned on the index, and
# positions without a value in a column are filled with NaN.
price_plan_cd_df = pd.concat([clean_soc_cd_df, other_plans_df], axis=1)

# Plot SOC Code changes over time

In [None]:
# BigQuery client built with no explicit arguments; used below to run the templated query.
client=bigquery.Client()

In [None]:
# SQL template: number of rows per snapshot timestamp (prod_instnc_ts) after 2022-01-01,
# using the same brand/account/product filters as the base extract and restricted to a
# list of plan codes. `{price_plan_cd_str}` is filled with str.format() and must be a
# comma-separated list of quoted codes.
query = \
'''
SELECT 
  prod_instnc_ts,
  COUNT(*) as num_customers
FROM `cio-datahub-enterprise-pr-183a.ent_cust_cust.bq_prod_instnc_snpsht` 
WHERE 
  prod_instnc_ts > '2022-01-01'
  AND pp_bus_pp_catlg_itm_cd IN ({price_plan_cd_str})
  AND bacct_brand_id=1 -- 1 For Telus
  AND pi_prod_instnc_typ_cd = 'C'-- Celluluar products 
  AND bacct_bacct_typ_cd = 'I' -- Consumer
  AND bacct_bacct_subtyp_cd = 'R' -- Account Sub type
  AND bacct_bacct_stat_cd = 'O' -- Billing account open  
  AND pi_prod_instnc_stat_cd = 'A' -- Status of product instance
  AND bacct_billg_mthd_cd ='POST' -- Post Pay customers only
GROUP BY prod_instnc_ts
ORDER BY prod_instnc_ts

'''

In [None]:
def remove_sudden_spikes(df, col_name, threshold = 1.15):
    """Flatten sudden upward jumps in one column of ``df``.

    Rows are walked in positional order. Whenever a value is greater than
    ``threshold`` times the (already smoothed) value of the previous row, it
    is printed and overwritten with that previous value.

    Args:
        df: DataFrame to smooth. It is modified in place.
        col_name: Name of the column to smooth.
        threshold: Ratio to the previous row above which a value is treated
            as a spike.

    Returns:
        The same ``df`` object, after smoothing.
    """
    # Resolve the column by name instead of assuming it is the second column,
    # and walk row positions so a non-default index does not break the lookup.
    col_pos = df.columns.get_loc(col_name)
    for pos in range(1, len(df)):
        previous = df.iloc[pos - 1, col_pos]
        current = df.iloc[pos, col_pos]
        if current > previous * threshold:
            print(current)
            df.iloc[pos, col_pos] = previous
    return df

In [None]:
# Run the count-over-time query once per plan-type column and stack the results.
frames = []
for col in price_plan_cd_df.columns:
    # Quoted, comma-separated code list for the SQL IN (...) clause.
    # NOTE: the codes are interpolated directly into the SQL text.
    plan_codes = price_plan_cd_df[col].dropna()
    price_plan_cd_string = "'" + "', '".join(plan_codes) + "'"
    query_cd = query.format(price_plan_cd_str=price_plan_cd_string)
    df = client.query(query_cd).to_dataframe()
    df['price_plan_type'] = col
    df = remove_sudden_spikes(df, col_name='num_customers', threshold=1.2)
    frames.append(df)
# DataFrame.append was removed in pandas 2.0; concatenate once after the loop
# instead of growing the frame on every iteration.
df_combined = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# One line per plan type: num_customers against the snapshot timestamp.
plt.figure(figsize=(10, 10))
for plan_type in df_combined.price_plan_type.unique():
    subset = df_combined.loc[df_combined.price_plan_type == plan_type]
    plt.plot(subset['prod_instnc_ts'], subset['num_customers'], label=plan_type)
plt.legend()
plt.show()

In [None]:
# NOTE(review): `df_5g_plus` and `num_5g_plus_df` are not defined in any visible cell, and the
# x and y series are taken from two different frames - confirm the intended source.
plt.figure(figsize=(8,8))
plt.plot(df_5g_plus['prod_instnc_ts'], num_5g_plus_df['num_5g_plus_users'])
plt.title('5G+ Adoption over time')
plt.ylabel('num customers')

In [None]:
# NOTE(review): `df_5g` is not defined in any visible cell - confirm where it is built.
plt.figure(figsize=(8,8))
plt.plot(df_5g['prod_instnc_ts'], df_5g['num_5g_users'])
# This figure plots the num_5g_users series, so it is titled "5G" rather than "5G+".
plt.title('5G Adoption over time')
plt.ylabel('num customers')

## Explore 'Other' Plans

In [None]:
# Re-label rows by keyword found in the upper-cased plan name. Keywords are applied
# in order, so a name matching several of them keeps the last label.
plan_name_upper = mob_base.PRICE_PLAN_TXT.str.upper()
for keyword in ('WATCH', 'TABLET', 'MOBILITY FOR GOOD'):
    # na=False: rows without a plan name count as non-matching instead of putting
    # NaN into the boolean mask, which .loc would reject.
    mob_base.loc[plan_name_upper.str.contains(keyword, na=False), 'price_plan_type'] = keyword

In [None]:
# Distinct values of the available-for-sale flag.
mob_base.PLAN_AVAIL_FOR_SALE.unique()

In [None]:
# Inspect the rows of one specific plan, matched on the exact plan name.
mob_base.loc[mob_base.PRICE_PLAN_TXT=='Voice 30-Unlimited Nationwide']

In [None]:
# Plan names among rows labelled 'other' whose PLAN_AVAIL_FOR_SALE flag is 'N',
# ranked by their count of non-null BAN values; top 100 shown.
other_not_for_sale = mob_base.loc[
    (mob_base.price_plan_type == 'other') & (mob_base.PLAN_AVAIL_FOR_SALE == 'N')
]
(
    other_not_for_sale
    .groupby('PRICE_PLAN_TXT')
    .count()
    .reset_index()[['PRICE_PLAN_TXT', 'BAN']]
    .sort_values(by='BAN', ascending=False)
    .head(100)
)

In [None]:
# (rows, columns) of the rows whose plan name contains 'VOICE', case-insensitively.
mob_base[mob_base.PRICE_PLAN_TXT.str.upper().str.contains('VOICE')].shape

In [None]:
# (rows, columns) of the rows whose plan name contains 'WATCH', case-insensitively.
mob_base[mob_base.PRICE_PLAN_TXT.str.upper().str.contains('WATCH')].shape

In [None]:
# Distinct plan names that contain 'TABLET', case-insensitively.
mob_base[mob_base.PRICE_PLAN_TXT.str.upper().str.contains('TABLET')].PRICE_PLAN_TXT.unique()

In [None]:
# All rows whose plan name contains 'SENIOR', case-insensitively.
# NOTE(review): this displays every matching row, including the BAN and MSISDN columns;
# consider limiting the output before sharing the notebook.
mob_base[mob_base.PRICE_PLAN_TXT.str.upper().str.contains('SENIOR')]