# Importing libraries and initial config

## Libraries

In [None]:
!pip install contractions bertopic #joblib==1.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
### Importing necessary libraries
import os
import pandas as pd
import numpy as np
import nltk
import string
import re
#stopwords
from nltk.corpus import stopwords
#tokenizer
from nltk.tokenize import RegexpTokenizer
#lemmatizer
from nltk.stem import WordNetLemmatizer
#Stemmer
from nltk.stem.porter import PorterStemmer

#contractions
import contractions

from bertopic import BERTopic

#gensims
import gensim
from gensim import models
from gensim.utils import simple_preprocess

# colab
from google.colab import auth
from google.cloud import bigquery

In [None]:
# Fetch the NLTK resources used further down: the WordNet data for the
# lemmatizer and the stopword lists.
for nltk_package in ('wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(nltk_package)

# Widen the DataFrame display limits for this notebook. Each option is set exactly once:
# when the same key is repeated inside a single set_option call, the last value silently wins.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 50)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Authentication

In [None]:
# GCP project identifiers
proj_id = 'pg-duke-student-capstone-v1'
proj_name = 'pg-duke-student-capstone-v1'
proj_num = '545682952716'

# Authenticate this Colab session, then create a BigQuery client bound to the project
auth.authenticate_user()
client = bigquery.Client(project=proj_id)

In [None]:
# Mount Google Drive at /content/drive; the input, checkpoint and output paths
# configured below all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Config

In [None]:
# Global configuration: every path and file name the notebook reads is defined here.
# vintage = "2023-03"

# directory that directly contains the input CSVs (it is joined with the file names below)
inp_path = "/content/drive/Shareddrives/DukeCapstone/data/Test CSVs for scoring/"

# file names of the NPS and CSAT survey exports to be scored
nps_inp_file = "NPS March 2023 all sends.csv"
csat_inp_file = "CSAT March 2023 all sends.csv"

# directory that contains the saved BERTopic model checkpoints
inp_path_chkpts = "/content/drive/Shareddrives/DukeCapstone/results/Model Checkpoints/"

# checkpoint names, one model per group (promoter, detractor, passive, csat)
prom_chkpt = "promoters_checkpoint"
det_chkpt = "det_checkpoint"
passive_chkpt = "passive_checkpoint"
csat_chkpt = "csat_checkpoint"

# directory that contains the previously scored comments
inp_scored_path = "/content/drive/Shareddrives/DukeCapstone/results/Final saved data/"

# file name of the previously scored comments
scored_inp_file  = "all_outputs_scored.csv"

In [None]:
# NLTK's English stopwords followed by project-specific terms to ignore
# (brand/domain words, filler words and a few apparent misspellings).
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 'use', 'policygenius', 'policy', 'genius', 'insurance',
    "pg", 'none', 'na', "able", "already", "also",
    "was", "were", "thus", "although", "always", "bc", "because",
    "definitely", "could", "couldnt", 'would', "wouldnt", "wasy", "grear",
    "aving", "nope", "goo", "lol", "nah", "hiccup", "imani", "like",
]

In [None]:
# add all functions here
def preprocess(text_col):
    """Clean a pandas Series of free-text comments.

    Every element goes through the following steps, in this order:
      1. lowercase the text, treating '/' as a word separator
      2. expand contractions (``contractions.fix`` on each whitespace token)
      3. remove every character contained in ``string.punctuation``
      4. replace each run of non-letter characters with a single space
      5. remove the words in the module-level ``stop_words`` list,
         except 'not' and 'no', which are kept
      6. lemmatize each token with ``WordNetLemmatizer``
      7. drop tokens shorter than 3 characters

    Parameters
    ----------
    text_col : pandas.Series
        Series of strings such as ``df['text']``. Every element must be a
        ``str``; drop missing values before calling.

    Returns
    -------
    pandas.Series
        The cleaned text, one space-joined string per input element, with the
        same index as ``text_col``.
    """
    # Everything below is invariant across rows, so it is built once per call:
    # - a set makes the stopword test constant-time instead of a list scan,
    # - one lemmatizer instance is reused for every token,
    # - the local name does not shadow the imported nltk `stopwords` module.
    kept_negations = {'not', 'no'}
    active_stopwords = {sw for sw in stop_words if sw not in kept_negations}
    lemmatizer = WordNetLemmatizer()
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(text):
        # 1. lowercase; '/' becomes a space so "a/b" is read as two words
        text = ' '.join(part.lower() for part in text.split('/'))
        # 2. expand contractions token by token
        text = ' '.join(contractions.fix(word) for word in text.split())
        # 3. strip punctuation characters
        text = text.translate(punctuation_table)
        # 4. anything that is not a letter (digits, leftover symbols) becomes a separator
        tokens = re.sub("[^a-zA-Z]+", " ", text).split()
        # 5. remove stopwords (negations were excluded from the set above)
        tokens = [w for w in tokens if w not in active_stopwords]
        # 6. lemmatize
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
        # 7. keep only words with at least 3 characters
        return ' '.join(w for w in tokens if len(w) >= 3)

    return text_col.apply(_clean)

# Data Ingestion

In [None]:
# Load the NPS and CSAT survey exports that are going to be scored
df_nps, df_csat = (
    pd.read_csv(os.path.join(inp_path, csv_name))
    for csv_name in (nps_inp_file, csat_inp_file)
)

In [None]:
# Preview the first rows of both inputs (NPS first, then CSAT)
display(df_nps.head(), df_csat.head())

Unnamed: 0,Survey Request ID,Survey Request Created At,Survey Request Dispatch At,Person ID,Person Created At,Person Email,Person Name,Latest Event,Latest Event Timestamp,Survey Response ID,Survey Response Score,Survey Response Comment,Survey Response Created At,Survey Response Notes,Survey Response Tags,Import ID,Import Size,Import Created At,Import Dispatched At,API,Channel,application_id,assignee_name,Browser,conversation_id,Country,customer_id,Delighted Source,Device Type,internal_case_tags,Operating System,Page,Page URL,product,product_type,purchase_location,Referrer Site,Referrer URL,survey_origin,ticket_id,ticket_tags,ticket_url,ticket_via,zendesk_requester_id,zendesk_ticket_id
0,3840465071,2023-03-01 12:44:52,2023-03-01 12:44:52,1319481689,2023-03-01 12:44:52,,,surveyed,2023-03-01 12:44:54,,,,,,,,,,,Yes,Email,6283736,Customer_Engagement_,,8b624ce0-97a4-4a06-881b-766c2d,,64189460,,,"[""CST"", ""Umbrella Only"", ""spousal"", ""Manual Ca...",,,,,life,,,,,,,,,,
1,3840466716,2023-03-01 12:45:57,2023-03-01 12:45:57,1319481904,2023-03-01 12:45:57,,,surveyed,2023-03-01 12:45:58,,,,,,,,,,,Yes,Email,6167246,Keenan Nessl,,2041bd09-edf5-4f78-af02-dd6193,,63891199,,,"[""EST"", ""Umbrella Only"", ""spousal"", ""Action: A...",,,,,life,,,,,,,,,,
2,3840467446,2023-03-01 12:46:26,2023-03-01 12:46:26,1319482023,2023-03-01 12:46:26,,,surveyed,2023-03-01 12:46:27,,,,,,,,,,,Yes,Email,6303816,Stephanie Sanchez,,9afbc723-9d9c-4fc3-b5e0-d2717e,,64233245,,,"[""Action: Reply to email"", ""EST"", ""Umbrella On...",,,,,life,,,,,,,,,,
3,3840467868,2023-03-01 12:46:51,2023-03-01 12:46:51,1319482086,2023-03-01 12:46:51,,,opened,2023-03-02 14:06:40,,,,,,,,,,,Yes,Email,6299880,Customer_Engagement_,,c2ce5b57-544c-45de-a53b-6624d2,,64225358,,,"[""CST"", ""Umbrella Only"", ""spousal"", ""Manual Ca...",,,,,life,,,,,,,,,,
4,3840471396,2023-03-01 12:48:52,2023-03-01 12:48:52,1319482418,2023-03-01 12:48:52,,,opened,2023-03-01 17:19:45,,,,,,,,,,,Yes,Email,6302514,Customer_Engagement_,,ed7c306d-1798-4b6c-8fc1-ce9adc,,64230716,,,"[""EST"", ""Umbrella Only"", ""spousal"", ""Manual Ca...",,,,,life,,,,,,,,,,


Unnamed: 0,Survey Request ID,Survey Request Created At,Survey Request Dispatch At,Person ID,Person Created At,Person Email,Person Name,Latest Event,Latest Event Timestamp,Survey Response ID,Survey Response Score,Survey Response Comment,Survey Response Created At,Survey Response Notes,Survey Response Tags,Import ID,Import Size,Import Created At,Import Dispatched At,API,Channel,application_id,assignee_name,conversation_id,customer_id,Delighted Browser,Delighted Device Type,Delighted Operating System,Delighted Source,internal_case_tags,lcs_prior_to_withdrawn,withdrawn_lifecycle_stage,Additional Question: Thanks for your response. Please tell us a bit more about how we could improve. (ID: text_ZH0ENW)
0,3839106812,2/28/2023 6:28,3/1/2023 6:28,1318823178,2/28/2023 6:28,,,opened,3/1/2023 6:28,,,,,,,,,,,Yes,Email,6017092,,880749b3-0588-4a16-9a3f-833696,53679412,,,,Email,,application_in_underwriting,underwriting_application_closed_withdrawn,
1,3839131418,2/28/2023 6:52,3/1/2023 6:52,1318831382,2/28/2023 6:52,,,suppressed,3/1/2023 6:53,,,,,,,,,,,Yes,Email,5983604,,1deb66cd-3e25-42e8-96ff-a47524,63398547,,,,Email,,application_in_underwriting,underwriting_application_closed_withdrawn,
2,3839172169,2/28/2023 7:23,3/1/2023 7:23,1318848721,2/28/2023 7:23,,,suppressed,3/1/2023 7:23,,,,,,,,,,,Yes,Email,5403362,,4b383ea6-eb1a-4aaa-b559-cf86db,61931594,,,,Email,,application_requested,withdrawn,
3,3839175854,2/28/2023 7:26,3/1/2023 7:26,1318850739,2/28/2023 7:26,,,surveyed,3/1/2023 7:26,,,,,,,,,,,Yes,Email,6308995,,ed6f5198-e06f-4e01-b688-726905,64243154,,,,Email,,application_requested,withdrawn,
4,3839176823,2/28/2023 7:27,3/1/2023 7:27,1318851030,2/28/2023 7:27,,,surveyed,3/1/2023 7:27,,,,,,,,,,,Yes,Email,6300749,,7aa36f8f-14d5-40d7-9d39-54534b,64227157,,,,Email,,application_not_started,withdrawn,


In [None]:
# read in the previously scored comments; they provide the topic-description mapping
# and are concatenated with the new results at the end of the notebook
df_scored_prev = pd.read_csv(os.path.join(inp_scored_path,scored_inp_file))

# summarise columns, dtypes and non-null counts of the previously scored data
df_scored_prev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16745 entries, 0 to 16744
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   comment      16724 non-null  object
 1   group        16745 non-null  object
 2   topic        16745 non-null  int64 
 3   topic_desc   16745 non-null  object
 4   top_n_words  16745 non-null  object
 5   date         15615 non-null  object
dtypes: int64(1), object(5)
memory usage: 785.0+ KB


# Data Pre-processing

In [None]:
# NPS: keep only the responses that contain a comment
df_nps_notnull = df_nps[df_nps['Survey Response Comment'].notna()]

# NPS: show every row whose Survey Response ID occurs more than once
# (an empty frame means there are no duplicates)
display(df_nps_notnull[df_nps_notnull['Survey Response ID'].duplicated(keep=False)])

Unnamed: 0,Survey Request ID,Survey Request Created At,Survey Request Dispatch At,Person ID,Person Created At,Person Email,Person Name,Latest Event,Latest Event Timestamp,Survey Response ID,Survey Response Score,Survey Response Comment,Survey Response Created At,Survey Response Notes,Survey Response Tags,Import ID,Import Size,Import Created At,Import Dispatched At,API,Channel,application_id,assignee_name,Browser,conversation_id,Country,customer_id,Delighted Source,Device Type,internal_case_tags,Operating System,Page,Page URL,product,product_type,purchase_location,Referrer Site,Referrer URL,survey_origin,ticket_id,ticket_tags,ticket_url,ticket_via,zendesk_requester_id,zendesk_ticket_id


In [None]:
# CSAT: keep only the responses that contain a comment
df_csat_notnull = df_csat[df_csat['Survey Response Comment'].notna()]

# CSAT: show every row whose Survey Response ID occurs more than once
# (an empty frame means there are no duplicates)
display(df_csat_notnull[df_csat_notnull['Survey Response ID'].duplicated(keep=False)])

Unnamed: 0,Survey Request ID,Survey Request Created At,Survey Request Dispatch At,Person ID,Person Created At,Person Email,Person Name,Latest Event,Latest Event Timestamp,Survey Response ID,Survey Response Score,Survey Response Comment,Survey Response Created At,Survey Response Notes,Survey Response Tags,Import ID,Import Size,Import Created At,Import Dispatched At,API,Channel,application_id,assignee_name,conversation_id,customer_id,Delighted Browser,Delighted Device Type,Delighted Operating System,Delighted Source,internal_case_tags,lcs_prior_to_withdrawn,withdrawn_lifecycle_stage,Additional Question: Thanks for your response. Please tell us a bit more about how we could improve. (ID: text_ZH0ENW)


In [None]:
# Add the parsed response timestamp, its day/month/year parts and the NPS segment.
# `assign` builds a new frame instead of writing columns one by one into the
# result of an earlier `dropna` (which can raise pandas' SettingWithCopyWarning),
# and re-running the cell always yields the same columns.
df_nps_notnull = df_nps_notnull.assign(
    date=lambda d: pd.to_datetime(d['Survey Response Created At']),
    day=lambda d: d['date'].dt.day,
    month=lambda d: d['date'].dt.month,
    year=lambda d: d['date'].dt.year,
    # bins are right-inclusive: score <= 6 -> detractor, (6, 8] -> passive,
    # (8, 10] -> promoter; anything outside these bins becomes NaN
    score_categorized=lambda d: pd.cut(
        d['Survey Response Score'],
        bins=[-np.inf, 6, 8, 10],
        labels=["detractor", "passive", "promoter"],
    ),
)

In [None]:
# Add the parsed response timestamp and its day/month/year parts.
# `assign` builds a new frame instead of writing columns one by one into the
# result of an earlier `dropna` (which can raise pandas' SettingWithCopyWarning),
# and re-running the cell always yields the same columns.
df_csat_notnull = df_csat_notnull.assign(
    date=lambda d: pd.to_datetime(d['Survey Response Created At']),
    day=lambda d: d['date'].dt.day,
    month=lambda d: d['date'].dt.month,
    year=lambda d: d['date'].dt.year,
)

In [None]:
# One independent copy of the NPS rows per score_categorized segment
df_prom, df_det, df_passive = (
    df_nps_notnull[df_nps_notnull['score_categorized'] == segment].copy()
    for segment in ('promoter', 'detractor', 'passive')
)

In [None]:
# Run the text cleaning on the comment column of each group
df_prom_clean, df_det_clean, df_passive_clean, df_csat_clean = (
    preprocess(frame["Survey Response Comment"])
    for frame in (df_prom, df_det, df_passive, df_csat_notnull)
)

In [None]:
# Convert each cleaned Series into a numpy array to feed to the topic models
prom_inp_for_bert, det_inp_for_bert, passive_inp_for_bert, csat_inp_for_bert = map(
    np.array,
    (df_prom_clean, df_det_clean, df_passive_clean, df_csat_clean),
)

# Model Scoring

In [None]:
# Load the saved BERTopic checkpoint of every group from the checkpoint directory
prom_model, det_model, passive_model, csat_model = [
    BERTopic.load(os.path.join(inp_path_chkpts, checkpoint_name))
    for checkpoint_name in (prom_chkpt, det_chkpt, passive_chkpt, csat_chkpt)
]

In [None]:
# generate predictions: each group's model scores that group's cleaned comments.
# Every call is unpacked into (topics, probs); only the topics_* values are used
# in the cells that follow, the probs_* values are not read again in this notebook.
topics_prom, probs_prom = prom_model.transform(prom_inp_for_bert)
topics_det, probs_det = det_model.transform(det_inp_for_bert)
topics_passive, probs_passive = passive_model.transform(passive_inp_for_bert)
topics_csat, probs_csat = csat_model.transform(csat_inp_for_bert)

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Build the lookup of topic descriptions from the previously scored data.
# "group" is part of the de-duplication key because every group has its own model and the
# lookup is joined on ["group", "topic"]; without it, one group's row could be dropped
# as a "duplicate" of another group's identical (topic, topic_desc, top_n_words).
# reset_index(drop=True) discards the old row labels instead of keeping them as an extra
# "index" column.
df_mapping = (
    df_scored_prev[["group", "topic", "topic_desc", "top_n_words"]]
    .drop_duplicates()
    .sort_values(by=["group", "topic"])
    .reset_index(drop=True)
)

In [None]:
# One result frame per group: original comment, predicted topic id, response date, group label
results_prom = pd.DataFrame({'comment': df_prom['Survey Response Comment'], 'topic': topics_prom, "date":df_prom["date"], "group":"promoter"})
results_det = pd.DataFrame({'comment': df_det['Survey Response Comment'], 'topic': topics_det, "date":df_det["date"], "group":"detractor"})
results_passive = pd.DataFrame({'comment': df_passive['Survey Response Comment'], 'topic': topics_passive, "date":df_passive["date"], "group":"passive"})
results_csat = pd.DataFrame({'comment': df_csat_notnull['Survey Response Comment'], 'topic': topics_csat, "date":df_csat_notnull["date"], "group":"csat"})

# Attach the topic descriptions to each group's results and stack all groups.
# All four groups are merged, including the passive results built above, and the
# frames are concatenated once instead of growing a DataFrame inside a loop.
# The left join keeps every scored comment even when no description is found for its topic.
results_all = pd.concat(
    [
        group_results.merge(df_mapping, how="left", on=["group", "topic"])
        for group_results in (results_prom, results_det, results_passive, results_csat)
    ],
    ignore_index=True,
)[["comment", "group", "topic", "topic_desc", "top_n_words", "date"]]
results_all.head()

Unnamed: 0,comment,group,topic,topic_desc,top_n_words,date
0,"Easy process, no hassle, with knowledgeable st...",promoter,-1,Outlier,customer service - process easy - easy process...,2023-03-01 13:29:06
1,I work in the life insurance industry (and for...,promoter,-1,Outlier,customer service - process easy - easy process...,2023-03-01 15:40:37
2,Easy to work with\nTimely follw ups and updates,promoter,1,1 - Quick and easy process,easy process - process easy - easy easy - quic...,2023-03-01 19:47:06
3,They stayed on top of everything and helped me...,promoter,-1,Outlier,customer service - process easy - easy process...,2023-03-02 01:22:51
4,Great service and coordination,promoter,-1,Outlier,customer service - process easy - easy process...,2023-03-01 19:41:09
5,I really like the updates received throughout ...,promoter,1,1 - Quick and easy process,easy process - process easy - easy easy - quic...,2023-03-02 19:21:18
6,Very easy experience. Was a better quote than ...,promoter,1,1 - Quick and easy process,easy process - process easy - easy easy - quic...,2023-03-01 21:14:41
7,Friendly and helpful staff. They did a great j...,promoter,0,0 - Excellent customer service,customer service - great service - great custo...,2023-03-01 20:10:45
8,Great customer service. Very responsive to our...,promoter,2,2 - Responsive CSRs,responsive question - helpful responsive - res...,2023-03-02 16:42:59
9,Very good customer service and easy process,promoter,-1,Outlier,customer service - process easy - easy process...,2023-03-06 21:53:14


# Save outputs

In [None]:
# Append the newly scored comments to the previously scored ones.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating the row labels of both inputs.
final_scored = pd.concat([df_scored_prev, results_all], ignore_index=True)
# NOTE(review): saving is disabled, so this cell does not persist anything.
# To write the result use DataFrame.to_csv (DataFrame has no `.csv` method):
# final_scored.to_csv("final_scored.csv", index=False)