In [331]:
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import BigramCollocationFinder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehamehta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mehamehta/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Defining size of working subset

In [252]:
# Number of leading CSV rows the rest of the notebook works on.
subset = 100_000

In [253]:
# Read only the first `subset` rows instead of parsing the whole file and
# slicing afterwards. Besides being cheaper, this makes `data` a frame that
# owns its values rather than a slice of a larger frame, so adding columns to
# it later cannot raise SettingWithCopyWarning.
# NOTE(review): dtypes are now inferred from these rows only; if a column is
# mixed-type further down the file its dtype here may differ from a full read.
data = pd.read_csv('data/CTQ1_anonymized.csv', index_col=[0], nrows=subset)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,FISC_QTR_VAL,FISC_WEEK_VAL,Group_name,Region_name,Sub_Region_name,Case_number,Chat_Create_date,Chat_Txnsrpt_body
0,2021-Q1,2021-W01,CBS-Commercial,AMERICAS,United States,49861514.0,2/4/2020 20:58,"<p align=center>Chat NAME: {{NAME}}, {{NAME}} ..."
1,2021-Q1,2021-W01,CBS-Commercial,AMERICAS,United States,49771157.0,2/3/2020 21:24,"<p align=center>Chat NAME: {{NAME}}, {{NAME}} ..."
2,2021-Q1,2021-W01,CBS-Commercial,AMERICAS,United States,49769079.0,2/3/2020 20:50,"<p align=center>Chat NAME: {{NAME}}, {{NAME}} ..."
3,2021-Q1,2021-W01,CBS-Commercial,AMERICAS,United States,49687660.0,2/3/2020 1:27,"<p align=center>Chat NAME: {{NAME}}, {{NAME}} ..."
4,2021-Q1,2021-W01,CBS-Commercial,AMERICAS,United States,50007941.0,2/6/2020 13:59,"<p align=center>Chat NAME: {{NAME}}, {{NAME}} ..."


# Separating Agent from Customer

In [267]:
# Elapsed-time markers that separate messages inside a transcript.
# Raw strings are required: '\(' inside a normal string literal is an invalid
# escape sequence, which Python reports as a DeprecationWarning/SyntaxWarning.
# "( <digits>m <digits>s )" -- the digits and the "m" are both optional here,
# but the two spaces around them are not.
_MINUTE_STAMP = re.compile(r'\( [0-9]*m* [0-9]+s \)')
# "( <digits>s )" -- seconds-only marker.
_SECOND_STAMP = re.compile(r'\( [0-9]+s \)')


def chat_parse(body):
    """Split one raw chat transcript into a list of text segments.

    The input is coerced with ``str()``, every ``{{NAME}}`` placeholder and any
    remaining literal ``NAME`` is removed, and the text is then cut at the
    elapsed-time markers.

    Splitting happens in two passes: the whole text is cut at the
    minute-style markers first, and only the segment *before* the first such
    marker is additionally cut at the seconds-only markers. Seconds-only
    markers appearing later in the text are left in place.

    Parameters
    ----------
    body : object
        Raw transcript. Non-string values are stringified, so ``None`` yields
        ``['None']``.

    Returns
    -------
    list of str
        The segments in their original order; always at least one element.
    """
    chat = str(body).replace("{{NAME}}", "").replace("NAME", "")
    # re.split always returns at least one piece, so the unpacking is safe.
    head, *later_chats = _MINUTE_STAMP.split(chat)
    early_chats = _SECOND_STAMP.split(head)
    early_chats.extend(later_chats)
    return early_chats

In [268]:
# Parse every transcript body into its list of text segments.
cleaned_chats = data['Chat_Txnsrpt_body'].apply(chat_parse)

In [269]:
# Spot-check the parsed segments of the transcript labelled 0.
cleaned_chats[0]

['<p align=center>Chat : ,  04, 2020, 12:59:24 (-0800)</p><p align=center>Chat : NA.TS.CLI.CHAT.EN.CORE.COM.OPTLAT</p><p align=center>Agent  V</p>',
 '  : Thank you for contacting   . My Name is . Please give me a moment while I review your case details. ',
 '  : Hello , How are you doing today? ',
 ' : I&#39;m well thank you. ',
 '  : Glad to hear :) ',
 '  : I see that you are experiencing issue with your system as its  is failed, right? ',
 ' : correct. ',
 '  : I apologize for any inconvenience this may have caused you. ',
 '  : As we are connected, I am going to take care of this issue and will make every effort that is possible from my end to fix this issue to your satisfaction ',
 '  : Can you confirm that the system you need support with is SERVICETAGE 7480, with   : SERVICETAG? ',
 ' : Thank you. Yes that information and service tag is correct. ',
 '  : Thank you for your confirmation. Your system still has an active   valid until XX/XX/XXXX. ',
 '  : If we get disconnected fr

In [270]:
# Segment 0 of each parsed transcript is skipped; the remaining segments are
# dealt out alternately, starting with the agent column.
# NOTE(review): this assumes the two speakers strictly alternate. In the
# sample shown for row 0, each column appears to contain messages from both
# speakers, so consecutive messages by one speaker likely break the parity.
# Confirm, and consider tagging the speaker while parsing instead.
agent_turns = slice(1, None, 2)
customer_turns = slice(2, None, 2)
data['Agent_chat'] = cleaned_chats.apply(lambda turns: turns[agent_turns])
data['Customer_chat'] = cleaned_chats.apply(lambda turns: turns[customer_turns])

In [271]:
# Spot-check the segments assigned to the agent column for row 0.
data['Agent_chat'][0]

['  : Thank you for contacting   . My Name is . Please give me a moment while I review your case details. ',
 ' : I&#39;m well thank you. ',
 '  : I see that you are experiencing issue with your system as its  is failed, right? ',
 '  : I apologize for any inconvenience this may have caused you. ',
 '  : Can you confirm that the system you need support with is SERVICETAGE 7480, with   : SERVICETAG? ',
 '  : Thank you for your confirmation. Your system still has an active   valid until XX/XX/XXXX. ',
 ' : yes that is my correct contact info. ',
 ' : 8am-5pm  ',
 '  : Please tell me if you&#39;ve performed any additional steps on your own apart from EPSA test. So that we may skip those and save some time &amp; trouble. ',
 '  : Let me quickly see what steps we can perform. ',
 ' : my user reported fan error message']

In [272]:
# Spot-check the segments assigned to the customer column for row 0.
data['Customer_chat'][0]

['  : Hello , How are you doing today? ',
 '  : Glad to hear :) ',
 ' : correct. ',
 '  : As we are connected, I am going to take care of this issue and will make every effort that is possible from my end to fix this issue to your satisfaction ',
 ' : Thank you. Yes that information and service tag is correct. ',
 '  : If we get disconnected from this chat, can I reach you at {{PHONE}} or email you at {{EMAIL}}? ',
 '  : Could you confirm the best time to reach you along with your time zone? ',
 '  : Thanks for all the info.We should be done in about 14-15 minutes. ',
 ' : I have not do you need the bios revision? ',
 ' : bios version is 1.16.1 ']

# Creating dictionary of keywords

### Unigrams

In [358]:
# Words excluded from the keyword vocabulary: NLTK's English stop-word list
# (`stopwords` is `nltk.corpus.stopwords`) followed by hand-picked
# conversational filler and placeholder terms.
# - "39", "amp" and "quot" are presumably leftovers of the HTML entities
#   (&#39;, &amp;, &quot;) that remain in the transcripts.
# - "i\\x1all" / "i\\x1am" are written with a doubled backslash, so each token
#   contains a literal backslash followed by "x1a", not the control character.
#   NOTE(review): presumably this matches the escaped form produced when the
#   message lists are stringified downstream -- confirm before changing.
# Kept as a list because it is passed to CountVectorizer(stop_words=...).
ignored_words = list(stopwords.words('english')) + '''thank please name phone 39 yes moment email contacting give get need may let reach
                        confirm number chat disconnected hello today could still good know minutes xx hi okay
                        send apologize best work inconvenience thanks already well along concern check
                        follow would use also ok one sure like contact xxxx see ask asking questions steps great
                        fine amp together right back ways done chatting receive go us else case correct time
                        service help allow day understand confirmation anything support assist information tag
                        servicetag customer survey feedback agent servicetage feel free days business mins next
                        cte resolution servicetagized issue issues respond chats 7490 wait i\\x1all i\\x1am
                        quick move user details review prosupport address quot'''.split()

In [359]:
# Count single-word terms in the agent column, ignoring `ignored_words`.
count_vec = CountVectorizer(
    ngram_range=(1, 1),
    stop_words=ignored_words,
)
# Each cell of 'Agent_chat' is stringified as a whole before vectorising.
text_set = [str(chat).lower() for chat in data['Agent_chat']]
tf_result = count_vec.fit_transform(text_set)

# get_feature_names() was deprecated in scikit-learn 1.0 and later removed;
# prefer get_feature_names_out() and fall back for older installations.
feature_names = (
    count_vec.get_feature_names_out()
    if hasattr(count_vec, 'get_feature_names_out')
    else count_vec.get_feature_names()
)

# Sparse-backed frame: same rows/columns as before without materialising a
# dense documents x vocabulary array.
tf_result_df = pd.DataFrame.sparse.from_spmatrix(tf_result, columns=feature_names)

# Per-term totals taken directly from the sparse matrix.
the_sum_s = pd.Series(np.asarray(tf_result.sum(axis=0)).ravel(), index=feature_names)
the_sum_df = pd.DataFrame({'keyword': the_sum_s.index, 'tf_sum': the_sum_s.values})

# include only if it appears more than twice
the_sum_df = the_sum_df[the_sum_df['tf_sum'] > 2].sort_values(by=['tf_sum'], ascending=False)



In [360]:
# Keep only keywords that are longer than two characters.
keyword_is_long_enough = the_sum_df['keyword'].str.len() > 2
my_word_df = the_sum_df[keyword_is_long_enough]

In [361]:
# Twenty most frequent remaining keywords.
my_word_df.head(20)

Unnamed: 0,keyword,tf_sum
44505,system,72608
11766,battery,18923
34703,power,15003
12263,bios,13634
15296,computer,13254
27826,laptop,13224
18457,dispatch,13089
47403,update,11678
13948,center,11600
48557,warranty,11379


### Bigrams

In [362]:
# Tokenise the (stringified, lower-cased) agent column, one token list per row.
text_set_words = [word_tokenize(str(chat).lower()) for chat in data['Agent_chat']]
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_documents(text_set_words)

# Drop bigrams seen fewer than 3 times.
finder.apply_freq_filter(3)

# Drop bigrams containing a word that is shorter than 3 or longer than 15
# characters, or that is in `ignored_words`. The filter runs once per word, so
# test membership against a frozenset instead of scanning the list each time.
ignored_lookup = frozenset(ignored_words)
finder.apply_word_filter(
    lambda w: not 3 <= len(w) <= 15 or w.lower() in ignored_lookup
)

# 200 most frequent surviving bigrams, as tuples and as "w1 w2" strings.
phrase_result = finder.nbest(bigram_measures.raw_freq, 200)
colloc_strings = [' '.join(pair) for pair in phrase_result]

In [363]:
# Show the selected bigrams (up to 200 tuples).
phrase_result

[('active', 'valid'),
 ('file', 'transfer'),
 ('hard', 'drive'),
 ('power', 'button'),
 ('transfer', 'succeeded.'),
 ('physical', 'damage'),
 ('docking', 'station'),
 ('tech', 'expert'),
 ('operating', 'system'),
 ('accidental', 'damage'),
 ('error', 'code'),
 ('swollen', 'battery'),
 ('factory', 'installed'),
 ('hard', 'reset'),
 ('enter', 'key'),
 ('installed', 'operating'),
 ('blue', 'screen'),
 ('external', 'monitor'),
 ('repair', 'center'),
 ('personally', 'look'),
 ('staying', 'connected'),
 ('power', 'supply'),
 ('inactive', 'state'),
 ('onsite', 'tech'),
 ('rsm', 'approval'),
 ('error', 'message'),
 ('provided', 'excellent'),
 ('power', 'cord'),
 ('take', 'care'),
 ('device', 'manager'),
 ('safe', 'mode'),
 ('validation', 'code'),
 ('bios', 'update'),
 ('bios', 'version'),
 ('rest', 'assured'),
 ('handling', 'multiple'),
 ('remote', 'access'),
 ('power', 'adapter'),
 ('session', 'active'),
 ('really', 'appreciate'),
 ('successfully', 'transferred'),
 ('lat', '5480'),
 ('brand',

### Creating combined vocabulary of unigrams and bigrams

In [364]:
# Fixed vocabulary: the filtered unigrams first, then the bigram strings.
my_vocabulary = [*my_word_df['keyword'].tolist(), *colloc_strings]

In [365]:
# TF-IDF weights of the fixed vocabulary over the customer column.
# NOTE(review): the bigram entries come from word_tokenize and may contain
# punctuation (e.g. a trailing "."); presumably the vectoriser's default token
# pattern never produces such terms, leaving those columns all-zero -- verify.
vec = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), vocabulary=my_vocabulary)
text_set = [str(chat).lower() for chat in data['Customer_chat']]
tf_idf = vec.fit_transform(text_set)

# get_feature_names() was deprecated in scikit-learn 1.0 and later removed;
# prefer get_feature_names_out() and fall back for older installations.
tfidf_columns = (
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)

# NOTE(review): toarray() builds a dense documents x vocabulary array; if
# memory becomes a problem, pd.DataFrame.sparse.from_spmatrix is an option.
result_tfidf = pd.DataFrame(tf_idf.toarray(), columns=tfidf_columns)

In [366]:
# Display the TF-IDF frame (one row per transcript, one column per vocabulary term).
result_tfidf

Unnamed: 0,system,battery,power,bios,computer,laptop,dispatch,update,center,warranty,...,slow performance,changes made,data loss,dell command,got everything,even though,opti 3070,software changes,boot menu,download option
0,0.000000,0.000000,0.0,0.297022,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.084890,0.147757,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.113225,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.045040,0.000000,0.0,0.079348,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.180197,0.0,0.176219,0.000000,0.17875,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.154081,0.000000,0.0,0.000000,0.0,0.000000,0.129788,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,0.172788,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
