# DATA EXPLORATION 

Installing dependencies

In [None]:
!pip install autocorrect
!pip install wordcloud

Loading dependencies

In [11]:
from data_exploring.data_exploring_functions import *
import plotly.express as px
import nltk
from wordcloud import WordCloud
import os

# <strong>Loading and cleaning data</strong>

In [2]:
# Folder holding the input files: <current working directory>/data_exploring/data.
base_datapath = os.path.join(os.getcwd(), 'data_exploring', 'data')

# Base names of the two input files; they are handed to loading_data together with 'csv'.
file_satisfaction = 'satisfaction_ratings'
file_response = 'NPS_responses'

# loading_data comes from data_exploring_functions. Per the notebook author's notes, this step
# applies spell correction, lower-casing and stopword removal, and returns the loaded dataset
# with an extra column named "processed comment".
data_satisfaction = loading_data(file_satisfaction, base_datapath, 'csv')
data_responses = loading_data(file_response, base_datapath, 'csv')

Data visualization

In [3]:
# Preview the first three rows of the satisfaction ratings.
# NOTE(review): the rendered output shows requester names and e-mail addresses;
# consider clearing it before the notebook is shared.
data_satisfaction.head(n=3)

Unnamed: 0.1,Unnamed: 0,Requester,User Id,Email,Ticket Id,Brand,Group,Assignee,Satisfaction,Comment,Survey Date,processed comment
0,0,Carlos Enrique brito,401415540751,carlosenrique1989n@hotmail.com,595380,OFFCORSS,Soporte OFFCORSS,CTS Transporte,good,,2020-08-12T10:32:10-05:00,
1,1,Lizeth Herrera,401737616791,lkhr328@gmail.com,600778,OFFCORSS,Soporte OFFCORSS,Devoluciones,good,,2020-08-12T10:32:09-05:00,
2,2,Carolina Sanchez,401569843672,caritoss1@hotmail.com,598181,OFFCORSS,Soporte OFFCORSS,CTS Transporte,bad,Me parece malo porque llame a explicar el prob...,2020-08-12T10:29:05-05:00,parece malo llame explicar problema compra 5 m...


In [4]:
# Preview the first three rows of the NPS responses.
# NOTE(review): the rendered output shows respondent names and e-mail addresses;
# consider clearing it before the notebook is shared.
data_responses.head(n=3)

Unnamed: 0.1,Unnamed: 0,Survey Date,Name,User Id,Email,Rating,Classification,Comment,Response Date,processed comment
0,0,2020-08-10,Johanna Vargas T,400842393092,vhannyt@gmail.com,7,passive,,2020-08-10 11:38,
1,1,2020-08-10,Maria Carolina Parra Rincón,400932763371,mariacarolinaparrar@gmail.com,1,detractor,Atender a las reclamaciones a tiempo para evit...,2020-08-10 11:59,atender reclamaciones tiempo evitar recibir me...
2,2,2020-08-10,Luz Marina González Pulido,400646948972,14a793667beb4637bc67b25241ee1150@ct.vtex.com.br,0,detractor,"No leen con atención, por favor dictar capacit...",2020-08-10 12:11,"leen atención , favor dictar capacitaciones co..."


<strong>Understanding repeated ticket Ids</strong>

In [5]:
# Compare the number of distinct ticket ids with the total number of rows.
unique_ticket_count = data_satisfaction['Ticket Id'].nunique(dropna=False)
print(f'there are {unique_ticket_count} unique tickets')
print(f'there are {len(data_satisfaction)} tickets')

# Number of rows per ticket id, most repeated ticket first.
rows_per_ticket = data_satisfaction.groupby('Ticket Id').size()
duplicate_tickets = rows_per_ticket.sort_values(ascending=False).reset_index(name='tickets count')

# Take the ticket stored under index label 3 of that ranking and show all of its rows.
example_ticket_id = duplicate_tickets.loc[3, 'Ticket Id']
duplicate_example = data_satisfaction[data_satisfaction['Ticket Id'] == example_ticket_id]

for _ in range(2):
    print('---------------------------')
print('Example of duplicate Tickets')
duplicate_example

there are 2657 unique tickets
there are 2774 tickets
---------------------------
---------------------------
Example of duplicate Tickets


Unnamed: 0.1,Unnamed: 0,Requester,User Id,Email,Ticket Id,Brand,Group,Assignee,Satisfaction,Comment,Survey Date,processed comment
2254,2254,leidy huertas,400578886512,leidy.kari96@gmail.com,583147,OFFCORSS,Soporte OFFCORSS,CTS Transporte,good,BUENO POR QUE ME RESPONDIERON UN POCO TIEMPO Y...,2020-07-16T14:12:28-05:00,bueno respondieron tiempo dieron solucion
2255,2255,leidy huertas,400578886512,leidy.kari96@gmail.com,583147,OFFCORSS,Soporte OFFCORSS,CTS Transporte,good,BUENO POR QUE ME RESPONDIERON UN POCO TIEMPO Y...,2020-07-16T14:12:00-05:00,bueno respondieron tiempo dieron solucion
2256,2256,leidy huertas,400578886512,leidy.kari96@gmail.com,583147,OFFCORSS,Soporte OFFCORSS,CTS Transporte,good,,2020-07-16T14:11:36-05:00,


A ticket Id can be repeated either because the same comment was recorded more than once or because several steps of one request were tracked. Therefore, rows that repeat both the ticket Id and the comment are removed.

In [6]:
# A row counts as a duplicate when both its ticket id and its processed comment repeat;
# only the first occurrence is kept. The frame is modified in place.
duplicate_key_columns = ['Ticket Id', 'processed comment']
data_satisfaction.drop_duplicates(subset=duplicate_key_columns, inplace=True)

<strong> Missing values analysis </strong>. The percentage of missing comments is computed, and the missing comments are then replaced with empty strings.

In [7]:
# Report the share of missing "processed comment" values in each dataset.
# The two stray `.isna().sum()` statements of the original were dropped: their results
# were discarded, so they had no effect. The percentage is the number of missing
# values divided by the number of rows, times 100.
for message, frame in (
    ('Percentage of empty comments for satisfaction data is ', data_satisfaction),
    ('Percentage of empty comments for responses data is ', data_responses),
):
    missing_mask = frame['processed comment'].isna()
    percent_nan = 100 * missing_mask.sum() / missing_mask.count()
    print(message, round(percent_nan, 2), '%')

Percentage of empty comments for satisfaction data is  61.3 %
Percentage of empty comments for responses data is  48.47 %


In [8]:
# Replace missing processed comments with empty strings in both datasets,
# so that the text functions used later always receive a string.
for frame in (data_satisfaction, data_responses):
    frame['processed comment'] = frame['processed comment'].fillna('')

# <strong> Analysis of comments by groups </strong>

In [9]:
# Processed comments of the satisfaction data, split by the 'Satisfaction' label.
satisfaction_label = data_satisfaction['Satisfaction']
satisfaction_good = data_satisfaction.loc[satisfaction_label == 'good', 'processed comment']
satisfaction_bad = data_satisfaction.loc[satisfaction_label == 'bad', 'processed comment']

# Processed comments of the NPS responses, split by the 'Classification' label.
response_class = data_responses['Classification']
responses_promoter = data_responses.loc[response_class == 'promoter', 'processed comment']
responses_passive = data_responses.loc[response_class == 'passive', 'processed comment']
responses_detractor = data_responses.loc[response_class == 'detractor', 'processed comment']

<strong> Data Satisfaction good-bad N-grams analysis </strong>

In [41]:
# Most frequent n-grams (n = 1, 2, 3) of the comments rated "good", obtained with the
# get_top_n_words function from data_exploring_functions (Natesh function).
# Each result is stored in grams_df as a pair of word/frequency columns and plotted.
# NOTE(review): assigning a list as a new column requires it to have as many items as
# grams_df has rows — confirm get_top_n_words always returns 20 items here.

# 1-gram for satisfaction_good; this call also creates grams_df.
common_words = get_top_n_words(satisfaction_good, 20, 1)
grams_df = pd.DataFrame(common_words, columns=['word_satgood_1gram', 'freq_satgood_1gram'])
fig = px.bar(grams_df, x='word_satgood_1gram', y='freq_satgood_1gram',
             title='Top 20 1-gram from good satisfaction comments',
             labels={'freq_satgood_1gram': 'Frequency', 'word_satgood_1gram': '1-gram'})
fig.show()

# 2-grams and 3-grams for satisfaction_good.
satgood_titles = {
    2: 'Top 20 2-grams from good satisfaction comments',
    3: 'Top 20 3-grams from good satisfaction comments',
}
for n_gram, title in satgood_titles.items():
    word_col = f'word_satgood_{n_gram}gram'
    freq_col = f'freq_satgood_{n_gram}gram'
    common_words = get_top_n_words(satisfaction_good, 20, n_gram)
    grams_df[word_col] = [entry[0] for entry in common_words]
    grams_df[freq_col] = [entry[1] for entry in common_words]
    # Fix: the original labels dict listed the word column twice, so the 'Frequency'
    # entry was overwritten and the y-axis kept the raw column name. The frequency
    # column is now the key for the y-axis label.
    fig = px.bar(grams_df, x=word_col, y=freq_col, title=title,
                 labels={freq_col: 'Frequency', word_col: f'{n_gram}-gram'})
    fig.show()

In [44]:
# Most frequent n-grams (n = 1, 2, 3) of the comments rated "bad", obtained with the
# get_top_n_words function from data_exploring_functions (Natesh function).
# Each result is added to the existing grams_df as a word/frequency column pair and plotted.
satbad_titles = {
    1: 'Top 20 1-gram from bad satisfaction comments',
    2: 'Top 20 2-grams from bad satisfaction comments',
    3: 'Top 20 3-grams from bad satisfaction comments',
}
for n_gram, title in satbad_titles.items():
    word_col = f'word_satbad_{n_gram}gram'
    freq_col = f'freq_satbad_{n_gram}gram'
    common_words = get_top_n_words(satisfaction_bad, 20, n_gram)
    grams_df[word_col] = [entry[0] for entry in common_words]
    grams_df[freq_col] = [entry[1] for entry in common_words]
    fig = px.bar(grams_df, x=word_col, y=freq_col, title=title,
                 labels={freq_col: 'Frequency', word_col: f'{n_gram}-gram'})
    fig.show()

<strong> Data Response promoter-passive-detractor N-grams analysis </strong>

In [46]:
# Most frequent n-grams (n = 1, 2, 3) of the promoter comments in the response data,
# obtained with the get_top_n_words function from data_exploring_functions (Natesh function).
# Each result is added to the existing grams_df as a word/frequency column pair and plotted.
promoter_titles = {
    1: 'Top 20 1-gram from promoter in response data comments',
    2: 'Top 20 2-grams from promoter  response data comments',
    3: 'Top 20 3-grams from promoter  response data comments',
}
for n_gram, title in promoter_titles.items():
    word_col = f'word_respromoter_{n_gram}gram'
    freq_col = f'freq_respromoter_{n_gram}gram'
    common_words = get_top_n_words(responses_promoter, 20, n_gram)
    grams_df[word_col] = [entry[0] for entry in common_words]
    grams_df[freq_col] = [entry[1] for entry in common_words]
    fig = px.bar(grams_df, x=word_col, y=freq_col, title=title,
                 labels={freq_col: 'Frequency', word_col: f'{n_gram}-gram'})
    fig.show()

In [48]:
# Most frequent n-grams (n = 1, 2, 3) of the passive comments in the response data,
# obtained with the get_top_n_words function from data_exploring_functions (Natesh function).
# Each result is added to the existing grams_df as a word/frequency column pair and plotted.
passive_titles = {
    1: 'Top 20 1-gram from passive in response data comments',
    2: 'Top 20 2-grams from passive response data comments',
    3: 'Top 20 3-grams from passive  response data comments',
}
for n_gram, title in passive_titles.items():
    word_col = f'word_respassive_{n_gram}gram'
    freq_col = f'freq_respassive_{n_gram}gram'
    common_words = get_top_n_words(responses_passive, 20, n_gram)
    grams_df[word_col] = [entry[0] for entry in common_words]
    grams_df[freq_col] = [entry[1] for entry in common_words]
    fig = px.bar(grams_df, x=word_col, y=freq_col, title=title,
                 labels={freq_col: 'Frequency', word_col: f'{n_gram}-gram'})
    fig.show()

In [49]:
# Most frequent n-grams (n = 1, 2, 3) of the detractor comments in the response data,
# obtained with the get_top_n_words function from data_exploring_functions (Natesh function).
# Each result is added to the existing grams_df as a word/frequency column pair and plotted.
detractor_titles = {
    1: 'Top 20 1-gram from detractor in response data comments',
    2: 'Top 20 2-grams from detractor response data comments',
    3: 'Top 20 3-grams from detractor  response data comments',
}
for n_gram, title in detractor_titles.items():
    word_col = f'word_resdetractor_{n_gram}gram'
    freq_col = f'freq_resdetractor_{n_gram}gram'
    common_words = get_top_n_words(responses_detractor, 20, n_gram)
    grams_df[word_col] = [entry[0] for entry in common_words]
    grams_df[freq_col] = [entry[1] for entry in common_words]
    fig = px.bar(grams_df, x=word_col, y=freq_col, title=title,
                 labels={freq_col: 'Frequency', word_col: f'{n_gram}-gram'})
    fig.show()

In [52]:
# Write the collected n-gram columns to grams.csv inside the data folder.
grams_csv_path = os.path.join(base_datapath, 'grams.csv')
grams_df.to_csv(grams_csv_path)

<strong> Analysis by organization groups </strong>

In [61]:
# Number of rows per (Group, Satisfaction) pair, counted on the non-null 'Ticket Id' values.
tickets_by_group = data_satisfaction.groupby(['Group', 'Satisfaction'])['Ticket Id']
group_count = tickets_by_group.count().reset_index(name='Count')

# Bars per organization group, coloured by satisfaction label; as the last expression
# of the cell, the figure is displayed.
px.bar(group_count, x='Group', y='Count', color='Satisfaction')

Finding the most frequent 1-grams by organization group

In [164]:
# Top-n 1-grams of the good and the bad comments of every organization group.
uniq_groups = data_satisfaction.Group.unique()
n = 5
words_group = []
for group in uniq_groups:
    in_group = data_satisfaction['Group'] == group
    # Good comments are processed before bad ones, for each group in turn.
    for rating in ('good', 'bad'):
        has_rating = data_satisfaction['Satisfaction'] == rating
        group_comments = data_satisfaction[in_group & has_rating]['processed comment']
        common_words = get_top_n_words(group_comments, n, 1)
        words_group.extend([word[0], word[1], group, rating] for word in common_words)
words_1gram_group = pd.DataFrame(words_group, columns=['1gram', 'Freq', 'Group', 'Satisfaction'])

# Short identifiers for the groups, used as facet labels; an unknown group raises KeyError.
conver_dict = {'Call Center': 'cc', 'Soporte OFFCORSS': 'sop', 'Tienda Virtual' : 'tvir', 'Venta Directa': 'vdir'}
words_1gram_group['id'] = [conver_dict[group_name] for group_name in words_1gram_group['Group']]

fig = px.bar(words_1gram_group, x='1gram', y='Freq', color='Satisfaction',
             facet_row='id', barmode='group')
fig.show()

# Store the table next to the input data.
words_1gram_group.to_csv(os.path.join(base_datapath, '1gram_organizationgroups.csv'))

Finding the most frequent 2-grams by organization group

In [165]:
# Top-n 2-grams of the good and the bad comments of every organization group.
uniq_groups = data_satisfaction.Group.unique()
n = 5
words_group = []
for group in uniq_groups:
    in_group = data_satisfaction['Group'] == group
    # Good comments are processed before bad ones, for each group in turn.
    for rating in ('good', 'bad'):
        has_rating = data_satisfaction['Satisfaction'] == rating
        group_comments = data_satisfaction[in_group & has_rating]['processed comment']
        common_words = get_top_n_words(group_comments, n, 2)
        words_group.extend([word[0], word[1], group, rating] for word in common_words)
words_2gram_group = pd.DataFrame(words_group, columns=['2gram', 'Freq', 'Group', 'Satisfaction'])

# Short identifiers for the groups, used as facet labels; an unknown group raises KeyError.
conver_dict = {'Call Center': 'cc', 'Soporte OFFCORSS': 'sop', 'Tienda Virtual' : 'tvir', 'Venta Directa': 'vdir'}
# Fix: the id column is derived from this frame's own 'Group' column. The original read
# words_1gram_group['Group'], which made the cell depend on the 1-gram cell and could
# mislabel rows (or leave NaN ids) whenever the two frames differ in length or order.
words_2gram_group['id'] = words_2gram_group['Group'].apply(lambda x: conver_dict[x])

fig = px.bar(words_2gram_group, x='2gram', y='Freq', color='Satisfaction',
             facet_row='id', barmode='group')
fig.show()

# Store the table next to the input data.
words_2gram_group.to_csv(os.path.join(base_datapath, '2gram_organizationgroups.csv'))

# Time analysis

Now we plot the number of comments and words per day in the satisfaction data.

In [145]:
# Parse the survey timestamp and count the tokens of every processed comment.
# NOTE(review): nltk.word_tokenize needs NLTK tokenizer data to be available; this notebook
# does not download it itself — confirm it is installed in the environment.
data_satisfaction['datetime'] = pd.to_datetime(data_satisfaction['Survey Date'])
data_satisfaction['words'] = data_satisfaction['processed comment'].apply(
    lambda comment: len(nltk.word_tokenize(comment))
)

# Per calendar day: number of rows (counted on 'Ticket Id', so rows with an empty comment
# are included) and total number of tokens.
survey_day = data_satisfaction['datetime'].dt.to_period("D")
comments_day = (
    data_satisfaction
    .groupby(by=survey_day)[['Ticket Id', 'words']]
    .agg({'Ticket Id': 'count', 'words': 'sum'})
    .reset_index()
    .rename(columns={'Ticket Id': 'Count of Comments', 'words': 'Count of words'})
)

# The daily periods are converted back to timestamps for the x-axis.
day_start = comments_day['datetime'].dt.to_timestamp()
fig = px.line(comments_day, y=['Count of Comments', 'Count of words'], x=day_start,
              title='Comments and words over time')
fig.show()

And we plot the good and bad comments per day

In [150]:
# Per satisfaction label and calendar day: number of rows (counted on 'Ticket Id')
# and total number of tokens. Relies on the 'datetime' and 'words' columns already
# being present in data_satisfaction.
grouping_keys = [data_satisfaction['Satisfaction'], data_satisfaction['datetime'].dt.to_period("D")]
comments_day = (
    data_satisfaction
    .groupby(by=grouping_keys)[['Ticket Id', 'words']]
    .agg({'Ticket Id': 'count', 'words': 'sum'})
    .reset_index()
    .rename(columns={'Ticket Id': 'Count of Comments', 'words': 'Count of words'})
)

# Bars per day coloured by satisfaction label; periods are converted to timestamps for the x-axis.
day_start = comments_day['datetime'].dt.to_timestamp()
fig = px.bar(comments_day, y='Count of Comments', x=day_start,
             title='Comments  over time', color='Satisfaction')
fig.show()

In [163]:
# Parse the response timestamp and count the tokens of every processed comment.
data_responses['datetime'] = pd.to_datetime(data_responses['Response Date'])
data_responses['words'] = data_responses['processed comment'].apply(
    lambda comment: len(nltk.word_tokenize(comment))
)

# Per hour: number of rows (counted on 'Unnamed: 0'), mean rating and total number of tokens.
# The result keeps the name comments_day although it holds one row per hour.
response_hour = data_responses['datetime'].dt.to_period("H")
comments_day = (
    data_responses
    .groupby(by=response_hour)[['Unnamed: 0', 'Rating', 'words']]
    .agg({'Unnamed: 0': 'count', 'Rating': 'mean', 'words': 'sum'})
    .reset_index()
    .rename(columns={'Unnamed: 0': 'Count of Comments', 'words': 'Count of words'})
)

# The hourly periods are converted back to timestamps for the x-axis of both line plots.
hour_start = comments_day['datetime'].dt.to_timestamp()

fig = px.line(comments_day, y=['Count of Comments', 'Count of words'], x=hour_start,
              title='Behavior over time')
fig.show()

fig = px.line(comments_day, y=['Rating'], x=hour_start, title='Rating over time')
fig.show()

# Distribution of the hourly mean rating (one value per hour, not one per response).
fig = px.histogram(comments_day, x=['Rating'], nbins=50, title='Histogram of rating')
fig.show()