#Article: [A Complete Exploratory Data Analysis and Visualization for Text Data](https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data-29fb1b96fb6a?gi=232cc1cacaca)

#Loading Data - Pizza Ordering



In [1]:
import nltk
import random
import pandas as pd
import plotly.express as px
from textblob import TextBlob
from google.colab import files
import plotly.graph_objects as go
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Colab-only step: prompts for a local file and uploads it into the runtime.
# The CSV read in the next cell is expected to be among the uploaded files.
uploaded = files.upload()

Saving pizza-ordering-tm-1-2019-concat.csv to pizza-ordering-tm-1-2019-concat.csv


In [3]:
# Load the concatenated pizza-ordering dialog export uploaded above.
# Alternative source files (swap in one of these to analyse it instead):
data = pd.read_csv("pizza-ordering-tm-1-2019-concat.csv")
#data = pd.read_csv("woz-dialogs-pizza-ordering.csv")
#data = pd.read_csv("self-dialogs-pizza-ordering.csv")

#Data Head

In [4]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0,text,segment,intent,speaker
0,I would also like to have extra cheese,extra cheese,pizza_ordering.preference.accept,USER
1,Do you mean veggie lovers?,veggie lovers,pizza_ordering.name.pizza.accept,ASSISTANT
2,Cool. Your order will be ready at your local B...,Bella Luna pizzeria,pizza_ordering.name.store.accept,ASSISTANT
3,Your order has been sent to Bella Luna and is ...,Bella Luna,pizza_ordering.name.store.accept,ASSISTANT
4,"So 3 medium pepperoni pizzas, with thin crust ...",onions,pizza_ordering.type.topping.accept,ASSISTANT


#Drop Useless Columns

In [5]:
# Remove the 'segment' column and every row without text.
# - Non-inplace drop with errors='ignore' keeps this cell re-runnable: the
#   original in-place drop raised KeyError the second time it was executed.
# - .copy() detaches the result from the loaded frame so that the columns
#   assigned to `data` in later cells do not write into a filtered slice.
data = (
    data
    .drop(columns='segment', errors='ignore')
    .dropna(subset=['text'])
    .copy()
)
data

Unnamed: 0,text,intent,speaker
0,I would also like to have extra cheese,pizza_ordering.preference.accept,USER
1,Do you mean veggie lovers?,pizza_ordering.name.pizza.accept,ASSISTANT
2,Cool. Your order will be ready at your local B...,pizza_ordering.name.store.accept,ASSISTANT
3,Your order has been sent to Bella Luna and is ...,pizza_ordering.name.store.accept,ASSISTANT
4,"So 3 medium pepperoni pizzas, with thin crust ...",pizza_ordering.type.topping.accept,ASSISTANT
...,...,...,...
17598,The site allows me to adjust the amount of che...,pizza_ordering.preference.accept,ASSISTANT
17599,"Ok, to confirm, you have ordered a large pizza...",pizza_ordering.size.pizza.accept,ASSISTANT
17600,"Oh, they are out of meatballs.",pizza_ordering.type.topping.reject,ASSISTANT
17601,"ok, so you are ordering 3 small thin crust piz...",pizza_ordering.size.pizza.accept,ASSISTANT


#Preprocessing text with TextBlob
*   Sentiment Polarity
*   Text Length
*   Word Count



In [None]:
def preprocess(Text):
    """Strip leftover HTML markup and entities from a Series of strings.

    Parameters
    ----------
    Text : pandas.Series
        Series whose values are accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        The series after removing ``<br/>`` tags, ``<a ...>...</a>`` spans and
        the ``&amp`` / ``&gt`` / ``&lt`` entity prefixes, and after replacing
        non-breaking spaces with a regular space.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = [
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&amp)', ''),
        ('(&gt)', ''),
        ('(&lt)', ''),
        ('(\xa0)', ' '),
    ]
    for pattern, replacement in substitutions:
        # regex=True is required: the patterns are regular expressions, and
        # pandas >= 2.0 treats the pattern as a literal string by default,
        # which silently turns every substitution into a no-op.
        Text = Text.str.replace(pattern, replacement, regex=True)
    return Text
    
# Clean the utterances, then derive one feature column per utterance:
#   polarity   - TextBlob sentiment polarity of the cleaned text
#   text_len   - length of the text in characters
#   word_count - number of whitespace-separated tokens
data['text'] = preprocess(data['text'])
data['polarity'] = data['text'].map(
    lambda utterance: TextBlob(utterance).sentiment.polarity
)
text_as_str = data['text'].astype(str)
data['text_len'] = text_as_str.map(len)
data['word_count'] = text_as_str.map(lambda utterance: len(utterance.split()))

In [7]:
# Preview the frame again, now including the polarity, text_len and
# word_count columns added in the previous cell.
data.head()

Unnamed: 0,text,intent,speaker,polarity,text_len,word_count
0,I would also like to have extra cheese,pizza_ordering.preference.accept,USER,0.0,38,8
1,Do you mean veggie lovers?,pizza_ordering.name.pizza.accept,ASSISTANT,-0.3125,26,5
2,Cool. Your order will be ready at your local B...,pizza_ordering.name.store.accept,ASSISTANT,0.1375,73,13
3,Your order has been sent to Bella Luna and is ...,pizza_ordering.name.store.accept,ASSISTANT,0.2,105,20
4,"So 3 medium pepperoni pizzas, with thin crust ...",pizza_ordering.type.topping.accept,ASSISTANT,-0.3,127,23


#Random Texts - Sentiment Polarity

In [9]:
print('======== 5 random texts with the highest positive sentiment polarity ========\n')
# Draw five rows at random among those whose polarity equals 1.
is_max_positive = data.polarity == 1
cl = data.loc[is_max_positive, ['text']].sample(5).values
# `cl` is a 2-D array with a single column; walk that column directly.
for sentence in cl[:, 0]:
    print(sentence)


Great! If you need to contact Domino's Pizza about your pick-up, here is their number: *gives phone number+
Excellent, I have a stuffed crust pepperoni pizza and a stuffed crust sausage pizza. We also have a side of garlic knots. Would you like anything else?
Hey, I would like to order three delicious pizzas from Bella Luna, please.
The one in Astoria, OR. They have the best pizza.
Excellent, I have a stuffed crust pepperoni pizza and a stuffed crust sausage pizza. We also have a side of garlic knots. Would you like anything else?


In [10]:
print('======== 5 random texts with the most neutral sentiment(zero) polarity ========\n')
# Draw five rows at random among those whose polarity equals 0.
is_neutral = data.polarity == 0
cl = data.loc[is_neutral, ['text']].sample(5).values
# Each row of `cl` holds exactly one value: the utterance text.
for (sentence,) in cl:
    print(sentence)


Italian Sausage with Pepperoni
I want the specialty pizza with the works and a all meat pizza
Hello there. I'd like to order a pizza from Bella Luna.
do u want the Howies on 301
let's switch to a meat lover include saussage, pepperoni, italian sausage, italian ham, extra tomatoes sauce and tomato slice, prosciuto


In [14]:
print('======== 2 texts with the most negative polarity ========\n')
# Select the two rows with the lowest polarity so the output matches the
# header. The previous version sampled at random from every row with
# polarity <= -0.5, which could print texts that are not the most negative.
cl = data.nsmallest(2, 'polarity')[['text']].values
for c in cl:
    print(c[0])


Unfortunately, the Westend location is out of pepperoni. Would you like another topping or try another location?
i want chicken, spinach and mushrooms on that one


#Graphic Analysis

##Accept/Reject Frequency

In [15]:
def acceptReject(intent):
  """Collapse a dotted intent label to its outcome.

  Returns "accept" when the last dot-separated component of `intent` is
  exactly "accept"; any other label yields "reject".
  """
  last_component = intent.rsplit(".", 1)[-1]
  return "accept" if last_component == "accept" else "reject"

# Reduce every intent label to accept/reject and plot how often each occurs.
# The mapped Series keeps the name 'intent', which is what `x` refers to.
df = data['intent'].map(acceptReject)
fig = px.histogram(data_frame=df, x="intent")
fig.show()

##Accept | Reject Distribution By Intent

In [16]:
def excludeAccRej(intent):
  """Return `intent` without its last dot-separated component.

  A label containing no dot yields the empty string.
  """
  parts = intent.split(".")
  return ".".join(parts[:-1])

# Split each intent label into its base intent and its accept/reject outcome.
# The outcome must be derived first, while 'intent' still holds the full label.
df = data[['intent']].copy()
df['acc_rej'] = df['intent'].map(acceptReject)
df['intent'] = df['intent'].map(excludeAccRej)
df

Unnamed: 0,intent,acc_rej
0,pizza_ordering.preference,accept
1,pizza_ordering.name.pizza,accept
2,pizza_ordering.name.store,accept
3,pizza_ordering.name.store,accept
4,pizza_ordering.type.topping,accept
...,...,...
17598,pizza_ordering.preference,accept
17599,pizza_ordering.size.pizza,accept
17600,pizza_ordering.type.topping,reject
17601,pizza_ordering.size.pizza,accept


In [17]:
# Base intents split by outcome, one histogram trace per outcome, drawn
# side by side (barmode='group').
is_accept = df['acc_rej'] == 'accept'
is_reject = df['acc_rej'] == 'reject'
x1 = df.loc[is_accept, 'intent']
x0 = df.loc[is_reject, 'intent']

trace1 = go.Histogram(x=x1, name='Accept', opacity=0.75)
trace2 = go.Histogram(x=x0, name='Reject', opacity=0.75)

data_trace = [trace1, trace2]
layout = go.Layout(
    title='Distribution of Accepts/Rejects Based on Intent',
    barmode='group',
)
fig = go.Figure(data=data_trace, layout=layout)

fig.show()

##Intent Frequency

In [18]:
# Number of utterances per full intent label.
fig = px.histogram(data_frame=data, x="intent")
fig.show()

##Polarity Frequency

In [19]:
# Distribution of the per-utterance sentiment polarity.
# (pandas alternative: data['polarity'].plot(kind = 'hist', bins = 30))
fig = px.histogram(data_frame=data, x="polarity")
fig.show()

##Speaker Frequency

In [20]:
# Number of utterances per speaker.
fig = px.histogram(data_frame=data, x="speaker")
fig.show()

##Text Length Frequency

In [21]:
# Distribution of utterance length in characters.
fig = px.histogram(data_frame=data, x="text_len")
fig.show()

##Word Count Frequency

In [22]:
# Distribution of utterance length in words.
fig = px.histogram(data_frame=data, x="word_count")
fig.show()

#Uni/Bi/Trigram occurrence





##Unigram occurrence

In [23]:
def get_top_n_words(corpus, n = None):
    """Return the `n` most frequent unigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (word, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (word, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer()
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass; calling fit() and then transform() on the same corpus
    # vectorized every document twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

and 16456
with 11185
pizza 8416
large 6390
the 6344
one 6262
pepperoni 5700
pizzas 5629
you 5419
cheese 5373
crust 5364
is 4683
extra 4501
order 4312
so 4131
to 4022
that 3856
for 3350
sausage 3287
ok 2953


In [24]:
# Plot the (word, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its word.
data1 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data1,
    x="text",
    y="count",
    histfunc='sum',
    title='Unigram Frequency With Stop Words',
)
fig.show()

In [25]:
# NOTE: this reuses the name `get_top_n_words` from the earlier unigram cell.
# Once this cell has run, the name refers to this stop-word-filtering version.
def get_top_n_words(corpus, n=None):
    """Return the `n` most frequent unigrams in `corpus`, skipping stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (word, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (word, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(stop_words = 'english')
    # Single pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_words(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

pizza 8416
large 6390
pepperoni 5700
pizzas 5629
cheese 5373
crust 5364
extra 4501
order 4312
sausage 3287
ok 2953
chicken 2462
medium 2448
correct 2336
bacon 2260
like 1967
lovers 1940
peppers 1917
want 1862
olives 1841
small 1788


In [26]:
# Plot the (word, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its word.
data2 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data2,
    x="text",
    y="count",
    histfunc='sum',
    title='Unigram Frequency Without Stop Words',
)
fig.show()

##Bigram occurrence

In [27]:
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (bigram, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (bigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass; calling fit() and then transform() on the same corpus
    # vectorized every document twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

pizza with 2962
extra cheese 2807
pepperoni and 2306
with extra 2036
thin crust 1984
is that 1634
will be 1536
with pepperoni 1357
your order 1302
pizzas with 1233
gluten free 1196
you want 1187
that correct 1176
one large 1171
sausage and 1169
thick crust 1167
large pizzas 1119
and one 1071
pizzas one 1057
be ready 1042


In [28]:
# Plot the (bigram, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its bigram.
data3 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data3,
    x="text",
    y="count",
    histfunc='sum',
    title='Bigram Frequency With Stop Words',
)
fig.show()

In [29]:
# NOTE: this reuses the name `get_top_n_bigram` from the earlier bigram cell.
# Once this cell has run, the name refers to this stop-word-filtering version.
def get_top_n_bigram(corpus, n=None):
    """Return the `n` most frequent bigrams in `corpus`, skipping stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (bigram, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (bigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    # Single pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

extra cheese 2807
gluten free 1196
large pizzas 1120
small pizzas 983
crust pizzas 948
meat lovers 945
veggie lovers 940
order large 920
pepperoni sausage 909
green peppers 907
medium pizzas 904
large pizza 858
25 minutes 839
bella luna 838
free crust 794
pizzas pepperoni 782
black olives 779
ready pickup 602
chicken bbq 566
pizzas crust 558


In [30]:
# Plot the (bigram, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its bigram.
data4 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data4,
    x="text",
    y="count",
    histfunc='sum',
    title='Bigram Frequency Without Stop Words',
)
fig.show()

##Trigram occurrence

In [31]:
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent trigrams in `corpus`.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (trigram, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (trigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3))
    # fit_transform learns the vocabulary and builds the document-term matrix
    # in one pass; calling fit() and then transform() on the same corpus
    # vectorized every document twice.
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

with extra cheese 1385
is that correct 1164
will be ready 911
gluten free crust 787
with pepperoni and 771
be ready for 748
and extra cheese 719
you are ordering 665
ready for pickup 593
large pizza with 557
in 25 minutes 542
with thin crust 484
extra cheese and 461
pizza with pepperoni 459
with thick crust 455
thin crust pizzas 443
like to order 433
with gluten free 425
ok you are 424
large thin crust 422


In [32]:
# Plot the (trigram, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its trigram.
data5 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data5,
    title='Trigram Frequency With Stop Words',
    x='text',
    y='count',
    histfunc='sum',
)
fig.show()

In [33]:
# NOTE: this reuses the name `get_top_n_trigram` from the earlier trigram cell.
# Once this cell has run, the name refers to this stop-word-filtering version.
def get_top_n_trigram(corpus, n=None):
    """Return the `n` most frequent trigrams in `corpus`, skipping stop words.

    Parameters
    ----------
    corpus : iterable of str
        Documents to count over.
    n : int or None
        Number of (trigram, count) pairs to keep; None keeps all of them.

    Returns
    -------
    list of (trigram, count) tuples, sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    # Single pass over the corpus instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column totals: one summed count per vocabulary entry.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_trigram(data['text'], 20)

for word, freq in common_words:
    print(word, freq)

gluten free crust 794
pepperoni extra cheese 353
pickup 25 minutes 336
ready pickup 25 335
pizzas extra cheese 333
medium crust pizzas 247
pizza extra cheese 245
pizzas gluten free 240
large crust pizzas 232
extra cheese correct 231
pizzas ready pickup 226
order large pizzas 223
large pizzas pepperoni 219
small crust pizzas 217
gluten free pizzas 217
pizzas veggie lovers 206
small pizzas crust 204
large pizza pepperoni 199
medium gluten free 196
extra cheese pepperoni 195


In [34]:
# Plot the (trigram, count) pairs currently held in `common_words`.
# histfunc='sum' makes each bar show the stored count for its trigram.
data6 = pd.DataFrame(data=common_words, columns=['text', 'count'])
fig = px.histogram(
    data_frame=data6,
    title='Trigram Frequency Without Stop Words',
    x='text',
    y='count',
    histfunc='sum',
)
fig.show()

##Part-of-Speech (POS) Tagging

In [35]:
# Fetch the NLTK resources needed before TextBlob's `.tags` is used below:
# the 'punkt' tokenizer and the averaged-perceptron POS tagger.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [36]:
# Concatenate every utterance into one string, each followed by ". ".
texts = "".join(utterance + ". " for utterance in data['text'])

In [None]:
# Sanity-check the concatenation. Show only the first few pieces: displaying
# the whole list would dump every utterance into the cell output.
texts.split(". ")[:10]

In [38]:
# Tag the lower-cased corpus and collect the (word, tag) pairs into a frame.
lowered_corpus = texts.lower()
blob = TextBlob(lowered_corpus)
pos_data = pd.DataFrame(data=blob.tags, columns=['word', 'pos'])

In [39]:
# Display the tagged tokens: one row per token with its part-of-speech tag.
pos_data

Unnamed: 0,word,pos
0,i,NN
1,would,MD
2,also,RB
3,like,VB
4,to,TO
...,...,...
307218,all,DT
307219,meatlovers,NNS
307220,the,DT
307221,pizza,NN


In [40]:
# Number of tokens per part-of-speech tag.
fig = px.histogram(data_frame=pos_data, x='pos')
fig.show()

Output hidden; open in https://colab.research.google.com to view.

##Speaker vs Sentiment Polarity - Average

In [41]:
# Average sentiment polarity per speaker (histfunc='avg' averages y within each x).
fig = px.histogram(
    data_frame=data,
    x="speaker",
    y="polarity",
    histfunc='avg',
)
fig.show()

##Intent vs. Sentiment Polarity

In [None]:
# Working frame: accept/reject outcome of each utterance next to its polarity.
df = data[['intent', 'polarity']].copy()
df['intent'] = df['intent'].map(acceptReject)
df

Unnamed: 0,intent,polarity
0,accept,0.000000
1,accept,-0.312500
2,accept,0.137500
3,accept,0.200000
4,accept,-0.300000
...,...,...
17598,accept,0.000000
17599,accept,0.357143
17600,reject,0.000000
17601,accept,-0.050000


In [None]:
def makeTrace(data, a, b):
  """Build one plotly box trace per distinct value of column `a`.

  Parameters:
    data: DataFrame containing the columns named by `a` and `b`.
    a: name of the grouping column; every unique value gets its own box.
    b: name of the column whose values are plotted on the y axis.

  Returns:
    list of go.Box traces, one per unique value of `data[a]`, each drawn in
    a randomly chosen colour.
  """
  traces = []
  # Iterate the groups of the column named by `a`. The previous version always
  # iterated `data.intent` while still filtering on `data[a]`, so any call with
  # a grouping column other than 'intent' produced wrong or empty boxes.
  for i in data[a].unique():
    # Random RGB colour; each component is limited to 10..200.
    c1 = random.randint(10, 200)
    c2 = random.randint(10, 200)
    c3 = random.randint(10, 200)
    color = "rgb({}, {}, {})".format(c1, c2, c3)
    trace = go.Box(
      y = data[data[a] == i][b],
      name = i,
      marker = dict(
        color = color
      )
    )
    traces.append(trace)
  return traces

In [None]:
# Box plot of sentiment polarity for each value of df['intent'].
data_trace = makeTrace(df, 'intent', 'polarity')
layout = go.Layout(title="Sentiment Polarity Boxplot of Intent - Accept and Reject")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

In [None]:
# Average sentiment polarity for each value of df['intent'].
fig = px.histogram(
    data_frame=df,
    x="intent",
    y='polarity',
    histfunc='avg',
    title="Intent Vs Sentiment Polarity - Average",
)
fig.show()

In [None]:
def traceAVsB(data, a, b):
  """Return a list of box traces of column `b`, one per unique value of `a`.

  Each trace is named after its group value and gets a random RGB colour
  whose components are drawn from 10..200.
  """
  def _box_for(group_value):
    # Values of `b` restricted to the rows belonging to this group.
    y_ = data[data[a] == group_value][b]
    rgb = [random.randint(10, 200) for _ in range(3)]
    return go.Box(
      y = y_,
      name = group_value,
      marker = dict(color = "rgb({}, {}, {})".format(*rgb)),
    )

  return [_box_for(group_value) for group_value in data[a].unique()]

In [None]:
# Box plot of sentiment polarity for every full intent label.
data_trace = traceAVsB(data, 'intent', 'polarity')
layout = go.Layout(title="Sentiment Polarity Boxplot of Intent")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

In [None]:
# NOTE: a function with this name and the same behaviour is already defined
# in the "Accept | Reject Distribution By Intent" section of this notebook.
def excludeAccRej(intent):
  """Drop the last dot-separated component of `intent`.

  Everything before the final "." is returned; a label without any dot
  yields the empty string.
  """
  head, _sep, _tail = intent.rpartition(".")
  return head

In [None]:
# Working frame: base intent (accept/reject suffix removed) next to polarity.
df = data[['intent', 'polarity']].copy()
df['intent'] = df['intent'].map(excludeAccRej)
df

Unnamed: 0,intent,polarity
0,pizza_ordering.preference,0.000000
1,pizza_ordering.name.pizza,-0.312500
2,pizza_ordering.name.store,0.137500
3,pizza_ordering.name.store,0.200000
4,pizza_ordering.type.topping,-0.300000
...,...,...
17598,pizza_ordering.preference,0.000000
17599,pizza_ordering.size.pizza,0.357143
17600,pizza_ordering.type.topping,0.000000
17601,pizza_ordering.size.pizza,-0.050000


In [None]:
# Box plot of sentiment polarity for each value of df['intent'].
data_trace = makeTrace(df, 'intent', 'polarity')
layout = go.Layout(title="Sentiment Polarity Boxplot of Intent Without Accept and Reject")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

In [None]:
# Joint view of sentiment polarity (x) against the accept/reject outcome (y):
# the raw points, a 2-D density contour over them, and one marginal histogram
# per axis drawn on the secondary axes x2 / y2.
polarity = data['polarity']
outcome = data['intent'].apply(acceptReject)
dark_red = 'rgb(102,0,0)'

trace1 = go.Scatter(
    x=polarity,
    y=outcome,
    mode='markers',
    name='points',
    marker=dict(color=dark_red, size=2, opacity=0.4),
)

trace2 = go.Histogram2dContour(
    x=polarity,
    y=outcome,
    name='density',
    ncontours=20,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)

trace3 = go.Histogram(
    x=polarity,
    name='Sentiment polarity density',
    marker=dict(color=dark_red),
    yaxis='y2',
)

trace4 = go.Histogram(
    y=outcome,
    name='Intent density',
    marker=dict(color=dark_red),
    xaxis='x2',
)

data_trace = [trace1, trace2, trace3, trace4]

# The main panel takes the first 85% of each axis; the marginal histograms
# take the remaining 15%.
main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
marginal_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)

layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis=main_axis,
    yaxis=main_axis,
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
    xaxis2=marginal_axis,
    yaxis2=marginal_axis,
)

fig = go.Figure(data=data_trace, layout=layout)
fig.show()

##Intent vs. Text Length

In [None]:
# Text length per accept/reject outcome, shown as box plots.
df = data[['intent', 'text_len']].copy()
df['intent'] = df['intent'].map(acceptReject)

data_trace = makeTrace(df, 'intent', 'text_len')
layout = go.Layout(title="Text Length Boxplot of Intent - Accept and Reject")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

In [None]:
# Text length per base intent (accept/reject suffix removed), as box plots.
df = data[['intent', 'text_len']].copy()
df['intent'] = df['intent'].map(excludeAccRej)

data_trace = makeTrace(df, 'intent', 'text_len')
layout = go.Layout(title="Text Length Boxplot of Intent")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

##Intent vs. Word Count

In [None]:
# Word count per accept/reject outcome, shown as box plots.
df = data[['intent', 'word_count']].copy()
df['intent'] = df['intent'].map(acceptReject)

data_trace = makeTrace(df, 'intent', 'word_count')
layout = go.Layout(title="Word Count Boxplot of Intent - Accept and Reject")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()

In [None]:
# Word count per base intent (accept/reject suffix removed), as box plots.
df = data[['intent', 'word_count']].copy()
df['intent'] = df['intent'].map(excludeAccRej)

data_trace = makeTrace(df, 'intent', 'word_count')
layout = go.Layout(title="Word Count Boxplot of Intent")

fig = go.Figure(layout=layout, data=data_trace)
fig.show()