# Loading the dataset

In [40]:
import os

from collections import Counter
from textblob import TextBlob 

import numpy as np
import pandas as pd

import nltk
from nltk.probability import FreqDist

import plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Loading the data

In [9]:
df = pd.read_csv("markets.csv", error_bad_lines=False)

In [10]:
for column in ['description', 'paragraphs', 'headings', 'listings', 'link-text']:
    df[column] = df[column].apply(lambda x : eval(x))

In [11]:
df['paragraphs'].iloc[3280]

['card the world trust and safety - ctw escrow is important in situations where buyers and sellers do not know or trust each other and needs a way to guarantee payment security for their transaction. dismiss',
 'showing 73–74 of 74 results',
 'small orders are shipped inside magazines or binders, large orders are shipped in boxes with labeling to appear like an ebay or amazon.com package. additional stealth precautions are taken that we don’t publicly share. please let us know if you have special requirements.',
 '0 items',
 'items that are sent by email do not need to enter name, street etc. when ordering. you can enter xxx instead. usually you will receive an answer 1 hour later.',
 'checkout',
 '↑',
 'shipment time: your package will ship within 6 hours of receiving payment. dhl & ups ship monday-friday and not on holidays. we ship fedex mon-sat. we will mail fedex packages the same day if you send us payment before 2pm est.']

# Performing Sentimental Analysis using TextBob

Title

In [12]:
title_text = list(df["title"])
polarity = []
for title in title_text:
    polarity.append(TextBlob(str(title)).sentiment.polarity)

In [13]:
positive_polarity = [np.mean(instances) if instances>0 else 0.0 for instances in polarity]
negative_polarity = [np.mean(instances) if instances<0 else 0.0 for instances in polarity]

In [23]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Title Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
        )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "title_polarity.pdf"), auto_open=False)

'../plots/title_polarity.pdf.html'

Description

In [24]:
description_text = list(df["description"])
polarity = []
for descriptions in description_text:
    temp_polarity = []
    if descriptions != None:
        for description in descriptions:
            temp_polarity.append(TextBlob(str(description)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

In [25]:
positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]


Mean of empty slice.


invalid value encountered in double_scalars



In [26]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Description Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "description_polarity.pdf"), auto_open=False)


Your filename `../plots/description_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/description_polarity.pdf.html'

Paragraphs

In [27]:
paragraphs_text = list(df["paragraphs"])
polarity = []
for paragraphss in paragraphs_text:
    temp_polarity = []
    if paragraphss != None:
        for paragraphs in paragraphss:
            temp_polarity.append(TextBlob(str(paragraphs)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Paragraphs Polarity, +ve polarity=' + str(round(np.mean(positive_polarity), 4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "paragraph_polarity.pdf"), auto_open=False)


Your filename `../plots/paragraph_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/paragraph_polarity.pdf.html'

Headings

In [28]:
headings_text = list(df["headings"])
polarity = []
for headings in headings_text:
    temp_polarity = []
    if headings != None:
        for heading in headings:
            temp_polarity.append(TextBlob(str(heading)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Headings Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity), 4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "heading_polarity.pdf"), auto_open=False)


Your filename `../plots/heading_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/heading_polarity.pdf.html'

listings

In [29]:
listings_text = list(df["listings"])
polarity = []
for listings in listings_text:
    temp_polarity = []
    if listings != None:
        for listing in listings:
            temp_polarity.append(TextBlob(str(listing)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Listings Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "lisiting_polarity.pdf"), auto_open=False)


Your filename `../plots/lisiting_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/lisiting_polarity.pdf.html'

Link text

In [30]:
link_texts_text = list(df["link-text"])
polarity = []
for link_texts in link_texts_text:
    temp_polarity = []
    if link_texts != None:
        for link_text in link_texts:
            temp_polarity.append(TextBlob(str(link_text)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Link_texts Polarity, +ve polarity=' + str(round(np.mean(positive_polarity), 4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity), 4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/", "link_polarity.pdf"), auto_open=False)


Your filename `../plots/link_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/link_polarity.pdf.html'

# Average Sentence length Analysis

title

In [32]:
title_text = list(df['title'])
title_text_len = np.array([len(str(text).split(" ")) if text!="None" else 0 for text in title_text])
mean_title = np.mean([value for value in title_text_len if value!=0])
std_title = np.std([value for value in title_text_len if value!=0])

In [33]:
fig = ff.create_distplot([title_text_len], ['title'], curve_type='normal')

# Add title
fig.update_layout(
                    title_text='Title length with the mean of : ' + str(round(mean_title,2)) + " and std dev of : " + str(round(std_title,2)),
                    xaxis_title="No of Web pages",
                    yaxis_title="Title length"
        )
plotly.offline.plot(fig, filename = os.path.join("../plots/length/", "title_len.pdf"), auto_open=False)


Your filename `../plots/sentiment/title_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/sentiment/title_len.pdf.html'

paragraphs

In [34]:
paragraph_text = list(df['paragraphs'])
paragraphs_text_length = []
for paragraphs in paragraph_text:
    temp_para_len_list = []
    if paragraphs != None:
        for para in paragraphs:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    paragraphs_text_length.append(temp_para_len_list)

In [35]:
paragraphs_text_length = np.array([item for sublist in paragraphs_text_length for item in sublist])
mean_paragraph = np.mean([value for value in paragraphs_text_length if value!=0])
std_paragraph = np.std([value for value in paragraphs_text_length if value!=0])

In [36]:
fig = ff.create_distplot([paragraphs_text_length], ['paragraphs'], curve_type='normal')

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_paragraph,2)) + " and std dev of : " + str(round(std_paragraph,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/length/", "paragraphs_len.pdf"), auto_open=False)


Your filename `../plots/sentiment/paragraphs_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/sentiment/paragraphs_len.pdf.html'

headings

In [37]:
heading_text = list(df['headings'])
headings_text_length = []
for headings in heading_text:
    temp_para_len_list = []
    if headings != None:
        for para in headings:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    headings_text_length.append(temp_para_len_list)

headings_text_length = np.array([item for sublist in headings_text_length for item in sublist])
mean_heading = np.mean([value for value in headings_text_length if value!=0])
std_heading = np.std([value for value in headings_text_length if value!=0])

fig = ff.create_distplot([headings_text_length], ['headings'], curve_type='normal')

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_heading,2)) + " and std dev of : " + str(round(std_heading,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/length/", "headings_len.pdf"), auto_open=False)


Your filename `../plots/sentiment/headings_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/sentiment/headings_len.pdf.html'

Listings

In [38]:
listing_text = list(df['listings'])
listings_text_length = []
for listings in listing_text:
    temp_para_len_list = []
    if listings != None:
        for para in listings:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    listings_text_length.append(temp_para_len_list)


listings_text_length = np.array([item for sublist in listings_text_length for item in sublist])
mean_listing = np.mean([value for value in listings_text_length if value!=0])
std_listing = np.std([value for value in listings_text_length if value!=0])

fig = ff.create_distplot([listings_text_length], ['listings'], curve_type='normal')

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_listing,2)) + " and std dev of : " + str(round(std_listing,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/length/", "listing_len.pdf"), auto_open=False)


Your filename `../plots/sentiment/listing_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/sentiment/listing_len.pdf.html'

Link text

In [39]:
link_text = list(df['link-text'])
links_text_length = []
for links in link_text:
    temp_para_len_list = []
    if links != None:
        for para in links:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    links_text_length.append(temp_para_len_list)


links_text_length = np.array([item for sublist in links_text_length for item in sublist])
mean_link = np.mean([value for value in links_text_length if value!=0])
std_link = np.std([value for value in links_text_length if value!=0])

fig = ff.create_distplot([links_text_length], ['links'], curve_type='normal')

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_link,2)) + " and std dev of : " + str(round(std_link,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/length/", "link_len.pdf"), auto_open=False)


Your filename `../plots/sentiment/link_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/sentiment/link_len.pdf.html'

# Term Frequency Analysis

Title

In [48]:
title_text = list(df['title'])
title_text_tokenize = [str(title).split(" ") for title in title_text if title!="None"]

In [65]:
tokens = [item for sublist in title_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

In [81]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "title.pdf"), auto_open=False)


Your filename `../plots/term-frequency/title.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/title.pdf.html'

Description

In [82]:
description_text = list(df['description'])
description_text_tokenize = []
for descriptions in description_text:
    temp_description_list = []
    if descriptions != None:
        for des in descriptions:
            des = des.split(" ")
            description_text_tokenize.append(des)
    else:
        pass

In [83]:
tokens = [item for sublist in description_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

In [84]:
analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

In [85]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "description.pdf"), auto_open=False)


Your filename `../plots/term-frequency/description.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/description.pdf.html'

Paragraphs

In [86]:
paragraph_text = list(df['paragraphs'])
paragraph_text_tokenize = []
for paragraphs in paragraph_text:
    temp_paragraph_list = []
    if paragraphs != None:
        for value in paragraphs:
            value = value.split(" ")
            paragraph_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in paragraph_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "paragraphs.pdf"), auto_open=False)


Your filename `../plots/term-frequency/paragraphs.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/paragraphs.pdf.html'

Headings

In [87]:
heading_text = list(df['headings'])
heading_text_tokenize = []
for headings in heading_text:
    temp_heading_list = []
    if headings != None:
        for value in headings:
            value = value.split(" ")
            heading_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in heading_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "headings.pdf"), auto_open=False)


Your filename `../plots/term-frequency/headings.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/headings.pdf.html'

Lisitings

In [88]:
listing_text = list(df['listings'])
listing_text_tokenize = []
for listings in listing_text:
    temp_listing_list = []
    if listings != None:
        for value in listings:
            value = value.split(" ")
            listing_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in listing_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "listings.pdf"), auto_open=False)


Your filename `../plots/term-frequency/listings.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/listings.pdf.html'

Links

In [78]:
link_text = list(df['link-text'])
link_text_tokenize = []
for links in link_text:
    temp_link_list = []
    if links != None:
        for value in links:
            value = value.split(" ")
            link_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in link_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="blue"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/term-frequency", "links.pdf"), auto_open=False)


Your filename `../plots/term-frequency/links.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/term-frequency/links.pdf.html'