# Loading the dataset

In [6]:
import os

from collections import Counter
from textblob import TextBlob 

import numpy as np
import pandas as pd

import nltk
from nltk.probability import FreqDist

import plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff

# Loading the data

In [7]:
df = pd.read_csv("forums.csv", error_bad_lines=False)

In [8]:
for column in ['description', 'paragraphs', 'headings', 'listings', 'link-text']:
    df[column] = df[column].apply(lambda x : eval(x))

Paragraphs

In [17]:
paragraphs = list(df['paragraphs'])
paragraphs = [para for para in paragraphs if para!=None]
print("No of paragraphs:", len(paragraphs))

# Cleaning paragraphs
paragraphs = [[para for para in paras if "»" not in para] for paras in paragraphs]
paragraphs = [item for sublist in paragraphs for item in sublist]
paragraphs = [para.split(" ") for para in paragraphs]
paragraphs_len = [len(para) for para in paragraphs]

No of paragraphs: 937


In [5]:
paragraphs

[['return', 'to', '“art,', 'photoshopped,', 'shotacon,', '3d”'],
 ['skip', 'to', 'content'],
 ['where', 'boys', 'go', 'for', 'a', 'good', 'time'],
 ['powered',
  'by',
  'phpbb:registered:',
  'forum',
  'software',
  ':copyright:',
  'phpbb',
  'limited'],
 ['privacy|terms'],
 ['skip', 'to', 'content'],
 ['where', 'boys', 'go', 'for', 'a', 'good', 'time'],
 ['return', 'to', 'board', 'index'],
 ['powered',
  'by',
  'phpbb:registered:',
  'forum',
  'software',
  ':copyright:',
  'phpbb',
  'limited'],
 ['privacy|terms'],
 ['you',
  'cannot',
  'post',
  'new',
  'topics',
  'in',
  'this',
  'forumyou',
  'cannot',
  'reply',
  'to',
  'topics',
  'in',
  'this',
  'forumyou',
  'cannot',
  'edit',
  'your',
  'posts',
  'in',
  'this',
  'forumyou',
  'cannot',
  'delete',
  'your',
  'posts',
  'in',
  'this',
  'forum'],
 ['skip', 'to', 'content'],
 ['where', 'boys', 'go', 'for', 'a', 'good', 'time'],
 ['return', 'to', 'board', 'index'],
 ['powered',
  'by',
  'phpbb:registered:',


In [26]:
Counter(paragraphs_len)

Counter({6: 191,
         3: 1099,
         7: 941,
         8: 997,
         1: 1113,
         4: 456,
         29: 157,
         59: 176,
         17: 51,
         5: 17,
         40: 4,
         16: 8,
         306: 22,
         13: 12,
         455: 9,
         19: 1,
         23: 5,
         45: 1})

In [18]:
paragraphs = [" ".join(para) for para in paragraphs]
paragraphs = [para for para in paragraphs if "• total topics" not in para]

In [19]:
print("Number of paragraphs left:", len(set(paragraphs)))
set(paragraphs)

Number of paragraphs left: 74


{'are you sure you want to delete all cookies set by this board?',
 'as a forum user you must comply with boystown. by doing that, we can provide a smooth, fun and productive experience for all of our community. failure to do so may result in temporary or permanent exclusion of your account.',
 'by accessing “boystown” (hereinafter “we”, “us”, “our”, “boystown”, “http://2tgix56pui5j63y7bq4bgeekjy4mw57zrnbvuvic2ncbt5gyxei7dcqd.onion”), you agree to be legally bound by the following terms. if you do not agree to be legally bound by all of the following terms then please do not access and/or use “boystown”. we may change these at any time and we’ll do our utmost in informing you, though it would be prudent to review this regularly yourself as your continued usage of “boystown” after changes mean you agree to be legally bound by these terms as they are updated and/or amended.our forums are powered by phpbb (hereinafter “they”, “them”, “their”, “phpbb software”, “www.phpbb.com”, “phpbb limi

Links

In [22]:
links = list(df['link-text'])

# Performing Sentimental Analysis using TextBob

Title

In [13]:
title_text = list(df["title"])
polarity = []
for title in title_text:
    polarity.append(TextBlob(str(title)).sentiment.polarity)

In [14]:
positive_polarity = [np.mean(instances) if instances>0 else 0.0 for instances in polarity]
negative_polarity = [np.mean(instances) if instances<0 else 0.0 for instances in polarity]

In [15]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Title Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
        )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "title_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/title_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/title_polarity.pdf.html'

Description

In [16]:
description_text = list(df["description"])
polarity = []
for descriptions in description_text:
    temp_polarity = []
    if descriptions != None:
        for description in descriptions:
            temp_polarity.append(TextBlob(str(description)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

In [17]:
positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

In [18]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Description Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "description_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/description_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/description_polarity.pdf.html'

Paragraphs

In [19]:
paragraphs_text = list(df["paragraphs"])
polarity = []
for paragraphss in paragraphs_text:
    temp_polarity = []
    if paragraphss != None:
        for paragraphs in paragraphss:
            temp_polarity.append(TextBlob(str(paragraphs)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Paragraphs Polarity, +ve polarity=' + str(round(np.mean(positive_polarity), 4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "paragraph_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/paragraph_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/paragraph_polarity.pdf.html'

Headings

In [20]:
headings_text = list(df["headings"])
polarity = []
for headings in headings_text:
    temp_polarity = []
    if headings != None:
        for heading in headings:
            temp_polarity.append(TextBlob(str(heading)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Headings Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity), 4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "heading_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/heading_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/heading_polarity.pdf.html'

listings

In [21]:
listings_text = list(df["listings"])
polarity = []
for listings in listings_text:
    temp_polarity = []
    if listings != None:
        for listing in listings:
            temp_polarity.append(TextBlob(str(listing)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Listings Polarity, +ve polarity=' + str(round(np.mean(positive_polarity),4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity),4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "lisiting_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/lisiting_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/lisiting_polarity.pdf.html'

Link text

In [22]:
link_texts_text = list(df["link-text"])
polarity = []
for link_texts in link_texts_text:
    temp_polarity = []
    if link_texts != None:
        for link_text in link_texts:
            temp_polarity.append(TextBlob(str(link_text)).sentiment.polarity)
    else:
        temp_polarity.append(0.0)
    polarity.append(temp_polarity)

positive_polarity = [np.mean([positve_instances if positve_instances > 0 else 0.0 for positve_instances in instances]) for instances in polarity]
negative_polarity = [np.mean([negative_instances if negative_instances < 0 else 0.0 for negative_instances in instances]) for instances in polarity]

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=positive_polarity, name="positive", marker_color="blue"))
fig.add_trace(go.Bar(x=np.array(range(len(polarity))), y=negative_polarity, name="negative", marker_color="red"))
fig.update_layout(
                    title_text='Link_texts Polarity, +ve polarity=' + str(round(np.mean(positive_polarity), 4)) + ' ,-ve polarity=' + str(round(np.mean(negative_polarity), 4)),
                    xaxis_title="Web pages",
                    yaxis_title="Sentiment"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/sentiment/", "link_polarity.pdf"), auto_open=False)


Your filename `../plots/forums/sentiment/link_polarity.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/sentiment/link_polarity.pdf.html'

# Average Sentence length Analysis

title

In [24]:
title_text = list(df['title'])
title_text_len = np.array([len(str(text).split(" ")) if text!="None" else 0 for text in title_text])
mean_title = np.mean([value for value in title_text_len if value!=0])
std_title = np.std([value for value in title_text_len if value!=0])

In [25]:
title_dic = Counter(title_text_len)

In [26]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(list(title_dic.values())), y=list(title_dic.keys()), name="Forums", marker_color="red"))

# Add title
fig.update_layout(
                    title_text='Title length with the mean of : ' + str(round(mean_title,2)) + " and std dev of : " + str(round(std_title,2)),
                    xaxis_title="No of Web pages",
                    yaxis_title="Title length"
        )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/length/", "title_len.pdf"), auto_open=False)

'../plots/forums/length/title_len.pdf.html'

paragraphs

In [27]:
paragraph_text = list(df['paragraphs'])
paragraphs_text_length = []
for paragraphs in paragraph_text:
    temp_para_len_list = []
    if paragraphs != None:
        for para in paragraphs:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    paragraphs_text_length.append(temp_para_len_list)

In [28]:
paragraphs_text_length = np.array([item for sublist in paragraphs_text_length for item in sublist])
mean_paragraph = np.mean([value for value in paragraphs_text_length if value!=0])
std_paragraph = np.std([value for value in paragraphs_text_length if value!=0])

In [29]:
paragraph_dic = Counter(paragraphs_text_length)

In [30]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(list(paragraph_dic.values())), y=list(paragraph_dic.keys()), name="Forums", marker_color="red"))

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_paragraph,2)) + " and std dev of : " + str(round(std_paragraph,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/length/", "paragraphs_len.pdf"), auto_open=False)

'../plots/forums/length/paragraphs_len.pdf.html'

headings

In [31]:
heading_text = list(df['headings'])
headings_text_length = []
for headings in heading_text:
    temp_para_len_list = []
    if headings != None:
        for para in headings:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    headings_text_length.append(temp_para_len_list)

headings_text_length = np.array([item for sublist in headings_text_length for item in sublist])
mean_heading = np.mean([value for value in headings_text_length if value!=0])
std_heading = np.std([value for value in headings_text_length if value!=0])

heading_dic = Counter(headings_text_length)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(list(heading_dic.values())), y=list(heading_dic.keys()), name="Forums", marker_color="red"))

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_heading,2)) + " and std dev of : " + str(round(std_heading,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/length/", "headings_len.pdf"), auto_open=False)

'../plots/forums/length/headings_len.pdf.html'

Listings

In [32]:
listing_text = list(df['listings'])
listings_text_length = []
for listings in listing_text:
    temp_para_len_list = []
    if listings != None:
        for para in listings:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    listings_text_length.append(temp_para_len_list)


listings_text_length = np.array([item for sublist in listings_text_length for item in sublist])
mean_listing = np.mean([value for value in listings_text_length if value!=0])
std_listing = np.std([value for value in listings_text_length if value!=0])

listing_dic = Counter(listings_text_length)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(list(listing_dic.values())), y=list(listing_dic.keys()), name="Forums", marker_color="red"))

# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_listing,2)) + " and std dev of : " + str(round(std_listing,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/length/", "listing_len.pdf"), auto_open=False)

'../plots/forums/length/listing_len.pdf.html'

Link text

In [33]:
link_text = list(df['link-text'])
links_text_length = []
for links in link_text:
    temp_para_len_list = []
    if links != None:
        for para in links:
            para = para.split(" ")
            temp_para_len_list.append(len(para))
    else:
        temp_para_len_list.append(0)
    links_text_length.append(temp_para_len_list)


links_text_length = np.array([item for sublist in links_text_length for item in sublist])
mean_link = np.mean([value for value in links_text_length if value!=0])
std_link = np.std([value for value in links_text_length if value!=0])

link_dic = Counter(links_text_length)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(list(link_dic.values())), y=list(link_dic.keys()), name="Forums", marker_color="red"))


# Add title
fig.update_layout(title_text='Sentence length with the mean of : ' + str(round(mean_link,2)) + " and std dev of : " + str(round(std_link,2)))
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/length/", "link_len.pdf"), auto_open=False)


Your filename `../plots/forums/length/link_len.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/length/link_len.pdf.html'

# Term Frequency Analysis

Title

In [34]:
title_text = list(df['title'])
title_text_tokenize = [str(title).split(" ") for title in title_text if title!="None"]

In [35]:
tokens = [item for sublist in title_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

In [36]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "title.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/title.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/title.pdf.html'

Description

In [37]:
description_text = list(df['description'])
description_text_tokenize = []
for descriptions in description_text:
    temp_description_list = []
    if descriptions != None:
        for des in descriptions:
            des = des.split(" ")
            description_text_tokenize.append(des)
    else:
        pass

In [38]:
tokens = [item for sublist in description_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

In [39]:
analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

In [40]:
fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "description.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/description.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/description.pdf.html'

Paragraphs

In [41]:
paragraph_text = list(df['paragraphs'])
paragraph_text_tokenize = []
for paragraphs in paragraph_text:
    temp_paragraph_list = []
    if paragraphs != None:
        for value in paragraphs:
            value = value.split(" ")
            paragraph_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in paragraph_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "paragraphs.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/paragraphs.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/paragraphs.pdf.html'

Headings

In [42]:
heading_text = list(df['headings'])
heading_text_tokenize = []
for headings in heading_text:
    temp_heading_list = []
    if headings != None:
        for value in headings:
            value = value.split(" ")
            heading_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in heading_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "headings.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/headings.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/headings.pdf.html'

Lisitings

In [43]:
listing_text = list(df['listings'])
listing_text_tokenize = []
for listings in listing_text:
    temp_listing_list = []
    if listings != None:
        for value in listings:
            value = value.split(" ")
            listing_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in listing_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "listings.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/listings.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/listings.pdf.html'

Links

In [44]:
link_text = list(df['link-text'])
link_text_tokenize = []
for links in link_text:
    temp_link_list = []
    if links != None:
        for value in links:
            value = value.split(" ")
            link_text_tokenize.append(value)
    else:
        pass

tokens = [item for sublist in link_text_tokenize for item in sublist]
tokens = list(filter(None, tokens))

analysis = FreqDist(tokens).most_common()
x, y = zip(*analysis)

fig = go.Figure()
fig.add_trace(go.Bar(x=np.array(x), y=y, name="Markets", marker_color="red"))
fig.update_layout(
                    title_text='Term Frequencies',
                    xaxis_title="Tokens",
                    yaxis_title="Frequency"
                    )
plotly.offline.plot(fig, filename = os.path.join("../plots/forums/term-frequency", "links.pdf"), auto_open=False)


Your filename `../plots/forums/term-frequency/links.pdf` didn't end with .html. Adding .html to the end of your file.



'../plots/forums/term-frequency/links.pdf.html'