# Extracting the data from the scraped 8chan forum (Dark Web)
We start by importing the scraped data that was saved in the txt file.

In [1]:
import ast

with open('8chan_dark_web.txt', 'r') as text_file:
    content = ''
    chunk = text_file.read(1024)  # Read 1KB at a time
    while chunk:
        content += chunk
        chunk = text_file.read(1024)

    # Convert the string list to actual list
    video_games_content_list = ast.literal_eval(content)

In [48]:
from bs4 import BeautifulSoup
from pandas import DataFrame

video_games_data: DataFrame

# Get information from the page content
title_list = []
poster_id_list = []
post_number_list = []
replies_to_list = []
post_text_list = []

for page_content in video_games_content_list:
    soup = BeautifulSoup(page_content)

    # Find the original post and the replies
    original_post, replies = soup.find('div', {'class': 'innerOP'}), soup.find('div', {'class': 'divPosts'})

    # If either of them cannot be found, skip this entry
    if original_post is None or replies is None:
        continue

    # Get the title
    title = original_post.find('span', {'class': 'labelSubject'})
    if title is not None:
        title = title.text

    # Go over all replies
    for reply in replies:
        # Get the poster ID
        poster_id = reply.find('span', {'class': 'labelId'})
        if poster_id is not None:
            poster_id = poster_id.text

        # Get the post number
        post_no = reply.find('a', {'class': 'linkQuote'})
        if post_no is not None:
            post_no = post_no.text

        # Get the text of the post
        text = reply.find('div', {'class': 'divMessage'})
        if text is None:
            print('text is none!')
            continue

        # Get all replies separately
        if '>>' in text.text:
            splitted_text = text.text.split('>>')
            for split in splitted_text:
                if len(split) > 0:
                    title_list.append(title)
                    poster_id_list.append(poster_id)
                    post_number_list.append(post_no)
                    replies_to_list.append(split[:6])
                    post_text_list.append(split[6:])
        else:
            title_list.append(title)
            poster_id_list.append(poster_id)
            post_number_list.append(post_no)
            replies_to_list.append(None)
            post_text_list.append(text.text)

In [49]:
print(len(title_list), len(poster_id_list), len(post_number_list), len(replies_to_list), len(post_text_list))

32420 32420 32420 32420 32420


In [51]:
from typing import Optional
import pandas as pd
from bs4 import BeautifulSoup
from pandas import DataFrame

video_games_data: Optional[DataFrame] = None

# Get information from the page content
title_list = []
poster_id_list = []
post_number_list = []
replies_to_list = []
post_text_list = []

for page_content in video_games_content_list:
    soup = BeautifulSoup(page_content)

    # Find the original post and the replies
    original_post, replies = soup.find('div', {'class': 'innerOP'}), soup.find('div', {'class': 'divPosts'})

    # If either of them cannot be found, skip this entry
    if original_post is None or replies is None:
        continue

    # Get the title
    title = original_post.find('span', {'class': 'labelSubject'})
    if title is not None:
        title = title.text

    # Go over all replies
    for reply in replies:
        # Get the poster ID
        poster_id = reply.find('span', {'class': 'labelId'})
        if poster_id is not None:
            poster_id = poster_id.text

        # Get the post number
        post_no = reply.find('a', {'class': 'linkQuote'})
        if post_no is not None:
            post_no = post_no.text

        # Get the text of the post
        text = reply.find('div', {'class': 'divMessage'})
        if text is None:
            print('text is none!')
            continue

        # Get all replies separately
        if '>>' in text.text:
            splitted_text = text.text.split('>>')
            for split in splitted_text:
                if len(split) > 0:
                    poster_id_list.append(poster_id)
                    post_number_list.append(post_no)
                    replies_to_list.append(split[:6])
                    post_text_list.append(split[6:])
        else:
            poster_id_list.append(poster_id)
            post_number_list.append(post_no)
            replies_to_list.append(None)
            post_text_list.append(text.text)

    data = {
        'thread_title':title,
        'post_number':post_number_list,
        'poster_id': poster_id_list,
        'replies_to': replies_to_list,
        'text': post_text_list,
    }

    if video_games_data is None:
        video_games_data = pd.DataFrame(data)
    else:
        video_games_data = pd.concat([video_games_data, pd.DataFrame(data)])

In [52]:
video_games_data

Unnamed: 0,thread_title,post_number,poster_id,replies_to,text
0,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823411,f065dd,,Anchor post. Link your demos here.
1,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823419,f065dd,823411,\nHere is my GZDoom weapon mod (Renamon Doom M...
2,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823454,1bd3d8,823411,\nNEON BLASTER (5/5/2023 BUILD)\n\nMega link: ...
3,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823660,f065dd,823454,\nPlayed a bit of your demo. Well it works und...
4,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823804,5c3c09,"Ehehe,",looking about as sparse as last demo day too....
...,...,...,...,...,...
32415,SMT Imagine lawsuit is done,781689,94f870,781299,\nDeleting everything on github was explicitly...
32416,SMT Imagine lawsuit is done,783223,477034,781689,\nIf r*ddit can...
32417,SMT Imagine lawsuit is done,784839,94f870,783223,\nAnd there are still several forks of the cod...
32418,SMT Imagine lawsuit is done,802667,94713d,,"Did anything ever happen with this, such as so..."


## Data cleaning
First, we delete > and \n from the text.

In [53]:
video_games_data['text'] = video_games_data['text'].apply(lambda x: x.replace(">", "").replace("\n", ""))
video_games_data['text']

0                       Anchor post. Link your demos here.
1        Here is my GZDoom weapon mod (Renamon Doom Mod...
2        NEON BLASTER (5/5/2023 BUILD)Mega link: https:...
3        Played a bit of your demo. Well it works under...
4          looking about as sparse as last demo day too...
                               ...                        
32415    Deleting everything on github was explicitly o...
32416                                     If r*ddit can...
32417    And there are still several forks of the code ...
32418    Did anything ever happen with this, such as so...
32419                      It sounds pretty much the same.
Name: text, Length: 3210188, dtype: object

In [55]:
video_games_data

Unnamed: 0,thread_title,post_number,poster_id,replies_to,text
0,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823411,f065dd,,Anchor post. Link your demos here.
1,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823419,f065dd,823411,Here is my GZDoom weapon mod (Renamon Doom Mod...
2,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823454,1bd3d8,823411,NEON BLASTER (5/5/2023 BUILD)Mega link: https:...
3,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823660,f065dd,823454,Played a bit of your demo. Well it works under...
4,OFFICIAL DEMO DAY THREAD ~ /v/ + /agdg/,823804,5c3c09,"Ehehe,",looking about as sparse as last demo day too...
...,...,...,...,...,...
32415,SMT Imagine lawsuit is done,781689,94f870,781299,Deleting everything on github was explicitly o...
32416,SMT Imagine lawsuit is done,783223,477034,781689,If r*ddit can...
32417,SMT Imagine lawsuit is done,784839,94f870,783223,And there are still several forks of the code ...
32418,SMT Imagine lawsuit is done,802667,94713d,,"Did anything ever happen with this, such as so..."


We save the data as csv.

In [56]:
video_games_data.to_csv("forum_dark_web_8chan.csv")