# Scraping of 8kun videogames forum
We start by importing the `requests` library (to download pages) and BeautifulSoup (to parse them), and then fetch the main page.

In [50]:
import requests
from bs4 import BeautifulSoup

In [55]:
# Scrape and get links
def get_post_link(url: str, timeout: float = 30) -> list[str]:
    """
    Scrapes the page and returns the links to the full threads listed on it
    :param url: the url of the main page with all threads
    :param timeout: seconds to wait for the server before the request is abandoned
    :return: All urls to scrape
    :raises requests.exceptions.Timeout: if the server does not answer within ``timeout`` seconds
    """
    all_links_on_page: list[str] = []

    # Scrape. Without a timeout a stalled connection would block the notebook indefinitely.
    r = requests.get(url, timeout=timeout)

    # Next, we use BeautifulSoup to parse the HTML and look into the contents.
    soup = BeautifulSoup(r.content, 'html.parser')

    # The forum consists of multiple threads. Every thread is in a div item with class=thread, so we will find all of the threads first.
    threads = soup.find_all("div", {"class": "thread"})

    # Get links from threads
    for thread in threads:
        # Get the main thread post (original post).
        # NOTE(review): only posts whose class attribute is this exact string are picked up;
        # presumably that leaves out opening posts without a file or body - verify against the page markup.
        original_post = thread.find('div', {'class': 'post op has-file body-not-empty'})
        if original_post is None:
            # If it cannot be found, go to the next thread
            continue

        # Get the link to the full thread (looked up once, then reused)
        thread_anchor = original_post.find('a', {'class': 'open_thread_index'})
        if thread_anchor is None:
            continue

        # An anchor without an href cannot be followed, so it is skipped instead of raising KeyError
        href = thread_anchor.get('href')
        if href:
            all_links_on_page.append('https://8kun.top' + href)

    return all_links_on_page

In [57]:
from time import sleep
from random import randint

# Thread links collected so far: one inner list per index page, in page order.
urls_to_scrape: list[list[str]] = []

# The board has 10 index pages. The first one is served as index.html,
# the others as 2.html ... 10.html.
for page in range(1, 11):
    if page == 1:
        url = 'https://8kun.top/v/index.html'
    else:
        # Wait a random 1-7 seconds before every follow-up request.
        sleep(randint(1,7))
        url = f'https://8kun.top/v/{page}.html?'
    urls_to_scrape.append(get_post_link(url))

Next, we download the threads themselves. First, a quick look at the thread links we collected:

In [59]:
# Inspect the collected thread URLs (one inner list per index page that was scraped).
print(urls_to_scrape)

[['https://8kun.top/v/res/16960643.html', 'https://8kun.top/v/res/16960641.html', 'https://8kun.top/v/res/16960640.html', 'https://8kun.top/v/res/16960639.html', 'https://8kun.top/v/res/16960637.html', 'https://8kun.top/v/res/16944767.html', 'https://8kun.top/v/res/16960620.html', 'https://8kun.top/v/res/16960616.html', 'https://8kun.top/v/res/16960615.html', 'https://8kun.top/v/res/16960614.html', 'https://8kun.top/v/res/16959090.html', 'https://8kun.top/v/res/16960608.html', 'https://8kun.top/v/res/16960600.html', 'https://8kun.top/v/res/16960605.html', 'https://8kun.top/v/res/16960159.html'], ['https://8kun.top/v/res/16956390.html', 'https://8kun.top/v/res/16952150.html', 'https://8kun.top/v/res/16960516.html', 'https://8kun.top/v/res/16960466.html', 'https://8kun.top/v/res/16960480.html', 'https://8kun.top/v/res/16960456.html', 'https://8kun.top/v/res/16959924.html', 'https://8kun.top/v/res/16960467.html', 'https://8kun.top/v/res/16958787.html', 'https://8kun.top/v/res/16960459.htm

## Scrape all threads
Every thread lives on its own web page, so we send a new request for each thread URL collected above.

In [69]:
from pandas import DataFrame
from typing import Optional
import pandas as pd

# Seconds to wait for a thread page before the request is abandoned.
REQUEST_TIMEOUT_SECONDS = 30

# Number of characters after a '>>' marker that are taken as the quoted post number.
QUOTED_POST_NO_LENGTH = 8


def _extract_links(fragment: str) -> Optional[list[str]]:
    """
    Collects the links found in a piece of post text
    :param fragment: the text to search
    :return: every http:// or https:// link (cut at the next space), or None when the text does not contain 'http'
    """
    if 'http' not in fragment:
        return None

    # The piece before the first 'http' is ordinary text, so it is skipped; of the
    # remaining pieces only those that continue with '://' or 's://' are real links.
    return [
        'http' + piece.split(' ')[0]
        for piece in fragment.split('http')[1:]
        if piece.startswith(('://', 's://'))
    ]


def _split_reply_body(body_text: str) -> list[tuple[Optional[str], str, Optional[list[str]]]]:
    """
    Splits the text of one reply into rows, one per quoted post
    :param body_text: the full text of the reply body
    :return: (replies_to, text, links) tuples; replies_to is None for text that does not follow a '>>' marker
    """
    if '>>' not in body_text:
        return [(None, body_text, _extract_links(body_text))]

    leading, *quoted_segments = body_text.split('>>')
    rows: list[tuple[Optional[str], str, Optional[list[str]]]] = []

    # Text written before the first '>>' does not answer any post, so it gets
    # its own row instead of being cut up as if it started with a post number.
    if leading:
        rows.append((None, leading, _extract_links(leading)))

    for segment in quoted_segments:
        if not segment:
            continue
        quoted_text = segment[QUOTED_POST_NO_LENGTH:]
        # Links are taken from the stored text so the 'links' column always describes the 'text' column.
        rows.append((segment[:QUOTED_POST_NO_LENGTH], quoted_text, _extract_links(quoted_text)))

    return rows


video_games_data: Optional[DataFrame] = None

# One DataFrame per thread; they are concatenated once after the loop instead of
# re-copying the growing result on every iteration.
thread_frames: list[DataFrame] = []

for list_of_threads in urls_to_scrape:
    for url in list_of_threads:
        sleep(randint(1,7))
        # Without a timeout a stalled connection would block the whole scrape.
        r = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)

        soup_thread = BeautifulSoup(r.content, 'html.parser')

        original_post = soup_thread.find('div', {'class': 'post op has-file body-not-empty'})
        if original_post is None:
            continue

        # Get the title
        title = original_post.find('span', {'class': 'subject'})
        if title is None:
            continue
        # Keep the subject as plain text rather than the parsed element, so the column holds strings.
        thread_title = title.text

        # Every row carries all of its fields together, so the columns cannot
        # get out of step when a reply is skipped or split into several rows.
        thread_rows = []

        # Find all replies
        for reply in soup_thread.find_all('div', {'class': 'post reply body-not-empty'}):
            poster_id = reply.find('span', {'class': 'poster_id'})
            post_anchor = reply.find('a', {'class': 'post_no'})
            text = reply.find('div', {'class': 'body'})

            # A reply that lacks any of the three parts is skipped as a whole.
            if poster_id is None or post_anchor is None or text is None:
                continue
            href = post_anchor.get('href')
            if not href:
                continue

            # The post number is the fragment after '#' in the anchor's href.
            post_no = href.split('#')[-1]

            # A reply that quotes several posts yields one row per quoted post,
            # each repeating the poster ID and post number.
            for replies_to, post_text, links in _split_reply_body(text.text):
                thread_rows.append({
                    'thread_title': thread_title,
                    'post_number': post_no,
                    'poster_id': poster_id.text,
                    'replies_to': replies_to,
                    'text': post_text,
                    'links': links,
                })

        if thread_rows:
            thread_frames.append(pd.DataFrame(thread_rows))

# The index is deliberately not reset: it restarts at 0 for every thread, which the
# later `video_games_data.loc[5]` lookup in this notebook relies on.
if thread_frames:
    video_games_data = pd.concat(thread_frames)

## Data cleaning
First, we delete the `>` characters from the `text` column.

In [70]:
# Strip every literal '>' from the post text. The vectorised string accessor replaces
# the per-row lambda and leaves missing values untouched instead of raising on them;
# regex=False makes '>' a plain character rather than a pattern.
video_games_data['text'] = video_games_data['text'].str.replace(">", "", regex=False)
video_games_data['text']

0                                  Why isn't this loli?
1        Because technically she already is canonically
2                                              Yennefer
3                   tfw no mark and luciano r34why live
4                            he calls women cute Soyboy
                            ...                        
67    What do you mean no unlocks? Does it just get ...
68    Without something like the Peacock Emulator yo...
69    Female protagonists, seriously there are too many
70    sounds like you guys just want to go outside o...
71     not being able to save wherever you are on a map
Name: text, Length: 1319, dtype: object

In [71]:
# The index label 5 occurs more than once in the frame, so this selects the
# 'replies_to' value of every row carrying that label in a single lookup.
video_games_data.loc[5, 'replies_to']

5    16944876
5    16956409
5        None
5    16959979
5    16959626
5    16951489
5        None
5    16959200
5    16943940
5    16724866
5        None
5        None
5        None
5    16956505
5    16957102
5        None
5    16943533
5        None
5    16943541
5    16951361
5    16949658
5        None
5        None
Name: replies_to, dtype: object

In [72]:
# Display the scraped posts; as the last expression of the cell the frame is rendered as a table.
video_games_data

Unnamed: 0,thread_title,post_number,poster_id,replies_to,text,links
0,Who is the hottest Vidya girl,16944768,98f48b,,Why isn't this loli?,
1,Who is the hottest Vidya girl,16944769,f89fad,16944768,Because technically she already is canonically,
2,Who is the hottest Vidya girl,16944793,beab64,,Yennefer,
3,Who is the hottest Vidya girl,16944829,9d2654,16944821,tfw no mark and luciano r34why live,
4,Who is the hottest Vidya girl,16944867,f89fad,16944852,he calls women cute Soyboy,
...,...,...,...,...,...,...
67,Things that you hate in video games,16959218,e526a8,16953728,What do you mean no unlocks? Does it just get ...,
68,Things that you hate in video games,16959220,879cf4,16959218,Without something like the Peacock Emulator yo...,
69,Things that you hate in video games,16959517,55cb92,,"Female protagonists, seriously there are too many",
70,Things that you hate in video games,16959664,f49d91,,sounds like you guys just want to go outside o...,


In [73]:
# Save the scraped posts to a CSV file in the working directory.
# The index is written as well (first, unnamed column); its labels repeat across the frame.
video_games_data.to_csv("forum_8kun.csv")