### Data scraping from the flibusta.is website

In [14]:
!pip install selenium -q

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# Headless Chrome configuration shared by every browser instance below.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Main browser: stays on the genre listing while the reading links are
# collected, then is reused to fetch every reading page.
driver = webdriver.Chrome(options=chrome_options)
driver.get('https://flibusta.is/g/sf_fantasy')


links = []

# A single secondary browser visits all book pages. Starting a fresh Chrome
# per book and never quitting it leaves thousands of browser processes behind.
driver1 = webdriver.Chrome(options=chrome_options)
try:
    # NOTE(review): 9573 is hard-coded; presumably the number of book anchors
    # in the listing when this was written -- confirm before re-running.
    for a in range(1, 9573):
        url = driver.find_element(By.XPATH, f'//*[@id="main"]/form/ol/a[{a}]').get_attribute('href')
        driver1.get(url)

        try:
            for x in driver1.find_elements(By.XPATH, '//*[@id="main"]/div[3]/a[2]'):
                # Keep only the "(читать)" link that opens the online reader.
                if x.text == "(читать)":
                    links.append(x.get_attribute('href'))
        except WebDriverException as exc:
            # Best effort: a book page that cannot be inspected is skipped,
            # but the failure is reported instead of being silently dropped.
            print(f"Skipping {url}: {exc}")
finally:
    driver1.quit()

link = []
heading = []
content = []


for l in links:
    link.append(l)
    driver.get(l)
    heading.append(driver.find_element(By.CLASS_NAME, 'title').text)

    # Concatenate the text of every <p> element on the page. join() avoids
    # binding a module-level variable named `str`, which would shadow the
    # builtin for every later cell.
    content.append(''.join(p.text for p in driver.find_elements(By.TAG_NAME, 'p')))

df = pd.DataFrame({
    'link': link,
    'heading': heading,
    'content': content
})


df.to_csv('flibusta.csv', index=False)

In [26]:
import pandas as pd
import os

# Folder that receives one text file per scraped book. exist_ok lets the cell
# be re-run without failing because the folder is already there.
os.makedirs('input_folder', exist_ok=True)
df = pd.read_csv('flibusta.csv')

a = 0

for x in df['content']:
    # The counter advances for every row, so file N always maps to CSV row N.
    a += 1

    # An empty CSV cell is read back by pandas as NaN (a float), which cannot
    # be written as text. Skip it explicitly rather than leaving an empty file.
    if not isinstance(x, str):
        print(f"Skipping row {a}: content is not text")
        continue

    try:
        # Write as UTF-8 explicitly: the later cleaning step reads these files
        # back with encoding='utf-8'.
        with open(f'input_folder/{a}.txt', 'w', encoding='utf-8') as f:
            f.write(x)
    except OSError as exc:
        # Best effort: report the file that failed and carry on with the rest.
        print(f"Could not write input_folder/{a}.txt: {exc}")

### Bad words removal

In [27]:
import os
import re

# Load the bad-word list as a DataFrame. `pd` is not imported in this cell, so
# this relies on the pandas import from an earlier cell. The column named '0'
# is what gets passed to process_txt_files further down.
bad_words = pd.read_csv('russian_badwords.csv')

def replace_bad_words(text, bad_words):
    """Remove every occurrence of the given bad words from ``text``.

    Matching is case-insensitive and purely textual (no word boundaries), and
    each match is replaced by the empty string.

    Args:
        text: The text to clean.
        bad_words: Iterable of words to remove. Entries that are not non-empty
            strings (for example NaN values read from a CSV) are ignored.

    Returns:
        The text with all matches removed; ``text`` unchanged if there is
        nothing to search for.
    """
    # Regex alternation takes the first alternative that matches, so a short
    # word listed before a longer word it prefixes would leave the tail of the
    # longer word behind. Trying the longest words first avoids that.
    words = sorted(
        {word for word in bad_words if isinstance(word, str) and word},
        key=lambda word: (-len(word), word),
    )
    if not words:
        return text

    pattern = '|'.join(re.escape(word) for word in words)
    return re.sub(pattern, '', text, flags=re.IGNORECASE)

def process_txt_files(input_folder, output_folder, bad_words):
    """Clean every ``.txt`` file of ``input_folder`` into ``output_folder``.

    Each file is read as UTF-8, passed through ``replace_bad_words`` and
    written under the same file name in ``output_folder``, which is created
    if it does not exist yet. One progress line is printed per file.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for entry in os.listdir(input_folder):
        # Anything that is not a .txt file is left untouched.
        if not entry.endswith(".txt"):
            continue

        source_path = os.path.join(input_folder, entry)
        with open(source_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        filtered_text = replace_bad_words(raw_text, bad_words)

        target_path = os.path.join(output_folder, entry)
        with open(target_path, 'w', encoding='utf-8') as target:
            target.write(filtered_text)

        print(f"Processed file: {entry}")

# Run the bad-word filter over the scraped texts, passing the '0' column of
# the bad_words DataFrame as the word list.
# NOTE(review): the earlier cell writes the texts to the relative path
# 'input_folder', while this reads the absolute '/content/input_folder'; the
# two only coincide when the working directory is /content -- confirm.
input_folder = "/content/input_folder"
output_folder = "/content/cleaned_data"
process_txt_files(input_folder, output_folder, bad_words['0'])

Processed file: 1.txt
Processed file: 2.txt


### Deduplication using Simhash

In [30]:
!pip install simhash -q
from simhash import Simhash
import os


def hamming_distance(hash1, hash2):
    """Return the number of bit positions in which two integer hashes differ."""
    # XOR leaves a 1 exactly where the two values disagree.
    differing_bits = format(hash1 ^ hash2, 'b')
    return differing_bits.count('1')


def calculate_similarity_percentage(hash1, hash2):
    """Return the share of matching bits between two hashes, as a percentage.

    The hashes are treated as 128 bits wide: identical values give 100.0 and
    each differing bit lowers the result by 100/128.
    """
    total_bits = 128
    matching_bits = total_bits - hamming_distance(hash1, hash2)
    return (matching_bits / total_bits) * 100


def is_duplicate_file(file_hash, seen_hashes, similarity_threshold):
    """Tell whether ``file_hash`` is close enough to any already seen hash.

    Returns True as soon as one hash in ``seen_hashes`` reaches a similarity
    percentage of at least ``similarity_threshold``, and False otherwise
    (including when ``seen_hashes`` is empty).
    """
    return any(
        calculate_similarity_percentage(file_hash, known_hash) >= similarity_threshold
        for known_hash in seen_hashes
    )


def deduplicate_files_simhash(input_folder, output_folder, similarity_threshold=100):
    """Copy the ``.txt`` files of ``input_folder`` that are not Simhash duplicates.

    Files are visited in ``os.listdir`` order. Each one is fingerprinted with
    a 128-bit Simhash and compared against the fingerprints of the files kept
    so far. A file is written to ``output_folder`` (created if missing) only
    when no kept file reaches ``similarity_threshold`` percent similarity.
    A line is printed for every file saying whether it was saved or skipped.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    seen_hashes = []

    for name in os.listdir(input_folder):
        if not name.endswith(".txt"):
            continue

        with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as source:
            text = source.read()

        fingerprint = Simhash(text, f=128).value

        if is_duplicate_file(fingerprint, seen_hashes, similarity_threshold):
            print(f"Duplicate file skipped: {name}")
            continue

        # First occurrence: remember its fingerprint and keep the file.
        seen_hashes.append(fingerprint)
        with open(os.path.join(output_folder, name), 'w', encoding='utf-8') as target:
            target.write(text)
        print(f"Saved unique file: {name}")


# Deduplicate the bad-word-filtered files with Simhash. The default
# similarity_threshold of 100 is used, so a file is skipped only when its
# hash has zero differing bits from one already kept.
input_folder = "/content/cleaned_data"
output_folder = "/content/simhash_deduplicated"

deduplicate_files_simhash(input_folder, output_folder)

Saved unique file: 1.txt
Saved unique file: 2.txt


### Deduplication using Minhash


In [None]:
! pip install datasketch
from datasketch import MinHash

def tokenize(text):
    """Lower-case ``text`` and split it into whitespace-separated tokens."""
    lowered = text.lower()
    return lowered.split()

def create_minhash(tokens, num_perm=128):
    """Build a MinHash with ``num_perm`` permutations from the given tokens.

    Every token is UTF-8 encoded and fed to the MinHash in iteration order.
    """
    signature = MinHash(num_perm=num_perm)
    for encoded_token in (token.encode('utf8') for token in tokens):
        signature.update(encoded_token)
    return signature

def calculate_jaccard_similarity(minhash1, minhash2):
    """Return the result of ``minhash1.jaccard(minhash2)``."""
    estimate = minhash1.jaccard(minhash2)
    return estimate

def deduplicate_texts_minhash(texts, similarity_threshold=0.8, num_perm=128):
    """Return the texts that are not near-duplicates of an earlier kept text.

    Texts are processed in order. A text is dropped when its MinHash reaches
    a Jaccard similarity of at least ``similarity_threshold`` with the MinHash
    of any text already kept. Otherwise it is kept and becomes a reference
    for the texts that follow.
    """
    kept_texts = []
    kept_signatures = []

    for candidate in texts:
        signature = create_minhash(tokenize(candidate), num_perm)

        already_seen = any(
            calculate_jaccard_similarity(signature, earlier) >= similarity_threshold
            for earlier in kept_signatures
        )
        if already_seen:
            continue

        kept_signatures.append(signature)
        kept_texts.append(candidate)

    return kept_texts

def process_files_minhash(input_folder, output_folder, similarity_threshold=0.8, num_perm=128):
    """Remove near-duplicate lines inside every ``.txt`` file of a folder.

    Each file of ``input_folder`` is read as UTF-8 and split into lines. The
    lines are deduplicated with ``deduplicate_texts_minhash`` and the surviving
    lines are written under the same file name in ``output_folder`` (created
    if missing). One progress line is printed per file.

    Args:
        input_folder: Folder containing the ``.txt`` files to process.
        output_folder: Folder that receives the deduplicated files.
        similarity_threshold: Jaccard similarity at or above which a line is
            treated as a duplicate of an earlier one.
        num_perm: Number of MinHash permutations.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for filename in os.listdir(input_folder):
        if not filename.endswith(".txt"):
            continue

        input_file_path = os.path.join(input_folder, filename)
        with open(input_file_path, 'r', encoding='utf-8') as file:
            # readlines() keeps each line's trailing newline.
            texts = file.readlines()

        deduplicated_texts = deduplicate_texts_minhash(texts, similarity_threshold, num_perm)

        output_file_path = os.path.join(output_folder, filename)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            # The lines already end with '\n'; joining them with another '\n'
            # would insert a blank line after every line of the output.
            output_file.writelines(deduplicated_texts)

        print(f"Processed file: {filename}")

# Run line-level MinHash deduplication on the bad-word-filtered files.
# NOTE(review): the output folder is the one the Simhash step writes to, so
# files with the same names are overwritten there -- confirm this is intended
# and not a copy-paste leftover.
input_folder = "/content/cleaned_data"
output_folder = "/content/simhash_deduplicated"

process_files_minhash(input_folder, output_folder)


In [None]:
from datasketch import MinHash

def tokenize(text):
    """Return the whitespace-separated tokens of ``text`` in lower case."""
    normalized = text.lower()
    tokens = normalized.split()
    return tokens

def create_minhash(tokens, num_perm=128):
    """Return a MinHash (``num_perm`` permutations) updated with every token.

    Tokens are encoded as UTF-8 bytes before being added, in iteration order.
    """
    result = MinHash(num_perm=num_perm)
    for word in tokens:
        payload = word.encode('utf8')
        result.update(payload)
    return result

def calculate_jaccard_similarity(minhash1, minhash2):
    """Return whatever ``minhash1.jaccard(minhash2)`` reports as the similarity."""
    similarity = minhash1.jaccard(minhash2)
    return similarity

def is_duplicate_file(minhash, existing_minhashes, similarity_threshold):
    """Tell whether ``minhash`` is a near-duplicate of any existing MinHash.

    Returns True when at least one entry of ``existing_minhashes`` has a
    Jaccard similarity with ``minhash`` of ``similarity_threshold`` or more,
    and False otherwise (including for an empty collection).
    """
    # NOTE: this reuses the name of the Simhash-based is_duplicate_file defined
    # earlier in the file; whichever definition ran last is the one in effect.
    return any(
        calculate_jaccard_similarity(minhash, known) >= similarity_threshold
        for known in existing_minhashes
    )

def deduplicate_files_minhash(input_folder, output_folder, similarity_threshold=0.8, num_perm=128):
    """Copy the ``.txt`` files of ``input_folder`` that are not MinHash duplicates.

    Files are visited in ``os.listdir`` order. Each file is tokenized and
    turned into a MinHash. It is written to ``output_folder`` (created if
    missing) only when no previously kept file reaches ``similarity_threshold``
    Jaccard similarity. A line is printed for every file saying whether it was
    saved or skipped.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    existing_minhashes = []

    for name in os.listdir(input_folder):
        if not name.endswith(".txt"):
            continue

        with open(os.path.join(input_folder, name), 'r', encoding='utf-8') as source:
            text = source.read()

        signature = create_minhash(tokenize(text), num_perm)

        if is_duplicate_file(signature, existing_minhashes, similarity_threshold):
            print(f"Duplicate file skipped: {name}")
            continue

        # Not seen before: remember the signature and keep the file.
        existing_minhashes.append(signature)
        with open(os.path.join(output_folder, name), 'w', encoding='utf-8') as target:
            target.write(text)
        print(f"Saved unique file: {name}")

# Run file-level MinHash deduplication on the bad-word-filtered files with the
# default threshold (0.8) and number of permutations (128).
# NOTE(review): the output folder is the one the Simhash step writes to, so
# files with the same names are overwritten there -- confirm this is intended
# and not a copy-paste leftover.
input_folder = "/content/cleaned_data"
output_folder = "/content/simhash_deduplicated"

deduplicate_files_minhash(input_folder, output_folder)