In [None]:
import glob
import pandas as pd
import os

# Root folder that holds the data fragments. Every "*.csv" file found below it
# (recursively) is read and all fragments are combined into a single DataFrame.
folder_path = ""
all_dfs = []

for root, dirs, files in os.walk(folder_path):
    # Sorting the directory list in place (and the file names below) makes the
    # traversal order -- and therefore the row order of the combined frame --
    # the same on every run.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path, sep="|")
            all_dfs.append(df)

# pd.concat raises a generic "No objects to concatenate" on an empty list;
# fail with a message that points at the actual cause instead.
if not all_dfs:
    raise ValueError(f"No .csv files found under folder_path={folder_path!r}")

# Combine to a single df
concatenated_df = pd.concat(all_dfs, ignore_index=True)
# 'Unnamed: 0' is the name pandas gives to a header-less first column
# (presumably a saved index). errors='ignore' keeps this step from failing
# when the fragments do not contain it.
concatenated_df = concatenated_df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Language selection: keep only the posts that fastText identifies as Russian ('ru').
import fasttext
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()
# Load the fastText model used by detect_lang; the relative path means
# "lid.176.bin" must be in the current working directory.
model = fasttext.load_model("lid.176.bin")

def detect_lang(text):
    """Return the top predicted language label for ``text`` (e.g. ``'ru'``).

    Args:
        text (str): Text to classify; it may contain line breaks.

    Returns:
        str: The best label with the ``__label__`` prefix removed, or ``''``
        when the model returns no label at all.
    """
    # fastText's predict() handles one line at a time and rejects "\n",
    # so line breaks are flattened to spaces first.
    labels = model.predict(text.replace("\n", " "), k=1)[0]
    if len(labels) == 0:
        # No prediction available: return the same "unknown" marker ('') that
        # the caller uses for non-string values instead of raising IndexError.
        return ''
    return labels[0].replace("__label__", "")

# Label every post with its detected language; values that are not strings
# (e.g. missing text) get an empty label.
concatenated_df['lang'] = [
    detect_lang(value) if isinstance(value, str) else ''
    for value in concatenated_df['text']
]
# Keep only the rows labelled as Russian.
df = concatenated_df.loc[concatenated_df['lang'].eq('ru')]

In [None]:
# Append the link title, the link description and the repost text to the post text (skipping duplicates):

# Replace missing values with empty strings, then remove the
# 'Запись удалена' ("post deleted") placeholder from the post text and the
# repost text.
df = df.fillna('').assign(
    text=lambda frame: frame['text'].str.replace('Запись удалена', ""),
    repost_text=lambda frame: frame['repost_text'].str.replace('Запись удалена', ""),
)
# The same clean-up for 'history_text' is currently disabled:
# df['history_text']=df['history_text'].str.replace('Запись удалена', "")

def merge_texts(row):
    """Join a post's text with its link title, link description and repost text.

    The fragments are taken in the order ``text``, ``link_title``,
    ``link_description``, ``repost_text``. Empty or whitespace-only fragments
    and fragments identical to one already collected are skipped, so the
    result never starts with a dangling ``". "`` when the post text is empty.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``"text"``, ``"link_title"``, ``"link_description"`` and
            ``"repost_text"``.

    Returns:
        str: The fragments joined with ``". "``, stripped of surrounding
        whitespace; ``""`` when every fragment is empty.
    """
    parts = []
    # "history_text" is currently not merged; add it to this tuple to include it.
    for column in ("text", "link_title", "link_description", "repost_text"):
        value = row[column]
        # Skip fragments that carry no content (this includes an empty post text).
        if not value or not str(value).strip():
            continue
        # Skip exact duplicates of a fragment that is already part of the result.
        if value not in parts:
            parts.append(value)

    return ". ".join(map(str, parts)).strip()

# Overwrite the 'text' column with the merged text; axis=1 passes each row
# of the frame to merge_texts.
df["text"] = df.apply(merge_texts, axis=1)


In [None]:
# Drop the helper columns that are no longer needed. Reassigning the result
# (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second run no longer fails with KeyError on columns that are
# already gone.
# To drop 'history_text' as well, add it to the list below.
# NOTE(review): 'link_title' is merged into 'text' by merge_texts but is kept
# here -- confirm that this is intended.
df = df.drop(columns=['link_description', 'repost_text', 'lang'], errors='ignore')

In [None]:
import re
from collections import Counter

import nltk
import pymorphy2
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Russian stop words. "не" is excluded from the stop list, so it is retained
# in the lemmatized output.
russian_stopwords = stopwords.words("russian")
rsw = [word for word in russian_stopwords if word != "не"]

# A token is a run of Cyrillic letters and '#'. Ё/ё are listed explicitly
# because they lie outside the А-Я / а-я ranges. Inside a character class
# '|' is a literal pipe rather than alternation, so it must not appear here;
# otherwise pipe characters would be glued into the tokens.
tokenizer = RegexpTokenizer('[А-ЯЁа-яё#]+')

# Lemmatization: create the morphological analyzer once and reuse it for
# every token.
morph = pymorphy2.MorphAnalyzer()

def preprocessing(plain_text):
    """Tokenize, filter and lemmatize ``plain_text``.

    The text is lower-cased and split with the module-level ``tokenizer``.
    Tokens of length 1 and tokens containing ``'#'`` are discarded; every
    remaining token is replaced by the normal form of its first ``morph``
    parse, and lemmas listed in ``rsw`` are dropped.

    Args:
        plain_text (str): Raw text.

    Returns:
        list: Lemmas in their order of appearance.
    """
    lemmas = []
    for token in tokenizer.tokenize(plain_text.lower()):
        if len(token) <= 1 or '#' in token:
            continue
        lemma = morph.parse(token)[0].normal_form
        # The stop-word check is applied to the lemma, not to the raw token.
        if lemma not in rsw:
            lemmas.append(lemma)
    return lemmas

def form_bigrams_list(preprocessed_text):
    """Return the distinct bigrams of ``preprocessed_text`` as ``'w1_w2'`` strings.

    Args:
        preprocessed_text: Sequence of word strings.

    Returns:
        list: One ``'_'``-joined string per distinct pair of adjacent words,
        in order of first occurrence.
    """
    # dict.fromkeys removes repeated pairs while keeping first-seen order.
    distinct_pairs = dict.fromkeys(nltk.bigrams(preprocessed_text))
    return ['_'.join(pair) for pair in distinct_pairs]
    

In [None]:
from tqdm import tqdm

# Register tqdm's `progress_apply` on pandas objects; it is needed for the
# progress_apply call further down in this cell.
tqdm.pandas()

def process_text(text):
    """Return ``(unigrams, bigrams)`` for a single text.

    Args:
        text: Text passed to ``preprocessing``.

    Returns:
        tuple: The result of ``preprocessing(text)`` and the result of
        ``form_bigrams_list`` applied to it.
    """
    unigrams = preprocessing(text)
    return unigrams, form_bigrams_list(unigrams)

# Process every post once; each result is a (unigrams, bigrams) tuple.
processed_pairs = df['text'].progress_apply(process_text)
# Split the tuples into two columns. Mapping over the tuples avoids
# `.apply(pd.Series)`, which builds a separate Series object for every row.
df['unigrams'] = processed_pairs.map(lambda pair: pair[0])
df['bigrams'] = processed_pairs.map(lambda pair: pair[1])

In [None]:
# Remove IDs and other personal information from the post texts.
import re

def clean_personal_data(text):
    """Mask or remove personal data in ``text``.

    VK links, e-mail addresses, card numbers and phone numbers are replaced by
    the placeholders ``[LINK]``, ``[EMAIL]``, ``[CARD]`` and ``[PHONE]``;
    ``@`` mentions and ``id<digits>`` / ``club<digits>`` identifiers are deleted.

    The substitutions run from the most specific pattern to the most general
    one, because a general pattern applied too early destroys the input of a
    specific one:

    * ``@\\S+`` would cut ``user@example.com`` down to ``user``, so e-mail
      addresses are masked before mentions are removed;
    * the id pattern would turn ``https://vk.com/id123`` into
      ``https://vk.com/``, which the link pattern no longer matches, so links
      are masked before ids are removed;
    * the phone pattern can match inside a 16-digit card number, so card
      numbers are masked before phone numbers.

    Args:
        text (str): Text to clean.

    Returns:
        str: The cleaned text with leading and trailing whitespace stripped.
    """
    # Links to VK pages
    text = re.sub(r'https?://(?:m\.)?vk\.com/\S+', '[LINK]', text)

    # Email addresses
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '[EMAIL]', text)

    # Card numbers (16 digits in groups of four)
    text = re.sub(r'\b(?:\d{4}[-\s]?){3}\d{4}\b', '[CARD]', text)

    # Phone numbers starting with +7 or 8
    text = re.sub(r'(\+7|8)[\s\-]?\(?\d{3}\)?[\s\-]?\d{2,3}[\s\-]?\d{2,3}[\s\-]?\d{2,4}', '[PHONE]', text)

    # Mentions via @
    text = re.sub(r'@\S+', '', text)

    # User and group IDs
    text = re.sub(r'\b(id|club)\d+\b', '', text)

    # Only the ends are trimmed; gaps left inside the text by removals stay as they are.
    return text.strip()

# Scrub personal data from the post texts.
df['text'] = df['text'].apply(clean_personal_data)

# Output location -- fill in both values before running. They are defined here
# explicitly so that the save step does not depend on variables left over from
# other cells.
folder = ""  # directory to write to
file = ""    # output file name
if not file:
    raise ValueError("Set `folder` and `file` to the output location before saving.")

# os.path.join inserts the path separator that plain `folder + file` would omit.
# NOTE: the index is written as an unnamed first column (pandas default).
df.to_csv(os.path.join(folder, file), sep='|', encoding='utf-8')
