In [None]:
import pandas as pd

In [None]:
# NOTE: read_pickle unpickles the file, so only point this at a pickle from a trusted source.
TWEETS_PICKLE_PATH = "data/persian_english_tweets_onehashtag_twomonths_processed.pkl"
df = pd.read_pickle(TWEETS_PICKLE_PATH)

In [None]:
# Keep only the rows whose "lang" value is exactly "en".
is_english = df["lang"].eq("en")
df = df.loc[is_english]

In [None]:
import re

# English number words that are stripped from tweets.
english_numbers_in_words = [
    'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten',
    'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen',
    'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
    'eighty', 'ninety', 'hundred', 'thousand', 'million', 'billion'
]

# Persian number words (the list mixes formal and informal spellings).
persian_numbers_in_words = [
    'یک', 'یه', 'دو', 'سه', 'چهار', 'پنج', 'شیش', 'شش', 'هفت', 'هشت', 'نه', 'ده',
    'یازده', 'دوازده', 'سیزده', 'چهارده', 'پانزده', 'پونزده', 'شانزده', 'شونزده', 'هفده', 'هیفده', 'هجده', 'نوزده', 'بیست',
    'سی', 'چهل', 'پنجاه', 'شصت', 'هفتاد', 'هشتاد', 'نود', 'صد', 'دویست',
    'سیصد', 'چهارصد', 'پانصد', 'ششصد', 'هفتصد', 'هشتصد', 'نهصد', 'هزار', 'میلیون', 'پونصد'
]

# Magnitude words that may be written fused onto the preceding number word
# (e.g. دههزار). Only these are accepted as a fused suffix: allowing *any*
# number word there would also delete ordinary words that merely happen to be
# spelled like two number words in a row.
_persian_magnitude_words = ['صد', 'هزار', 'میلیون']

# The alternations are built once and shared by every pattern below.
_english_alternation = '|'.join(english_numbers_in_words)
_persian_alternation = '|'.join(persian_numbers_in_words)
_persian_magnitude_alternation = '|'.join(_persian_magnitude_words)

# A run of English number words joined by hyphens, underscores or whitespace.
# The closing \b is placed after the *whole* run rather than after the first
# word only, which gives two guarantees:
#   * a continuation can never swallow the prefix of the following word
#     ("three tenants" matches "three", not "three ten");
#   * "_" really works as a separator ("twenty_two"), since no word boundary
#     is required between a word and the underscore that follows it.
english_number_pattern = (
    rf'(?:\b(?:{_english_alternation})(?:[-_\s]+(?:{_english_alternation}))*\b)'
)

# A Persian number word, optionally continued by " و <number word>" groups.
# Every word, including the continued ones, must end on a word boundary so a
# continuation cannot eat the start of an unrelated word.
persian_number_pattern = (
    rf'(?:\b(?:{_persian_alternation})\b(?:\sو\s(?:{_persian_alternation})\b)*)'
)

# A Persian number word at a word boundary, optionally with magnitude words
# fused directly onto it (e.g. دههزار), followed by a digit or a word boundary.
persian_number_combined_pattern = (
    rf'\b(?:{_persian_alternation})(?:{_persian_magnitude_alternation})*(?=\d|\b)'
)

def standardize_age_terms(text):
    """Replace age expressions with the ``mahsa_jina_age`` placeholder.

    An age expression is a number (standalone digits, or a run matched by the
    module-level ``english_number_pattern``), optional ``-``/``_``/whitespace
    separators, and then either ``year(s) old`` (the two words may be joined
    by any mix of ``-``, ``_`` and whitespace, or by nothing) or ``yo``.
    Matching is case-insensitive.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every age expression replaced by ``"mahsa_jina_age"``.
    """
    age_pattern = (
        rf'(?:\b\d+\b|{english_number_pattern})'
        r'[-_\s]*'
        # "years?[-_\s]*old" already covers yearold / yearsold / year-old /
        # years-old / "years old". The closing \b keeps the unit from matching
        # the start of a longer word ("22 young", "22 years older").
        r'(?:years?[-_\s]*old|yo)\b'
    )

    return re.sub(age_pattern, "mahsa_jina_age", text, flags=re.IGNORECASE)

def clean_tweet_with_standardized_ages(text, lang):
    """Clean one tweet and rewrite every age mention to a single fixed phrase.

    Steps, in order:
      1. age expressions -> ``mahsa_jina_age`` placeholder (so the number
         stripping below does not touch them);
      2. URLs, @mentions and #hashtags are removed;
      3. standalone digits and English / Persian number words are removed;
      4. for ``lang == 'en'``, characters in U+0600-U+06FF are removed;
      5. runs of ``.``, ``!`` or ``?`` are replaced by a single ``.``;
      6. one-character words are removed (contractions are left intact);
      7. whitespace is collapsed;
      8. the placeholder becomes ``"twenty-two years old"``, whatever number
         the tweet originally contained.

    Args:
        text: Raw tweet text. A non-string value (e.g. ``None`` or NaN)
            yields ``''``.
        lang: Language code of the tweet; only ``'en'`` triggers step 4.

    Returns:
        The cleaned tweet as a string.
    """
    # Missing tweets have nothing to clean; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Replace age terms with a placeholder
    text = standardize_age_terms(text)

    # Remove URLs, mentions, and hashtags
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)    # Remove mentions
    text = re.sub(r'#\w+', '', text)    # Remove hashtags

    # Remove numeric patterns (digits and number words in English and Persian)
    text = re.sub(r'\b\d+\b', '', text)  # Remove standalone numeric digits
    text = re.sub(english_number_pattern, '', text, flags=re.IGNORECASE)  # Remove English number words
    text = re.sub(persian_number_pattern, '', text)  # Remove Persian number words

    # Remove Persian number words matched by the combined pattern
    text = re.sub(persian_number_combined_pattern, '', text)

    # Remove non-English characters if the language is English
    if lang == 'en':
        text = re.sub(r'[\u0600-\u06FF]', '', text)  # Remove Persian characters

    # Collapse runs of '.', '!' or '?' into a single '.'
    text = re.sub(r'(\.{2,}|!{2,}|\?{2,})', '.', text)

    # Remove one-letter words. Both sides of an apostrophe are protected so a
    # contraction keeps its tail: without the lookbehind, "I'm" became "I'"
    # and "don't" became "don'". The typographic apostrophe is handled too.
    text = re.sub(r"(?<!['’])\b(?![a-zA-Z]['’])\w\b", '', text)

    # Remove extra whitespace
    text = ' '.join(text.split()).strip()

    # Replace the placeholder with the standardized age term
    text = text.replace("mahsa_jina_age", "twenty-two years old")

    return text

# Clean every tweet and keep only the rows that still have usable text
def preprocess_tweets(df):
    """Add a ``tweet_clean_embeddings`` column and drop rows left without text.

    Each row's ``tweet`` is cleaned with ``clean_tweet_with_standardized_ages``
    using that row's ``lang``. Cleaned tweets with fewer than three
    whitespace-separated words are treated as empty, and rows whose cleaned
    text is empty are dropped.

    The input frame is not modified: the new column is added to a copy. This
    also avoids writing into a frame that is itself a filtered slice.

    Args:
        df: DataFrame with at least the ``tweet`` and ``lang`` columns.

    Returns:
        A new DataFrame with the extra ``tweet_clean_embeddings`` column,
        restricted to the rows whose cleaned text is non-empty. The original
        index labels are kept.
    """
    # Iterating the two columns directly is cheaper than a row-wise
    # df.apply(axis=1) and, unlike it, also works when the frame has no rows.
    cleaned = [
        clean_tweet_with_standardized_ages(tweet, lang)
        for tweet, lang in zip(df['tweet'], df['lang'])
    ]

    # Blank out tweets with fewer than 3 words so they are dropped below
    cleaned = [tweet if len(tweet.split()) >= 3 else '' for tweet in cleaned]

    # dtype=object keeps the .str accessor usable even for an empty column
    cleaned = pd.Series(cleaned, index=df.index, dtype=object)
    result = df.assign(tweet_clean_embeddings=cleaned)

    # Drop rows with empty tweets
    return result[cleaned.str.strip() != '']

# Run the cleaning step; from here on `df` holds only the rows that kept usable text.
df = df.pipe(preprocess_tweets)

In [None]:
# Renumber the rows 0..n-1 and discard the old index. Reassigning the result
# (rather than inplace=True) avoids mutating a frame derived from a filter.
df = df.reset_index(drop=True)

In [2]:
from sklearn.model_selection import train_test_split
# Draw a reproducible sample of 1000 rows, stratified on the "date" column.
# train_test_split returns [train, test]; only the test part is kept. Indexing
# the result (instead of unpacking the unused half into `_`) leaves IPython's
# `_` last-output variable alone.
df_sampled = train_test_split(df, test_size=1000, stratify=df['date'], random_state=42)[1]

In [6]:
# Order the sample by "created_at". Reassigning the sorted result (rather than
# inplace=True) avoids mutating a frame that was derived from `df`.
df_sampled = df_sampled.sort_values(by="created_at")

In [7]:
# Columns written to the labelling spreadsheet, in this order.
EXPORT_COLUMNS = ["author id", "created_at", "tweet_id", "tweet", "tweet_clean_embeddings"]
df_sampled.to_excel("data/duplicates_labels.xlsx", columns=EXPORT_COLUMNS)