In [None]:
import pandas as pd
import re

# Characters stripped outright by clean_text (accented letters, symbols,
# punctuation, and the control/whitespace characters \n, \t, \xa0, \x7f).
_CHARACTERS_TO_REMOVE = ['ç', '–', 'é', '∂', '¥', 'ü', '™', '~', '∏', '‡', '¢', '´', 'Ì', 'º',
                         '‰', 'ë', 'Ç', '¨', 'ì', 'Í', '|', 'Õ', '[', '\uf8ff', '≤', '⁄',
                         '\xa0', '@', '(', '$', '*', '∞', ';', 'ú', 'Æ', 'ê', '≠', '◊', 'ƒ',
                         'Á', '°', '#', '%', ')', '≥', '&', '«', 'Ü', 'Ô', '\\', '∫', 'å',
                         'É', 'Â', 'ñ', '\n', 'Ë', 'í', '¬', 'Ø', 'ù', 'Ω', '\x7f', 'È',
                         'Ñ', '}', '®', '√', '{', 'ö', ':', '§', '+', '\t', 'à', 'Ö', '…',
                         'Û', 'æ', '/', '‚', 'á', 'î', 'ó', '_', '—', '∆', '•', 'œ', 'è',
                         'ï', 'õ', 'Ó', '’', '≈', '±', '·', '€', 'ã', '„', '¶', '∑', '†',
                         'ª', 'Œ', 'Ï', '`', 'û', 'Ÿ', '^', 'µ', 'À', '”', 'Å', 'ø', 'Î',
                         'ß', 'Ê', 'Ã', '>', 'ô', 'π', 'â', '“', 'ÿ', '£', 'ä', 'ò', '=',
                         'Ä', '©', '-']

# Patterns are compiled once at definition time instead of being rebuilt on
# every call (the function runs once per row of the dataset).
_URL_RE = re.compile(r"http\S+|www\S+|https\S+")
_HTML_TAG_RE = re.compile(r'<.*?>')
_EMAIL_RE = re.compile(r'\S+@\S+')
_USERNAME_RE = re.compile(r'@\w+')
_SPECIAL_CHARS_RE = re.compile(
    f"[{''.join(re.escape(char) for char in _CHARACTERS_TO_REMOVE)}]"
)


def clean_text(text):
    """Strip URLs, HTML tags, e-mail addresses, @usernames and special characters.

    Parameters
    ----------
    text : str, scalar, or pandas.Series
        * ``str`` -- cleaned and returned as a new string.
        * missing scalar (``None``, ``NaN``, ``pd.NA`` ...) -- returned unchanged so
          the caller can drop or fill it at the DataFrame level.
        * any other scalar -- converted with ``str()`` and then cleaned.
        * ``pandas.Series`` -- missing entries are dropped and every remaining
          element is cleaned; a new Series is returned.

    Returns
    -------
    str, the original missing value, or pandas.Series
        Matches the kind of input as described above.

    Notes
    -----
    Matches are deleted, not replaced with a space, so surrounding whitespace
    is left as-is and characters such as newlines or hyphens join their
    neighbours (``"a-b"`` becomes ``"ab"``).
    """
    # Whole-column input: this is the only case where dropna() is meaningful.
    if isinstance(text, pd.Series):
        return text.dropna().map(clean_text)

    # Element-wise input (e.g. via Series.apply): strings have no dropna(),
    # and missing cells must not reach re.sub, which only accepts str.
    if not isinstance(text, str):
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return text
        text = str(text)

    # Order matters: e-mail addresses must go before bare @usernames, and
    # both before the special-character pass, which deletes '@' itself.
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _EMAIL_RE.sub('', text)
    text = _USERNAME_RE.sub('', text)
    text = _SPECIAL_CHARS_RE.sub('', text)

    return text

In [None]:
# Read the raw spreadsheet into a DataFrame.
file_path = 'path-to-dataset'
df = pd.read_excel(io=file_path)

In [None]:
# Drop rows whose text is missing, then clean the remaining text.
# Missing values are removed here, at the DataFrame level, because
# clean_text is invoked once per cell and cannot remove rows itself.
# assign() builds a new frame rather than mutating the filtered one in place.
df = (
    df.dropna(subset=['text'])
      .assign(text=lambda frame: frame['text'].apply(clean_text))
      .reset_index(drop=True)
)

# Save the cleaned file
output_path = 'path-to-save'
df.to_excel(output_path, index=False)

# Row count after rows with missing text have been dropped.
print(f"{len(df)} rows remaining.")

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the cleaned data back from disk.
file_path = 'path-to-dataset'
df = pd.read_excel(file_path)

# Separate the rows by label; only rows whose 'class' is 0 or 1 are kept.
df_class_0, df_class_1 = (df[df['class'] == label] for label in (0, 1))

# Split each class on its own (70% train / 30% test) so both labels keep the
# same proportion in the two output sets. Note this does not rebalance the
# classes relative to each other.
split_options = {'test_size': 0.3, 'random_state': 42}
train_class_0, test_class_0 = train_test_split(df_class_0, **split_options)
train_class_1, test_class_1 = train_test_split(df_class_1, **split_options)


def _shuffled(frame):
    """Return the rows of ``frame`` in a seeded random order with a fresh index."""
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


# Recombine the per-class pieces and shuffle so the labels are interleaved.
train_df = _shuffled(pd.concat([train_class_0, train_class_1]))
test_df = _shuffled(pd.concat([test_class_0, test_class_1]))

# Write both splits to disk.
train_output_path = 'path-to-save-train_dataset.xlsx'
test_output_path = 'path-to-save-test_dataset.xlsx'
for frame, destination in ((train_df, train_output_path), (test_df, test_output_path)):
    frame.to_excel(destination, index=False)

print(f"Training set size: {len(train_df)} rows")
print(f"Test set size: {len(test_df)} rows")