In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Read the raw data from the 'Sheet1' worksheet of the source workbook.
# NOTE(review): the '/content/' prefix looks like a Colab working directory — confirm.
file_path = '/content/Texts_file.xlsx'
df = pd.read_excel(io=file_path, sheet_name='Sheet1')

# Text cleaning function
def clean_text(text):
    """Normalize one raw text value before vectorization.

    Newline and carriage-return characters are each replaced with a single
    space, then punctuation (any character that is neither a word character
    nor whitespace) and digits are deleted. Letter case, underscores and
    existing runs of whitespace are left untouched.

    Args:
        text: The value to clean. Normally a ``str``. ``None`` and float NaN
            (how pandas represents an empty spreadsheet cell) are treated as
            empty text; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string (``''`` for missing values).
    """
    # NaN is the only float that is not equal to itself; re.sub would raise
    # TypeError on it, so treat missing values as empty text instead.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Line breaks become spaces so the words on either side stay separated.
    text = re.sub(r'[\n\r]', ' ', text)
    # Punctuation and digits are simply dropped (one pass covers both).
    return re.sub(r'[^\w\s]|\d+', '', text)

# Empty Text/Tag cells are read from Excel as NaN. A NaN text cannot be
# cleaned with regexes and a row without a label cannot be assigned to a
# class, so drop such rows up front instead of failing midway.
df = df.dropna(subset=['Text', 'Tag']).reset_index(drop=True)

# Apply text cleaning (cast first so purely numeric cells are handled as text)
df['Text'] = df['Text'].astype(str).apply(clean_text)

# Split the data into features and target variable
X = df['Text']
y = df['Tag']

# Vectorize the text data as TF-IDF weights, capping the vocabulary at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000)
X_vect = vectorizer.fit_transform(X)

# Apply SMOTE to balance the dataset (fixed seed for reproducible output)
# NOTE(review): both the vectorizer and SMOTE are fit on the full dataset.
# If the saved file is later split into train/test sets for evaluation,
# information from test rows will have leaked into the synthetic samples —
# confirm this is intended, or resample only the training split.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_vect, y)

# Convert the balanced data back to a DataFrame: one column per vocabulary
# term, with the label column first. toarray() materializes the full dense
# matrix, so memory use grows with rows x vocabulary size.
X_balanced_df = pd.DataFrame(X_balanced.toarray(), columns=vectorizer.get_feature_names_out())
y_balanced_df = pd.DataFrame(y_balanced, columns=['Tag'])
balanced_df = pd.concat([y_balanced_df, X_balanced_df], axis=1)

# Save the balanced dataset to an Excel file (row index omitted)
output_file_path = '/content/balanced_dataset.xlsx'
balanced_df.to_excel(output_file_path, index=False)

# Bare expression: shown as the cell's output when run in a notebook.
output_file_path
