# In [None]:
import re

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline

# Load the dataset
file_path = '/content/Text-classification.xlsx'
data = pd.read_excel(file_path)

# Blank spreadsheet cells are read as NaN. A row without text or without a
# label cannot be used for supervised training and would crash later steps
# (string methods on a float NaN, NaN labels passed to the resampler), so
# such rows are removed here, once, before anything else touches the data.
rows_before = len(data)
data = data.dropna(subset=['Text', 'Tag']).reset_index(drop=True)
rows_dropped = rows_before - len(data)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing Text/Tag")

# Check for class imbalance
class_distribution = data['Tag'].value_counts()
print("Class distribution:\n", class_distribution)

# Preprocessing Function
def preprocess_text(text):
    """Normalise one raw document before vectorisation.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, digits are removed, and runs of
    whitespace are collapsed to a single space with the ends trimmed.
    Underscores count as word characters and are therefore kept.

    Args:
        text: The raw cell value. Normally a ``str``; missing values
            (``None``, NaN, ``pd.NA``) and other scalars such as numbers
            are tolerated because spreadsheet columns are rarely clean.

    Returns:
        The cleaned string. Missing values yield ``''``.
    """
    if not isinstance(text, str):
        # A missing cell must not become the literal token "nan"/"none".
        if pd.isna(text):
            return ''
        # Numeric or other scalar cells: clean their string form instead of
        # raising AttributeError on .lower().
        text = str(text)

    text = text.lower()  # Lowercase the text
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Clean every document in the Text column, element by element
data['Text'] = data['Text'].map(preprocess_text)

# Split data into features and labels
X = data['Text']
y = data['Tag']

# Hold out a stratified test set BEFORE any fitting or resampling. Fitting
# TF-IDF or SMOTE on the full dataset first would leak information from the
# test rows into training and make the final report optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature extraction
vectorizer = TfidfVectorizer(max_features=10000)

# Step 1: Using SMOTE for balancing classes
# SMOTE is only ever fitted on training rows (see the pipeline below), so
# synthetic samples are never scored against the rows they were built from.
# NOTE(review): SMOTE's default neighbour count needs a handful of minority
# samples in every training fold - confirm the rarest class is large enough.
smote = SMOTE(random_state=42)


# Step 2: Employ appropriate evaluation metrics
def evaluate_model(model, X, y, cv=None):
    """Cross-validate ``model`` and print its mean F1 score.

    Args:
        model: An unfitted estimator (or pipeline). ``cross_val_score``
            fits a fresh clone on every fold, so ``model`` itself is left
            unfitted.
        X: Features accepted by ``model`` (raw texts when ``model`` is a
            pipeline that starts with a vectorizer).
        y: Target labels aligned with ``X``.
        cv: Optional cross-validation splitter. Defaults to a 5-fold
            ``StratifiedKFold``.

    Note:
        ``scoring='f1'`` is the binary F1 score; this presumably relies on
        ``Tag`` being a two-class 0/1 label - TODO confirm.
    """
    if cv is None:
        cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, X, y, cv=cv, scoring='f1')
    print("F1 Score: ", np.mean(scores))


# Step 3: Using stratified sampling during cross-validation
skf = StratifiedKFold(n_splits=5)

# Step 4: Regularization techniques (using L2 regularization in Logistic Regression)
logreg = LogisticRegression(C=1.0, penalty='l2', random_state=42)

# Step 5: Ensemble methods
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

ensemble_clf = VotingClassifier(estimators=[
    ('lr', logreg),
    ('rf', rf_clf)
], voting='soft')

# Evaluate the ensemble model
# Wrapping vectorizer -> SMOTE -> classifier in an imblearn pipeline makes
# each CV fold fit TF-IDF and resample on its own training part only; the
# validation part is transformed but never resampled.
cv_pipeline = ImbPipeline(steps=[
    ('tfidf', vectorizer),
    ('smote', smote),
    ('clf', ensemble_clf),
])
evaluate_model(cv_pipeline, X_train, y_train, cv=skf)

# Evaluate on held-out data: train on the training split only, then score
# the untouched test split.
X_train_res, y_train_res = smote.fit_resample(
    vectorizer.fit_transform(X_train), y_train
)
ensemble_clf.fit(X_train_res, y_train_res)
X_test_vect = vectorizer.transform(X_test)
y_pred = ensemble_clf.predict(X_test_vect)

print(classification_report(y_test, y_pred))
# Column 1 of predict_proba is the probability of the second class, so this
# AUC is only meaningful for a binary target.
print("AUC-ROC: ", roc_auc_score(y_test, ensemble_clf.predict_proba(X_test_vect)[:, 1]))

# Fit the final model
# The metrics above describe the train-split model. The artefacts that get
# saved are refitted on every available row so no labelled data is wasted.
X_res, y_res = smote.fit_resample(vectorizer.fit_transform(X), y)
ensemble_clf.fit(X_res, y_res)

# Save the model and vectorizer
model_path = '/content/ensemble_model.pkl'
vectorizer_path = '/content/vectorizer.pkl'
joblib.dump(ensemble_clf, model_path)
joblib.dump(vectorizer, vectorizer_path)
