Data Loading

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# Model inputs: the 'lemmatized' text column is the only feature, 'Score' is the target.
# NOTE(review): an earlier version of this comment referred to a dataframe called
# 'df_imbal', but the code reads from `df` — confirm `df` is the intended frame.
X, y = df['lemmatized'], df['Score']

Train/Test Split

In [19]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

TF-IDF Vectorizer

In [20]:
# The vocabulary and IDF weights are learned from the training text only; the
# test text is merely projected onto that fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    vectorizer.transform(X_test),       # evaluated second: reuses the fitted state
)

Imbalance Handling

In [21]:
# Base classifier for the grid search.
# - class_weight='balanced' reweights samples inversely to class frequency, so
#   minority classes are not drowned out during fitting.
# - random_state is pinned because the 'liblinear' solver (the one selected in
#   the hyperparameter grid) shuffles the data; without a seed, repeated runs
#   can produce slightly different coefficients and scores.
logreg = LogisticRegression(class_weight='balanced', random_state=42)

Hyperparameter grid for tuning

In [22]:
# Search space for the logistic-regression grid search.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],   # inverse regularization strength
    penalty=['l1', 'l2'],        # regularization type
    solver=['liblinear'],        # handles both the l1 and l2 penalties
    max_iter=[100, 200, 300],    # iteration budget for the solver
)

GridSearchCV for hyperparameter tuning

In [23]:
# 5-fold cross-validated search over `param_grid`.
# Model selection uses macro-averaged F1 rather than plain accuracy: the target
# classes are imbalanced, and accuracy is dominated by the most frequent classes,
# which would work against the class_weight='balanced' setting on the estimator.
# Macro F1 gives every class equal weight when ranking candidates.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    cv=5,
    scoring='f1_macro',
    verbose=1
)

Fit grid search on imbalanced training data

In [24]:
# Run the cross-validated search on the vectorized training split only.
grid_search.fit(X=X_train_vec, y=y_train)


Fitting 5 folds for each of 30 candidates, totalling 150 fits




Fetch best model

In [25]:
# Winning estimator from the search, plus the parameter combination that produced it.
best_model = grid_search.best_estimator_
print(f"Best hyperparameters: {grid_search.best_params_}")


Best hyperparameters: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}


Evaluate on the test set


In [26]:
# Score the selected model on the held-out test split.
y_pred = best_model.predict(X_test_vec)

# Compute both summaries first, then print them under their headings.
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\nClassification Report:")
print(test_report)

print("\nConfusion Matrix:")
print(test_confusion)


Classification Report:
              precision    recall  f1-score   support

           1       0.42      0.47      0.45        40
           2       0.13      0.10      0.11        60
           3       0.35      0.36      0.35       100
           4       0.46      0.44      0.45       120
           5       0.46      0.53      0.49        80

    accuracy                           0.39       400
   macro avg       0.37      0.38      0.37       400
weighted avg       0.38      0.39      0.38       400


Confusion Matrix:
[[19  6  8  5  2]
 [15  6 24  6  9]
 [ 7 16 36 28 13]
 [ 2 16 24 53 25]
 [ 2  3 11 22 42]]


Handling Different Feature Types

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Columns routed to each preprocessing branch.
numeric_features = [
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'review_length',
    'Time',
]
text_feature = 'Text'

# Two-branch preprocessor:
#   'text' -> TF-IDF over unigrams and bigrams, capped at 10,000 features,
#             with English stop words removed
#   'num'  -> standard scaling of the numeric columns
# `text_feature` is passed as a bare string (not a list) so the vectorizer
# receives a 1-D column of documents.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'text',
            TfidfVectorizer(
                max_features=10000,
                ngram_range=(1, 2),
                stop_words='english',
            ),
            text_feature,
        ),
        ('num', StandardScaler(), numeric_features),
    ]
)


Train/Test Split

In [31]:
from sklearn.model_selection import train_test_split

# The whole dataframe is used as the feature frame and 'Score' as the target.
# NOTE(review): X therefore still contains the 'Score' column; it stays out of
# the model only as long as the ColumnTransformer selects an explicit column
# list and leaves the remaining columns out — confirm that stays true.
X, y = df, df['Score']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [32]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# imblearn pipeline: preprocess -> oversample with SMOTE -> classify.
# Using the imblearn Pipeline (rather than sklearn's) is what allows a sampler
# step such as SMOTE to sit in the middle of the pipeline.
pipeline = ImbPipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('smote', SMOTE(random_state=42)),
        (
            'clf',
            LogisticRegression(
                class_weight='balanced',
                max_iter=1000,
                random_state=42,
            ),
        ),
    ]
)


Hyperparameter tuning


In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Search space for the full pipeline. Keys follow the `<step>__<param>`
# convention, so they reach into the named pipeline steps.
param_grid = {
    # TF-IDF document-frequency thresholds
    "preprocessor__text__max_df": [0.7, 1.0],
    "preprocessor__text__min_df": [1, 5],
    # Logistic-regression settings
    "clf__C": [0.01, 0.1, 1, 10],
    "clf__penalty": ["l2"],
    "clf__solver": ["lbfgs"],
}

# 5-fold search over the whole pipeline, ranked by support-weighted F1 and run
# on all available cores.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Fit on the raw training frame; preprocessing and resampling happen inside the pipeline.
grid.fit(X_train, y_train)


Fitting 5 folds for each of 16 candidates, totalling 80 fits


Evaluation

In [34]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize

# Best pipeline found by the search (GridSearchCV refits it on the full
# training split by default).
best_model = grid.best_estimator_

y_pred = best_model.predict(X_test)

print(classification_report(y_test, y_pred))

# Multiclass ROC AUC (one-vs-rest, macro-averaged).
# The columns of predict_proba are ordered by the fitted model's `classes_`, so
# the one-hot encoding of y_test is built from that same attribute instead of
# from the labels found in `y`. This keeps truth and score columns aligned even
# if the two label sets were ever to differ.
y_prob = best_model.predict_proba(X_test)
y_test_bin = label_binarize(y_test, classes=best_model.classes_)
roc_auc = roc_auc_score(y_test_bin, y_prob, average='macro', multi_class='ovr')
print('Macro ROC-AUC:', roc_auc)


              precision    recall  f1-score   support

           1       0.49      0.47      0.48        40
           2       0.22      0.20      0.21        60
           3       0.48      0.42      0.45       100
           4       0.50      0.49      0.50       120
           5       0.49      0.61      0.54        80

    accuracy                           0.45       400
   macro avg       0.43      0.44      0.44       400
weighted avg       0.45      0.45      0.45       400

Macro ROC-AUC: 0.7298508569677872


In [35]:
import joblib

# Save the *fitted* pipeline selected by the grid search.
# `pipeline` itself is only the unfitted template: GridSearchCV fits clones of
# it and never mutates the original, so dumping `pipeline` would persist a
# model that cannot predict. `grid.best_estimator_` is the refit winner.
joblib.dump(grid.best_estimator_, 'model_B_imbalanced_pipeline.pkl')

print("✅ Imbalanced model pipeline saved as 'model_B_imbalanced_pipeline.pkl'")


✅ Imbalanced model pipeline saved as 'model_B_imbalanced_pipeline.pkl'
