In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
import pickle

In [16]:
with open("../output_preprocessing/positive.pkl", "rb") as f:   # rb = read-binary
    positive = pickle.load(f)

with open("../output_preprocessing/negative_filtered.pkl", "rb") as f:   # rb = read-binary
    negative = pickle.load(f)


processed_data = positive+negative
labels = len(positive)*[1]+len(negative)*[0]

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# --- Example inputs ---
# processed_data = pd.Series(["good movie", "bad film", "excellent acting"])  # tokenized/preprocessed text
# labels = [1, 0, 1]  # binary labels

# --- Vectorize text ---
vectorizer = TfidfVectorizer()   # or CountVectorizer(), HashingVectorizer(), etc.
X = vectorizer.fit_transform(processed_data)

# --- Train/test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

# --- Define classifiers ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "SVM": SVC(probability=True),
    "Naive Bayes": MultinomialNB(),      # suited for text
    "KNN": KNeighborsClassifier()
}

results = {}

# --- Evaluate each model ---
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Some classifiers (like SVC without probability=True) don’t support predict_proba
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    results[name] = {"Accuracy": acc, "ROC_AUC": auc}

# --- Display results ---
results_df = pd.DataFrame(results).T
print(results_df)


                     Accuracy   ROC_AUC
Logistic Regression  0.868413  0.936861
Random Forest        0.864546  0.932067
Gradient Boosting    0.870057  0.938583
SVM                  0.873151  0.940325
Naive Bayes          0.850624  0.917716
KNN                  0.725902  0.840479


In [14]:
len(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3422321 stored elements and shape (41368, 120818)>

In [19]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report


# Train-test split
X_train, X_test, y_train, y_test = train_test_split(processed_data, labels, test_size=0.2, random_state=42)

# -------------------------------
# 1. Logistic Regression Pipeline
# -------------------------------
lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('clf', LogisticRegression(max_iter=2000, solver='liblinear'))
])

lr_params = {
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)],
    'tfidf__min_df': [1, 2, 5],
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l1', 'l2']
}

lr_grid = GridSearchCV(lr_pipeline, lr_params, cv=5, scoring='accuracy', n_jobs=-1)
lr_grid.fit(X_train, y_train)

print("Best Logistic Regression params:", lr_grid.best_params_)
print("Logistic Regression test performance:")
print(classification_report(y_test, lr_grid.predict(X_test)))


Best Logistic Regression params: {'clf__C': 10, 'clf__penalty': 'l2', 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}
Logistic Regression test performance:
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      5962
           1       0.83      0.86      0.85      4381

    accuracy                           0.87     10343
   macro avg       0.86      0.87      0.86     10343
weighted avg       0.87      0.87      0.87     10343



In [None]:
# -------------------------------
# 2. Support Vector Classifier
# -------------------------------
svc_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

svc_params = {
    'tfidf__ngram_range': [(1,1), (1,2),(1,3)],
    'tfidf__min_df': [1, 2, 5],
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale', 'auto']
}

svc_grid = GridSearchCV(svc_pipeline, svc_params, cv=5, scoring='accuracy', n_jobs=-1)
svc_grid.fit(X_train, y_train)

print("Best SVC params:", svc_grid.best_params_)
print("SVC test performance:")
print(classification_report(y_test, svc_grid.predict(X_test)))


