In [3]:
import data

# Load the pre-split dataset from the `data` module.
# X_train / X_test are passed unchanged to CountVectorizer / TfidfVectorizer in later cells,
# so they are presumably iterables of raw text documents — TODO confirm against data.load_data.
# NOTE(review): label_dict is not referenced again in the cells visible here.
X_train, X_test, y_train, y_test, label_dict = data.load_data()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE # type: ignore

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout, Input # type: ignore

def evaluate_model(y_test, y_pred):
    """Print the classification report followed by the overall accuracy.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        None. Both metrics are written to stdout only.
    """
    report_text = classification_report(y_test, y_pred)
    print(report_text)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

def build_ann(input_dim):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.5)
    -> Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    # NOTE(review): this notebook defines build_ann a second time further down;
    # the later definition shadows this one once its cell has run.
    model = Sequential([
        # Declare the input with an explicit Input layer instead of the legacy
        # `input_dim=` keyword on the first Dense layer.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # Single sigmoid unit + binary cross-entropy: targets must be 0/1.
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


In [None]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

# Baseline: uni-/bi-gram counts -> SMOTE-balanced training set -> Random Forest.
vectorizer = CountVectorizer(ngram_range=(1, 2))
# The vocabulary is learned from the training text only; the test text is just transformed.
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# Resample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_vec, y_train)

clf = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test_vec)

evaluate_model(y_test, y_pred)

In [None]:
from xgboost import XGBClassifier

# TF-IDF (uni-/bi-grams) -> SMOTE-balanced training set -> XGBoost.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# IDF weights are learned from the training text only; the test text is just transformed.
X_train_vec, X_test_vec = vectorizer.fit_transform(X_train), vectorizer.transform(X_test)

# Resample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_vec, y_train)

# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm against the installed version.
clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss').fit(X_resampled, y_resampled)
y_pred = clf.predict(X_test_vec)

evaluate_model(y_test, y_pred)

In [None]:
from xgboost import XGBClassifier

# NOTE(review): this cell runs the same TF-IDF -> SMOTE -> XGBoost pipeline as the
# cell directly before it, with identical parameters. It looks like an accidental
# copy (or a placeholder for a different model) — confirm and drop or repurpose.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)   # fit on training text only
X_test_vec = vectorizer.transform(X_test)         # reuse the fitted vocabulary / IDF

# Balance the training classes; the test split is not resampled.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_vec, y_train)
X_resampled, y_resampled = resampled_pair

clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
clf.fit(X_resampled, y_resampled)

y_pred = clf.predict(X_test_vec)
evaluate_model(y_test, y_pred)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

from scipy.sparse import csr_matrix

In [None]:
# Grid: every vectorizer x every resampling strategy x every classifier.
vectorizers = {
    'CountVectorizer': CountVectorizer(ngram_range=(1, 2)),
    'TfidfVectorizer': TfidfVectorizer(ngram_range=(1, 2))
}

# (vectorizer name, resampling name, classifier name) -> classification_report dict
results = {}

for name, vectorizer in vectorizers.items():
    print(f"Using {name}")
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # LightGBM is fed float32 CSR matrices. The cast goes into its own variable
    # (computed once per vectorizer) so the other classifiers keep receiving
    # the vectorizer's native output.
    X_test_lgbm = csr_matrix(X_test_vec.astype('float32'))

    # Resamplers are applied lazily inside the loop so only one resampled copy
    # of the training matrix is alive at a time. None means "use the data as is".
    resamplers = {
        'Original': None,
        'SMOTE': SMOTE(random_state=42),
        'Undersample': RandomUnderSampler(random_state=42),
        'SMOTE+Tomek': SMOTETomek(random_state=42)
    }

    classifiers = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'LightGBM': LGBMClassifier(random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000)
    }

    for method_name, resampler in resamplers.items():
        print(f"Resampling: {method_name}")
        if resampler is None:
            X_resampled, y_resampled = X_train_vec, y_train
        else:
            # Only the training split is resampled; the test split is never touched.
            X_resampled, y_resampled = resampler.fit_resample(X_train_vec, y_train)

        for clf_name, clf in classifiers.items():
            print(f"Training: {clf_name}")
            if clf_name == 'LightGBM':
                # Branch-local names: do NOT rebind X_resampled / X_test_vec here,
                # otherwise the cast leaks into the classifiers and resampling
                # methods processed afterwards.
                X_fit = csr_matrix(X_resampled.astype('float32'))
                X_eval = X_test_lgbm
            else:
                X_fit, X_eval = X_resampled, X_test_vec

            # The same estimator instance is re-fitted for every resampling method.
            clf.fit(X_fit, y_resampled)
            y_pred = clf.predict(X_eval)
            report = classification_report(y_test, y_pred, output_dict=True)
            results[(name, method_name, clf_name)] = report

def build_ann(input_dim):
    """Build and compile a small feed-forward binary classifier.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.5)
    -> Dense(64, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    # NOTE(review): build_ann is also defined in an earlier cell of this notebook;
    # this later definition is the one in effect for the code that follows.
    model = Sequential([
        # Declare the input with an explicit Input layer instead of the legacy
        # `input_dim=` keyword on the first Dense layer.
        Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(64, activation='relu'),
        Dropout(0.5),
        # Single sigmoid unit + binary cross-entropy: targets must be 0/1.
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# --- ANN on TF-IDF features -------------------------------------------------
# Refit the TF-IDF vectorizer on the training text and project the test text.
tfidf = vectorizers['TfidfVectorizer']
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

# Carve out a stratified validation set of REAL samples before oversampling.
# Keras' `validation_split` takes the last rows of the arrays it is given, and
# SMOTE output places the synthetic rows after the original ones, so the former
# `validation_split=0.2` validated almost entirely on synthetic samples.
X_fit_vec, X_val_vec, y_fit, y_val = train_test_split(
    X_train_vec, y_train, test_size=0.2, stratify=y_train, random_state=42
)

# Oversample only the part the network is actually trained on.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_fit_vec, y_fit)

ann = build_ann(X_resampled.shape[1])
# NOTE(review): .toarray() densifies the uni-/bi-gram TF-IDF matrices; memory use
# grows with vocabulary size — confirm this fits for the real dataset.
history = ann.fit(
    X_resampled.toarray(), y_resampled,
    validation_data=(X_val_vec.toarray(), y_val),
    epochs=10, batch_size=32
)

# Sigmoid outputs come back as an (n, 1) column; threshold at 0.5 and flatten
# to 1-D so the predictions line up with y_test.
y_pred_ann = (ann.predict(X_test_vec.toarray()) > 0.5).astype(int).ravel()
print(classification_report(y_test, y_pred_ann))

# Learning curves: training vs. held-out validation loss per epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set(title='ANN training history', xlabel='Epoch', ylabel='Binary cross-entropy loss')
ax.legend()
plt.show()