In [None]:
import numpy as np
import tensorflow as tf
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import pandas as pd
# Load Dataset
data = pd.read_csv("fake_job_postings.csv")  # Replace with your dataset path

# Preprocessing
text_columns = ['title', 'company_profile', 'description', 'requirements', 'benefits']
data[text_columns] = data[text_columns].fillna('Missing')
data['combined_text'] = data[text_columns].agg(' '.join, axis=1)
# Assuming 'data' is your DataFrame and 'text_columns' is defined
data['combined_text'] = data[text_columns].astype(str).agg(' '.join, axis=1)

# Your code to access 'combined_text' should work now:
X = data['combined_text'].values
y = data['fraudulent'].values

# Assuming you have 'data' DataFrame with 'combined_text' and 'fraudulent' columns
X = data['combined_text'].values
y = data['fraudulent'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Feature Extraction (TF-IDF for traditional ML models)
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Handling Class Imbalance (SMOTE)
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

# Traditional ML Models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    print("-" * 20)



Training Logistic Regression...
Logistic Regression Accuracy: 0.9768
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3403
           1       0.70      0.90      0.79       173

    accuracy                           0.98      3576
   macro avg       0.85      0.94      0.89      3576
weighted avg       0.98      0.98      0.98      3576

--------------------
Training Random Forest...
Random Forest Accuracy: 0.9832
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3403
           1       0.98      0.66      0.79       173

    accuracy                           0.98      3576
   macro avg       0.98      0.83      0.89      3576
weighted avg       0.98      0.98      0.98      3576

--------------------
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9874
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3403
           1       0.95      0.78      0.86       173

    accuracy                           0.99      3576
   macro avg       0.97      0.89      0.93      3576
weighted avg       0.99      0.99      0.99      3576

--------------------


In [None]:
# ===================== BiLSTM Model =====================

# Tokenization and Padding
max_words = 5000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# Resampling with SMOTE
X_train_seq_resampled, y_train_resampled = smote.fit_resample(X_train_seq, y_train)

# Define BiLSTM Model
bilstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
bilstm_model.fit(X_train_seq_resampled, y_train_resampled, epochs=5, batch_size=32, validation_data=(X_test_seq, y_test))

# Evaluate BiLSTM
y_pred_bilstm = (bilstm_model.predict(X_test_seq) > 0.5).astype("int32")
accuracy_bilstm = accuracy_score(y_test, y_pred_bilstm)
print(f"BiLSTM Accuracy: {accuracy_bilstm:.4f}")
print(classification_report(y_test, y_pred_bilstm))





Epoch 1/5
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m434s[0m 498ms/step - accuracy: 0.8940 - loss: 0.2653 - val_accuracy: 0.9519 - val_loss: 0.1568
Epoch 2/5
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m418s[0m 491ms/step - accuracy: 0.9813 - loss: 0.0648 - val_accuracy: 0.9720 - val_loss: 0.1106
Epoch 3/5
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m444s[0m 494ms/step - accuracy: 0.9820 - loss: 0.0598 - val_accuracy: 0.9687 - val_loss: 0.1225
Epoch 4/5
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m441s[0m 493ms/step - accuracy: 0.9877 - loss: 0.0428 - val_accuracy: 0.9664 - val_loss: 0.1189
Epoch 5/5
[1m851/851[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 493ms/step - accuracy: 0.9919 - loss: 0.0253 - val_accuracy: 0.9609 - val_loss: 0.1391
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 127ms/step
BiLSTM Accuracy: 0.9609
              precision    recall  f1-score   support

           0      

In [None]:
# ===================== BERT Model =====================

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenizing Text for BERT
X_train_encodings = bert_tokenizer(list(X_train), truncation=True, padding=True, max_length=200, return_tensors="pt")
X_test_encodings = bert_tokenizer(list(X_test), truncation=True, padding=True, max_length=200, return_tensors="pt")

train_dataset = TensorDataset(X_train_encodings['input_ids'], torch.tensor(y_train))
test_dataset = TensorDataset(X_test_encodings['input_ids'], torch.tensor(y_test))

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Load Pretrained BERT Model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_model.to(device)

# Define Optimizer and Loss
optimizer = AdamW(bert_model.parameters(), lr=2e-5)

# Training Loop
epochs = 3
bert_model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        outputs = bert_model(input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

# Evaluate BERT
bert_model.eval()
predictions = []
with torch.no_grad():
    for batch in test_dataloader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        outputs = bert_model(input_ids)
        preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        predictions.extend(preds)

accuracy_bert = accuracy_score(y_test, predictions)
print(f"BERT Accuracy: {accuracy_bert:.4f}")
print(classification_report(y_test, predictions))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-1b9e95072fbb>", line 32, in <cell line: 0>
    outputs = bert_model(input_ids, labels=labels)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/models/bert/modeling_bert.py", line 1665, in forward
    outputs = self.bert(
              ^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1736, i

TypeError: object of type 'NoneType' has no len()

In [None]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from scipy.stats import mode
# Predictions for traditional models
y_pred_log = log_reg.predict(X_test_tfidf)
y_pred_rf = random_forest.predict(X_test_tfidf)
y_pred_xgb = xgboost.predict(X_test_tfidf)

# BiLSTM Predictions
y_pred_bilstm = (bilstm_model.predict(X_test_seq) > 0.5).astype("int32").flatten()
#Bert Prediction
y_pred_bert = np.array(predictions)

# ===================== Calculate Model Accuracies =====================
acc_log = accuracy_score(y_test, y_pred_log)
acc_rf = accuracy_score(y_test, y_pred_rf)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
acc_bilstm = accuracy_score(y_test, y_pred_bilstm)
acc_bert = accuracy_score(y_test, y_pred_bert)

# Print individual model accuracies
print(f"Logistic Regression Accuracy: {acc_log:.4f}")
print(f"Random Forest Accuracy: {acc_rf:.4f}")
print(f"XGBoost Accuracy: {acc_xgb:.4f}")
print(f"BiLSTM Accuracy: {acc_bilstm:.4f}")
print(f"BERT Accuracy: {acc_bert:.4f}")

# Assign weights based on accuracy (Higher Accuracy = Higher Weight)
weights = np.array([acc_log, acc_rf, acc_xgb, acc_bilstm, acc_bert])

# Normalize weights to sum to 1
weights = weights / weights.sum()

# Combine predictions into a matrix
ensemble_predictions = np.array([
    y_pred_log,
    y_pred_rf,
    y_pred_xgb,
    y_pred_bilstm,
    y_pred_bert
])

# ===================== Weighted Voting =====================
# Weighted Sum Voting
weighted_votes = np.zeros_like(y_pred_log, dtype=float)

for i, model_preds in enumerate(ensemble_predictions):
    weighted_votes += model_preds * weights[i]

# Convert weighted predictions to final binary predictions (Threshold = 0.5)
final_predictions = (weighted_votes >= 0.5).astype(int)

# ===================== Final Evaluation =====================
final_accuracy = accuracy_score(y_test, final_predictions)
print("\n🔹 **Weighted Ensemble Model Accuracy:** {:.4f}".format(final_accuracy))
print("\n🔹 **Classification Report:**\n", classification_report(y_test, final_predictions))

# Print Final Ensemble Predictions
print("\n🔹 **Final Ensemble Predictions:**\n", final_predictions)


NameError: name 'log_reg' is not defined