In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [2]:
# Read the cleaned dataset from its CSV file into a DataFrame.
csv_path = "../data/cleaned_fake_jobs.csv"
df = pd.read_csv(csv_path)

In [3]:
# Keep only rows whose "cleaned_desc" is present and not blank after
# stripping whitespace, then renumber the index from 0.
df = (
    df.dropna(subset=["cleaned_desc"])
    .loc[lambda frame: frame["cleaned_desc"].str.strip() != ""]
    .reset_index(drop=True)
)


In [4]:
# Pull the description text and the "fraudulent" column out as plain lists.
texts, labels = (df[column].tolist() for column in ("cleaned_desc", "fraudulent"))

In [5]:
# TF-IDF encoding (ABACUS): fit the vectorizer on the texts, limited to
# max_features=1000, and convert the result to a dense array.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(texts)
X = tfidf_matrix.toarray()

In [6]:
# Fix the feature width at 250 columns while keeping the TF-IDF weights as
# floats. pad_sequences defaults to dtype="int32"; with that default every
# fractional TF-IDF weight is cast to an integer (almost always 0), which
# wipes out the features. Passing dtype="float32" preserves the weights.
# NOTE(review): truncating="post" with maxlen=250 keeps only the first 250
# TF-IDF columns and drops the rest -- confirm this is intended rather than
# fitting the vectorizer with a 250-term vocabulary.
X_padded = pad_sequences(
    X, maxlen=250, dtype="float32", padding="post", truncating="post"
)
# Labels as a NumPy array, one entry per row of X_padded.
y = np.array(labels)


In [7]:
# Sanity check: the feature matrix and the label vector must have the same
# number of rows.
n_feature_rows, n_label_rows = X_padded.shape[0], y.shape[0]
assert n_feature_rows == n_label_rows, "Mismatch in input/output sizes!"

In [8]:
# Persist the feature matrix and the labels as .npy files.
for npy_path, array in (
    ("../data/abacus_features.npy", X_padded),
    ("../data/abacus_labels.npy", y),
):
    np.save(npy_path, array)

In [9]:
# Pickle the fitted TF-IDF vectorizer to disk (the file is named
# tokenizer.pkl, but the object stored is `vectorizer`).
with open("../model/tokenizer.pkl", mode="wb") as pkl_file:
    pkl_file.write(pickle.dumps(vectorizer))

print("ABACUS matrix and tokenizer saved.")

ABACUS matrix and tokenizer saved.
