In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [4]:
# Read the cleaned dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_fake_jobs.csv")


In [5]:
# Print (row count, column count) as plain text, then let the notebook render
# the first five rows as the cell's output.
print((len(df.index), len(df.columns)))
df.head(n=5)

(17880, 2)


Unnamed: 0,cleaned_desc,fraudulent
0,foodafastgrowingjamesbeardawardwinningonlinefo...,0
1,organisedfocusedvibrantawesomedoyouhaveapassio...,0
2,ourclientlocatedinhoustonisactivelyseekinganex...,0
3,thecompanyesrienvironmentalsystemsresearchinst...,0
4,jobtitleitemizationreviewmanagerlocationfortwo...,0


In [6]:
# Keep only the rows whose 'cleaned_desc' value is present (not NaN/None).
df = df[df['cleaned_desc'].notna()]

In [7]:
# Discard rows whose description is an empty string once leading/trailing
# whitespace has been stripped.
df = df.loc[df['cleaned_desc'].str.strip().ne('')]

In [8]:
# Renumber the rows 0..n-1 after the filtering above; drop=True throws the old
# index labels away instead of inserting them as a new column.
df = df.reset_index(drop=True)

In [9]:
# Pull the description column and the label column out as plain Python lists;
# both come from the same frame, so position i in each refers to the same row.
texts, labels = (df[column].tolist() for column in ('cleaned_desc', 'fraudulent'))

In [10]:
# Fit a TF-IDF vocabulary capped at 1000 terms on the descriptions and encode
# every description as one row of the resulting matrix.
# NOTE(review): the df.head() preview shows descriptions with no spaces between
# words — confirm the default tokenization still yields meaningful terms.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(texts)
# Densify the sparse result so it can be handled as a regular 2-D NumPy array.
X = X.toarray()

In [11]:
# Force every TF-IDF row to exactly 250 columns: rows longer than 250 are cut
# at the end (truncating='post'), shorter rows are zero-filled at the end
# (padding='post').
# dtype must be a float type here: pad_sequences defaults to dtype='int32',
# which casts the fractional TF-IDF weights to integers and turns almost every
# value into 0.
# NOTE(review): X is built with max_features=1000, so this keeps only the first
# 250 vocabulary columns and discards the rest — confirm that is intended.
X_padded = pad_sequences(X, maxlen=250, dtype='float32', padding='post', truncating='post')

In [12]:
# Convert the Python list of labels into a NumPy array.
y = np.asarray(labels)

In [13]:
# Guard: the feature matrix and the label vector must have the same number of
# samples (first dimension) before anything is written to disk.
assert len(X_padded) == len(y), f"Mismatch: X={X_padded.shape}, y={y.shape}"

In [14]:
# Persist the feature matrix and the label vector as .npy files under ../data.
np.save(file="../data/abacus_features.npy", arr=X_padded)
np.save(file="../data/abacus_labels.npy", arr=y)

# Confirmation message for the notebook output.
print(" Saved ABACUS input matrix and labels.")

 Saved ABACUS input matrix and labels.


In [15]:
# Echo the shapes of the saved feature matrix and label vector, in that order.
print("ABACUS matrix and labels saved with matching shapes:", *(arr.shape for arr in (X_padded, y)))

ABACUS matrix and labels saved with matching shapes: (17878, 250) (17878,)


In [16]:
import os
import pickle

# Make sure the target directory exists first: open(..., "wb") does not create
# missing parent directories and would raise FileNotFoundError otherwise.
os.makedirs("../model", exist_ok=True)

# Save the fitted TF-IDF vectorizer (stored under the name "tokenizer") so the
# same fitted object can be reloaded later.
with open("../model/tokenizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("Tokenizer (TF-IDF vectorizer) saved at model/tokenizer.pkl")


Tokenizer (TF-IDF vectorizer) saved at model/tokenizer.pkl


In [17]:
# 02_generate_abacus_matrix.ipynb

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
import pickle

# End-to-end pipeline: load the cleaned CSV, drop unusable rows, build TF-IDF
# features, force them to 250 columns, and save the features, labels and the
# fitted vectorizer to disk.

# Load cleaned data
df = pd.read_csv("../data/cleaned_fake_jobs.csv")
print("Original shape:", df.shape)

# Drop rows whose description is missing or blank
df = df.dropna(subset=['cleaned_desc'])
df = df[df['cleaned_desc'].str.strip() != '']
df = df.reset_index(drop=True)  # renumber rows 0..n-1 after filtering
print("Cleaned shape:", df.shape)

# Extract text and labels (same row order in both lists)
texts = df['cleaned_desc'].tolist()
labels = df['fraudulent'].tolist()

# Convert to dense TF-IDF features (vocabulary capped at 1000 terms)
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(texts).toarray()

# Pad/truncate to shape (num_samples, 250).
# dtype must be a float type: pad_sequences defaults to dtype='int32', which
# casts the fractional TF-IDF weights to integers and turns almost every value
# into 0.
# NOTE(review): with max_features=1000 this keeps only the first 250 vocabulary
# columns and discards the rest — confirm that is intended.
X_padded = pad_sequences(X, maxlen=250, dtype='float32', padding='post', truncating='post')

# Save arrays
y = np.array(labels)
assert X_padded.shape[0] == y.shape[0], "Shape mismatch between X and y"

os.makedirs("../data", exist_ok=True)
np.save("../data/abacus_features.npy", X_padded)
np.save("../data/abacus_labels.npy", y)

# Save the TF-IDF vectorizer for later use
os.makedirs("../model", exist_ok=True)
with open("../model/tokenizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("✅ ABACUS matrix and labels saved.")
print("✅ Tokenizer saved as tokenizer.pkl.")
print("📐 X shape:", X_padded.shape, "| y shape:", y.shape)


Original shape: (17880, 2)
Cleaned shape: (17878, 2)
✅ ABACUS matrix and labels saved.
✅ Tokenizer saved as tokenizer.pkl.
📐 X shape: (17878, 250) | y shape: (17878,)
