In [6]:
import os
# Ensure the model artifact directory exists; exist_ok=True makes re-running this cell safe.
os.makedirs("app/model", exist_ok=True)


In [9]:
import os
import pandas as pd
import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from scipy.sparse import hstack, csr_matrix

# 📥 Read the processed dataset and separate the target from the features
data = pd.read_csv("../data/processed/balanced_data.csv")
y = data["fraudulent"]
X = data.drop(columns=["fraudulent"])

# 🧪 Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
# NOTE(review): if the balancing behind balanced_data.csv was done by oversampling
# before this split, near-duplicate rows may sit on both sides of it and inflate
# the test metrics — confirm how the file was produced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 🔍 Identify columns
# Free-text columns get their own TF-IDF vectorizer; only the candidates that
# are actually present in the frame are kept.
# NOTE(review): columns such as 'title' or 'all_text' are not candidates here, so
# if they are object dtype they end up in categorical_cols and are one-hot
# encoded — confirm that this is intended.
possible_text_cols = ['description', 'company_profile', 'requirements', 'benefits']
text_cols = [col for col in possible_text_cols if col in X.columns]
non_text_cols = [col for col in X.columns if col not in text_cols]

# Numeric and Categorical
# Routing is purely dtype-based: int64/float64 columns go to the numeric branch,
# object columns go to the categorical branch.
numeric_cols = X_train[non_text_cols].select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X_train[non_text_cols].select_dtypes(include=['object']).columns.tolist()

# A column with any other dtype (bool, int32, category, datetime, ...) matches
# neither list and therefore never reaches the preprocessor. Report such columns
# instead of losing them silently.
unrouted_cols = [
    col for col in non_text_cols
    if col not in numeric_cols and col not in categorical_cols
]
if unrouted_cols:
    print(f"⚠️ Columns ignored (unsupported dtype): {unrouted_cols}")

# 🧼 Pipelines
# Numeric branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array. Categories not seen during fit are ignored
# at transform time instead of raising.
cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Send each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer(transformers=[
    ("num", num_pipeline, numeric_cols),
    ("cat", cat_pipeline, categorical_cols),
])

# Fit on the training split only, then reuse the fitted statistics for the test
# split so nothing from the test rows influences the preprocessing.
print("🔧 Preprocessing tabular data...")
X_train_tabular = preprocessor.fit_transform(X_train[non_text_cols])
X_test_tabular = preprocessor.transform(X_test[non_text_cols])

# 🧠 TF-IDF on text columns
# One vectorizer per text column, fitted on the training split only and then
# applied to the test split.
print("🔧 TF-IDF vectorization...")
text_vectors_train = []
text_vectors_test = []
tfidf_vectorizers = {}  # column name -> fitted TfidfVectorizer

for col in text_cols:
    tfidf = TfidfVectorizer(max_features=200)  # cap the vocabulary at 200 terms per column
    # Missing text is treated as an empty document.
    vec_train = tfidf.fit_transform(X_train[col].fillna(""))
    vec_test = tfidf.transform(X_test[col].fillna(""))
    text_vectors_train.append(vec_train)
    text_vectors_test.append(vec_test)
    tfidf_vectorizers[col] = tfidf

# hstack fails on an empty block list, so when none of the candidate text
# columns exist in the data fall back to a zero-column matrix instead of crashing.
if text_vectors_train:
    X_train_text = hstack(text_vectors_train)
    X_test_text = hstack(text_vectors_test)
else:
    X_train_text = csr_matrix((X_train.shape[0], 0))
    X_test_text = csr_matrix((X_test.shape[0], 0))

# 🔗 Final combined features
# Column order: preprocessed tabular features first, then one TF-IDF block per
# text column in text_cols order. Stacking the individual blocks directly yields
# the same matrix as stacking the pre-joined text matrix, and also works when
# there are no text blocks at all.
X_train_final = hstack([csr_matrix(X_train_tabular)] + text_vectors_train)
X_test_final = hstack([csr_matrix(X_test_tabular)] + text_vectors_test)

# 🤖 Base Models (Optimized for Speed)
# Two small gradient-boosted ensembles (100 rounds, depth 4 each) serve as the
# first-level learners of the stack.
print("🚀 Building models...")
# `use_label_encoder` is intentionally not passed: the argument is deprecated in
# XGBoost and does nothing in current releases other than trigger a warning.
xgb = XGBClassifier(eval_metric='logloss', n_estimators=100, max_depth=4)
cat = CatBoostClassifier(verbose=0, iterations=100, depth=4)

# 🔼 Meta model
# Logistic regression combines the base models' predictions; max_iter is set
# above scikit-learn's default of 100 to give the solver room to converge.
lr = LogisticRegression(max_iter=1000)

# 🤝 Stacking without passthrough
# passthrough=False: the meta model is trained on the base models' predictions
# only, not on the original feature matrix. n_jobs=-1 lets scikit-learn use all
# available cores when fitting the base estimators.
stack_model = StackingClassifier(
    estimators=[('xgb', xgb), ('cat', cat)],
    final_estimator=lr,
    passthrough=False,
    n_jobs=-1
)

# 🚀 Fit the stacked ensemble on the combined training matrix
print("🧠 Training stacking model...")
stack_model.fit(X_train_final, y_train)

# 🧪 Score the fitted model on the held-out split
y_pred = stack_model.predict(X_test_final)
print("\n📊 Classification Report:\n")
print(classification_report(y_true=y_test, y_pred=y_pred))
print("✅ Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

# 💾 Persist everything needed to reproduce the feature matrix and predictions:
# the fitted stack, the tabular preprocessor and the per-column TF-IDF vectorizers.
# NOTE(review): all paths are relative to the notebook's working directory. The
# dataset is read from ../data, so confirm that app/model is meant to live next
# to the notebook rather than at the project root.
print("\n💾 Saving models...")
os.makedirs("app/model", exist_ok=True)
for artifact, target_path in (
    (stack_model, "app/model/stacking_model.pkl"),
    (preprocessor, "app/model/preprocessor.pkl"),
    (tfidf_vectorizers, "app/model/tfidf_vectorizers.pkl"),
):
    joblib.dump(artifact, target_path)
print("✅ Models saved to 'app/model/'")


🔧 Preprocessing tabular data...
🔧 TF-IDF vectorization...
🚀 Building models...
🧠 Training stacking model...

📊 Classification Report:

              precision    recall  f1-score   support

           0       0.98      0.99      0.99       453
           1       0.99      0.98      0.99       453

    accuracy                           0.99       906
   macro avg       0.99      0.99      0.99       906
weighted avg       0.99      0.99      0.99       906

✅ Accuracy: 0.9867549668874173

💾 Saving models...
✅ Models saved to 'app/model/'


In [6]:
# Show the feature columns of the training split (the target was removed earlier).
print(list(X_train.columns))


['title', 'location', 'department', 'description', 'requirements', 'employment_type', 'required_experience', 'required_education', 'industry', 'function', 'all_text', 'textblob_polarity', 'vader_neg', 'vader_neu', 'vader_pos', 'vader_compound', 'keyword_money', 'keyword_earn', 'keyword_click', 'keyword_investment', 'keyword_urgent', 'keyword_opportunity', 'keyword_work from home']


In [2]:
# Save the list of features used in training
# The column order of X_train is written as a JSON array.
import json
with open("app/model/features.json", "w") as f:
    json.dump(list(X_train.columns), f)

# Save label encoders if any were used
# No cell visible in this notebook defines `label_encoders`, so on a fresh kernel
# the bare name raises NameError. Reuse the object when an earlier cell created
# it; otherwise persist an empty mapping so the cell still runs and the artifact
# file is still produced.
label_encoders = globals().get("label_encoders", {})
joblib.dump(label_encoders, "app/model/label_encoders.pkl")


['app/model/label_encoders.pkl']