In [None]:
import pandas as pd
 
# Read the training set into a DataFrame (edit the path to match your setup)
df = pd.read_csv(filepath_or_buffer='fraud_jobs_train.csv')

# Preview the first 5 rows
df.head(n=5)

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Keep only the rows whose description is present
df = df.dropna(axis=0, how='any', subset=['description'])

In [None]:
# Replace missing values in these free-text columns with empty strings,
# all three columns in a single vectorized assignment
text_columns = ['company_profile', 'benefits', 'requirements']
df[text_columns] = df[text_columns].fillna('')

In [None]:
# Build the single text field that is later fed to the vectorizer.
# `title` is not among the columns that were NaN-filled above, and plain `+`
# concatenation would turn the whole row into NaN if any part were missing.
# `str.cat(..., na_rep='')` treats a missing part as an empty string instead,
# and produces the same "a b c d e" result when nothing is missing.
df['text'] = df['title'].str.cat(
    df[['company_profile', 'description', 'requirements', 'benefits']],
    sep=' ',
    na_rep='',
)

In [None]:
# Inspect the combined text of the first row
df['text'].iat[0]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
 
# TF-IDF vectorizer: drop English stop words, keep at most 5000 terms
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)

# Learn the vocabulary from the combined text and encode it numerically.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split further down, so test-set text influences the vocabulary and weights.
X = vectorizer.fit_transform(df['text'])

In [None]:
# Target vector: the `fraudulent` column
y = df.loc[:, 'fraudulent']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
# Hold out 20% of the rows for testing, stratified on y, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Baseline random forest: 100 trees, balanced class weights, fixed seed
model = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# Score the held-out set with the baseline model
y_proba = model.predict_proba(X_test)[:, 1]  # Fraud probability
y_pred = model.predict(X_test)

# Report F1, the per-class summary and the confusion matrix
print("F1 Score:", f1_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
import pandas as pd
 
# Densify the sparse test matrix into a DataFrame with one column per
# vectorizer feature name
X_test_dense = pd.DataFrame(
    data=X_test.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# Append the model outputs as two extra columns
X_test_dense["predicted_label"] = y_pred
X_test_dense["fraud_probability"] = y_proba

# Write everything to disk without the row index
X_test_dense.to_csv("predictions.csv", index=False)

X_test_dense.head()

In [None]:
from sklearn.model_selection import GridSearchCV
 
# Hyper-parameter combinations to try for the random forest
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20, 50],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    class_weight=['balanced'],
)

# 3-fold cross-validated search scored on F1, with n_jobs=-1 for parallelism
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
)

# Fit on the training split and keep the best estimator found
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

In [None]:
# Re-score the held-out set with the tuned model. Refresh BOTH the labels and
# the fraud probabilities: later cells combine `y_pred` and `y_proba` in one
# table, and leaving `y_proba` untouched would pair labels from `best_model`
# with stale probabilities from the baseline `model`.
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]  # Fraud probability

f1_score(y_test, y_pred)

In [None]:
# Print F1, the classification report and the confusion matrix for the
# current predictions, in that order
for heading, metric in (
    ("F1 Score:", f1_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

In [None]:
# Keep the row labels of df (as they are after the earlier dropna) so that
# rows selected by a split can be looked up in df again with .loc
original_indices = df.index

In [None]:
from sklearn.model_selection import train_test_split
 
# Repeat the split with the same seed and stratification, passing the row
# labels as a third array so each test row can be traced back to df
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X,
    y,
    original_indices,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Results table: posting details of the test rows next to the model outputs
final_df = pd.DataFrame(
    {
        "Job_Title": df.loc[idx_test, "title"].values,
        "Company": df.loc[idx_test, "company_profile"].values,
        "Description": df.loc[idx_test, "description"].values,
        "Fraud_Probability": y_proba,
        "Predicted_Label": y_pred,
    }
)

In [None]:
# Preview the first 5 rows of the results table
final_df.head(n=5)

In [None]:
# Export the results table, leaving out the row index
final_df.to_csv(path_or_buf="job_fraud_predictions.csv", index=False)

In [None]:
!pip install wordcloud

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:
# Bar chart of how many test postings were predicted real vs. fraudulent.
# `hue` mirrors `x` because seaborn deprecated passing `palette` without `hue`.
# `order` / `hue_order` pin class 0 to the first (green) bar and class 1 to the
# second (red) bar, so the hard-coded tick labels below stay correct even if
# one of the classes is never predicted.
plt.figure(figsize=(6, 4))
sns.countplot(
    data=final_df,
    x="Predicted_Label",
    hue="Predicted_Label",
    order=[0, 1],
    hue_order=[0, 1],
    palette=["green", "red"],
    legend=False,
)
plt.title("Predicted Job Posting Types")
plt.xlabel("Label (0 = Real, 1 = Fraud)")
plt.ylabel("Count")
plt.xticks([0, 1], ["Real", "Fraudulent"])
plt.show()

In [None]:
# Count of each predicted label, coloured per label, with the legend hidden
sns.countplot(
    x="Predicted_Label",
    hue="Predicted_Label",
    data=final_df,
    legend=False,
    palette=["green", "red"],
)

In [None]:
# The ten test postings with the highest predicted fraud probability
top_frauds = final_df.sort_values(by="Fraud_Probability", ascending=False).head(10)

# Horizontal bars, one per job title. `hue` mirrors `y` (with the legend
# hidden) because seaborn deprecated passing `palette` without `hue`; the
# per-title colouring is unchanged.
plt.figure(figsize=(10, 6))
sns.barplot(
    data=top_frauds,
    x="Fraud_Probability",
    y="Job_Title",
    hue="Job_Title",
    palette="Reds_r",
    legend=False,
)
plt.title("Top 10 Most Suspicious Job Postings")
plt.xlabel("Fraud Probability")
plt.ylabel("Job Title")
plt.show()

In [None]:
# Postings the model labelled as fraudulent
fraud_jobs = final_df[final_df["Predicted_Label"] == 1]

# Join all of their descriptions into one big string
text = " ".join(fraud_jobs["Description"].dropna().astype(str).values)

# WordCloud.generate raises a ValueError when it is given no words, which is
# what happens if no posting was labelled fraudulent; skip the plot then.
if text.strip():
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Common Words in Fraudulent Job Descriptions")
    plt.show()
else:
    print("No postings were predicted as fraudulent; skipping the word cloud.")

In [None]:
from sklearn.metrics import confusion_matrix
 
# Confusion matrix of the current predictions against the true test labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Annotated heatmap: rows are the actual class, columns the predicted class
plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=["Real", "Fraud"],
    yticklabels=["Real", "Fraud"],
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
import joblib
 
# Persist the fitted artifacts for later inference. Save the tuned
# `best_model` chosen by the grid search rather than the baseline `model`,
# since the tuned one produced the final reported predictions; the fitted
# vectorizer is saved alongside it so new text can be encoded the same way.
joblib.dump(best_model, "random_forest_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")