In [6]:
!pip install requests streamlit
""" requests
streamlit
geopandas==1.1.1
gradio==5.49.1
gradio_client==1.13.3
numpy==2.0.2
pandas==2.2.2
pandas-datareader==0.10.0
pandas-gbq==0.30.0
pandas-stubs==2.2.2.240909
requests==2.32.4
requests-oauthlib==2.0.0
requests-toolbelt==1.0.0
scikit-learn==1.6.1
sklearn-pandas==2.2.0
xgboost==3.1.1
"""

Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.2/10.2 MB 61.2 MB/s eta 0:00:00
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.9/6.9 MB 57.5 MB/s eta 0:00:00
Installing collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.51.0


' requests\nstreamlit\ngeopandas==1.1.1\ngradio==5.49.1\ngradio_client==1.13.3\nnumpy==2.0.2\npandas==2.2.2\npandas-datareader==0.10.0\npandas-gbq==0.30.0\npandas-stubs==2.2.2.240909\nrequests==2.32.4\nrequests-oauthlib==2.0.0\nrequests-toolbelt==1.0.0\nscikit-learn==1.6.1\nsklearn-pandas==2.2.0\nxgboost==3.1.1\n'

In [46]:

# --- 1. Define Constants and Helper Functions ---
# Directory that holds the serialized (.joblib) artifacts.
MODEL_DIR = "trained_models"

# Display name of each model -> path of its serialized file inside MODEL_DIR.
# The insertion order below is also the order in which the names are listed.
MODEL_FILES = {
    display_name: os.path.join(MODEL_DIR, file_name)
    for display_name, file_name in (
        ("Logistic Regression (Balanced)", "logistic_regression_balanced.joblib"),
        ("Random Forest (Balanced)", "random_forest_balanced.joblib"),
        ("Gradient Boosting", "gradient_boosting.joblib"),
        ("XGBoost (Scaled)", "xgboost_scaled.joblib"),
    )
}
# Path of the serialized TF-IDF vectorizer, stored next to the models.
VECTORIZER_FILE = os.path.join(MODEL_DIR, "tfidf_vectorizer.joblib")


# Example job postings for easy testing
# GOOD_EXAMPLE is a sample posting written to look legitimate (concrete role,
# requirements and benefits). It is plain text with "Job Title:", "Company:",
# "Description:", "Requirements:" and "Benefits:" sections; the text is fed to
# the app verbatim, so edits here change what the example demonstrates.
GOOD_EXAMPLE = """Job Title: Senior Data Scientist
Company: TechInnovate Solutions
Description:
We are seeking a highly skilled Senior Data Scientist to join our dynamic R&D team. You will be responsible for designing and implementing advanced machine learning models to solve complex business problems, focusing on predictive maintenance and customer churn. This role requires expertise in Python, scikit-learn, TensorFlow/PyTorch, and distributed computing (Spark). The team is collaborative, focusing on technical excellence and real-world impact.
Requirements:
- PhD or Master's degree in Computer Science, Statistics, or a related quantitative field.
- 5+ years of professional experience in data science or machine learning engineering.
- Strong communication skills to present complex findings to non-technical stakeholders.
Benefits:
Competitive salary, full health/dental/vision coverage, 401(k) matching, flexible working hours, and a generous annual learning stipend."""

# BAD_EXAMPLE is a sample posting written to look suspicious: vague duties,
# an unusually wide pay range, an up-front fee and pressure to act quickly.
# It is plain text fed to the app verbatim.
BAD_EXAMPLE = """Job Title: Online Earning Opportunity - High Pay, No Experience Needed!
Company: Global Synergy Wealth Creators
Description:
Tired of your 9-to-5? We offer a revolutionary system for high-volume income generation from the comfort of your home. No prior experience is necessary, and training is provided *free of charge*! Earn $500 to $5000 weekly just by processing simple online requests. This is not a sales job, but requires a high degree of motivation and secrecy. We just need motivated individuals ready to start immediately with minimal oversight.
Requirements:
- Must be 18 or older.
- Must provide a $99 'Administrative Setup Fee' upon acceptance to cover initial materials and background check costs.
- Must have reliable internet access and a bank account for direct deposits.
Contact us immediately to secure your spot! Limited positions available."""

# Keep this preprocessing identical to the one applied when the models were
# trained; any difference changes the features the vectorizer produces.
def clean_text(text):
    """Normalize raw posting text before it is vectorized.

    Steps, in order:
      1. Drop anything that looks like an HTML tag (``<...>``). Nothing is
         inserted in its place, so words separated only by a tag are joined.
      2. Drop every character that is not an ASCII letter, a digit or
         whitespace.
      3. Lower-case the remainder.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The normalized text.
    """
    without_tags = re.sub('<[^>]*>', '', text)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
    return alphanumeric_only.lower()

# --- 2. Load Models and Vectorizer (using Streamlit's caching) ---
# The st.cache_resource decorator is applied so that the large model files are
# loaded only once rather than on every call.
@st.cache_resource
def load_assets():
    """Load the TF-IDF vectorizer and every model listed in MODEL_FILES.

    The vectorizer path and the model-path mapping are printed before the
    corresponding files are read.

    Returns:
        A ``(vectorizer, models)`` tuple, where ``models`` maps each display
        name from MODEL_FILES to the object loaded from its file.

    On failure nothing is returned: a message is printed and ``sys.exit(1)``
    is called, both when a file is missing and on any other exception.
    """
    try:
        print(VECTORIZER_FILE)
        tfidf = joblib.load(VECTORIZER_FILE)

        print(MODEL_FILES)
        models_by_name = {
            label: joblib.load(file_path)
            for label, file_path in MODEL_FILES.items()
        }
    except FileNotFoundError as missing:
        # Report which file was absent, then stop.
        print(f"FATAL ERROR: Model file not found. Please ensure the '{MODEL_DIR}' directory with all .joblib files exists.")
        print(f"Missing file: {missing.filename}")
        sys.exit(1)
    except Exception as err:
        print(f"An unexpected error occurred while loading assets: {err}")
        sys.exit(1)
    else:
        return tfidf, models_by_name





In [47]:
# os.path.join() discards every component that precedes an absolute one, so a
# directory argument placed in front of this absolute path would contribute
# nothing to the result. Join the directory and the file name explicitly
# instead; the value is the absolute path of the vectorizer file.
VECTORIZER_FILE = os.path.join("/content/trained_models", "tfidf_vectorizer.joblib")


In [52]:
# --- 1. Define Constants and Helper Functions ---
# Absolute location of the serialized (.joblib) artifacts.
MODEL_DIR = r"/content/trained_models"

# Display name of each model -> path of its serialized file inside MODEL_DIR.
MODEL_FILES = {
    "Logistic Regression (Balanced)": os.path.join(MODEL_DIR, 'logistic_regression_balanced.joblib'),
    "Random Forest (Balanced)": os.path.join(MODEL_DIR, 'random_forest_balanced.joblib'),
    "Gradient Boosting": os.path.join(MODEL_DIR, 'gradient_boosting.joblib'),
    "XGBoost (Scaled)": os.path.join(MODEL_DIR, 'xgboost_scaled.joblib'),
}

# Recompute the vectorizer path as well: it is derived from MODEL_DIR, so
# repointing MODEL_DIR without refreshing it would leave the two constants
# referring to different directories.
VECTORIZER_FILE = os.path.join(MODEL_DIR, 'tfidf_vectorizer.joblib')

AttributeError: module 'os' has no attribute 'getc'

In [53]:
# Load the assets
# load_assets() returns a (vectorizer, models-by-display-name) tuple; unpack it
# into the two module-level names. If loading fails, load_assets() prints a
# message and calls sys.exit(1), so nothing is assigned here.
vectorizer, loaded_models = load_assets()


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



/content/trained_models/tfidf_vectorizer.joblib
{'Logistic Regression (Balanced)': '/content/trained_models/logistic_regression_balanced.joblib', 'Random Forest (Balanced)': '/content/trained_models/random_forest_balanced.joblib', 'Gradient Boosting': '/content/trained_models/gradient_boosting.joblib', 'XGBoost (Scaled)': '/content/trained_models/xgboost_scaled.joblib'}
FATAL ERROR: Model file not found. Please ensure the '/content/trained_models' directory with all .joblib files exists.
Missing file: /content/trained_models/logistic_regression_balanced.joblib
Traceback (most recent call last):
  File "/tmp/ipython-input-2033521208.py", line 57, in load_assets
    loaded_models[name] = joblib.load(path)
                          ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/joblib/numpy_pickle.py", line 735, in load
    with open(filename, "rb") as f:
         ^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/content/trained_models/logis

TypeError: object of type 'NoneType' has no len()

In [15]:
# Bare expression: the notebook shows the repr of `vectorizer` as the cell's
# output. Raises NameError if no earlier cell has assigned it.
vectorizer

In [8]:

import streamlit as st
import joblib
import re
import os
import sys # Import sys to allow clean exiting

# --- 1. Define Constants and Helper Functions ---
# Directory that holds the serialized (.joblib) artifacts.
MODEL_DIR = "trained_models"

# Display name of each model -> path of its serialized file inside MODEL_DIR.
# The insertion order below is also the order shown in the algorithm selector.
MODEL_FILES = {
    display_name: os.path.join(MODEL_DIR, file_name)
    for display_name, file_name in (
        ("Logistic Regression (Balanced)", "logistic_regression_balanced.joblib"),
        ("Random Forest (Balanced)", "random_forest_balanced.joblib"),
        ("Gradient Boosting", "gradient_boosting.joblib"),
        ("XGBoost (Scaled)", "xgboost_scaled.joblib"),
    )
}
# Path of the serialized TF-IDF vectorizer, stored next to the models.
VECTORIZER_FILE = os.path.join(MODEL_DIR, "tfidf_vectorizer.joblib")

# Example job postings for easy testing
# GOOD_EXAMPLE is a sample posting written to look legitimate (concrete role,
# requirements and benefits). The "Load Authentic Example" button copies it
# verbatim into the job-text input, so edits here change what the example
# demonstrates.
GOOD_EXAMPLE = """Job Title: Senior Data Scientist
Company: TechInnovate Solutions
Description:
We are seeking a highly skilled Senior Data Scientist to join our dynamic R&D team. You will be responsible for designing and implementing advanced machine learning models to solve complex business problems, focusing on predictive maintenance and customer churn. This role requires expertise in Python, scikit-learn, TensorFlow/PyTorch, and distributed computing (Spark). The team is collaborative, focusing on technical excellence and real-world impact.
Requirements:
- PhD or Master's degree in Computer Science, Statistics, or a related quantitative field.
- 5+ years of professional experience in data science or machine learning engineering.
- Strong communication skills to present complex findings to non-technical stakeholders.
Benefits:
Competitive salary, full health/dental/vision coverage, 401(k) matching, flexible working hours, and a generous annual learning stipend."""

# BAD_EXAMPLE is a sample posting written to look suspicious: vague duties,
# an unusually wide pay range, an up-front fee and pressure to act quickly.
# The "Load Fraudulent Example" button copies it verbatim into the job-text
# input.
BAD_EXAMPLE = """Job Title: Online Earning Opportunity - High Pay, No Experience Needed!
Company: Global Synergy Wealth Creators
Description:
Tired of your 9-to-5? We offer a revolutionary system for high-volume income generation from the comfort of your home. No prior experience is necessary, and training is provided *free of charge*! Earn $500 to $5000 weekly just by processing simple online requests. This is not a sales job, but requires a high degree of motivation and secrecy. We just need motivated individuals ready to start immediately with minimal oversight.
Requirements:
- Must be 18 or older.
- Must provide a $99 'Administrative Setup Fee' upon acceptance to cover initial materials and background check costs.
- Must have reliable internet access and a bank account for direct deposits.
Contact us immediately to secure your spot! Limited positions available."""

# Keep this preprocessing identical to the one applied when the models were
# trained; any difference changes the features the vectorizer produces.
def clean_text(text):
    """Normalize raw posting text before it is vectorized.

    Steps, in order:
      1. Drop anything that looks like an HTML tag (``<...>``). Nothing is
         inserted in its place, so words separated only by a tag are joined.
      2. Drop every character that is not an ASCII letter, a digit or
         whitespace.
      3. Lower-case the remainder.

    Args:
        text: The raw text (a ``str``).

    Returns:
        The normalized text.
    """
    without_tags = re.sub('<[^>]*>', '', text)
    alphanumeric_only = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
    return alphanumeric_only.lower()

# --- 2. Load Models and Vectorizer (using Streamlit's caching) ---
# The st.cache_resource decorator is applied so that the large model files are
# loaded only once rather than on every script rerun.
@st.cache_resource
def load_assets():
    """Load the TF-IDF vectorizer and every model listed in MODEL_FILES.

    Returns:
        A ``(vectorizer, models)`` tuple, where ``models`` maps each display
        name from MODEL_FILES to the object loaded from its file.

    On failure nothing is returned: a message is printed and ``sys.exit(1)``
    is called, both when a file is missing and on any other exception.
    """
    try:
        tfidf = joblib.load(VECTORIZER_FILE)
        models_by_name = {
            label: joblib.load(file_path)
            for label, file_path in MODEL_FILES.items()
        }
    except FileNotFoundError as missing:
        # Report which file was absent, then stop.
        print(f"FATAL ERROR: Model file not found. Please ensure the '{MODEL_DIR}' directory with all .joblib files exists.")
        print(f"Missing file: {missing.filename}")
        sys.exit(1)
    except Exception as err:
        print(f"An unexpected error occurred while loading assets: {err}")
        sys.exit(1)
    else:
        return tfidf, models_by_name

# Load the assets
# load_assets() returns a (vectorizer, models-by-display-name) tuple.
vectorizer, loaded_models = load_assets()
loaded_modelsLengt = len(loaded_models)
# The f prefix is required here: without it the text "{loaded_modelsLengt}" is
# printed literally instead of the number of loaded models.
print(f"the model Length is {loaded_modelsLengt}")
# --- 3. Prediction Function ---
def predict_fraud(job_description, algorithm_name):
    """Classify a job posting as fraudulent or authentic with the chosen model.

    Args:
        job_description: Raw text of the job posting. It is passed through
            ``clean_text`` and the module-level ``vectorizer`` before
            prediction.
        algorithm_name: Key into the module-level ``loaded_models`` mapping
            that selects the model to use.

    Returns:
        A ``(result, detail, color)`` tuple of three strings on every path:
          * ``result`` - short headline;
          * ``detail`` - explanatory sentence (Markdown);
          * ``color``  - ``"red"`` for a fraudulent prediction or an error,
            ``"green"`` for an authentic prediction.
        Returning the same shape for the "model not loaded" case lets callers
        unpack the result unconditionally.
    """
    # 1. Select the model
    model = loaded_models.get(algorithm_name)
    if model is None:
        return (
            "Error: Selected model not loaded.",
            f"No loaded model is registered under the name '{algorithm_name}'.",
            "red",
        )

    # 2. Preprocess text (MUST match training preprocessing)
    cleaned_desc = clean_text(job_description)

    # 3. Vectorize text (the vectorizer expects a sequence of documents)
    vect = vectorizer.transform([cleaned_desc])

    # 4. Predict; label 1 is treated as "fraudulent", anything else as
    #    "authentic".
    pred = model.predict(vect)[0]
    is_fraud = pred == 1
    color = "red" if is_fraud else "green"

    # Handle models without predict_proba (though rare for these types)
    try:
        prob = model.predict_proba(vect)[0]
    except AttributeError:
        # Fallback if no probability is available
        return (
            f"Prediction: {'Fraudulent' if is_fraud else 'Authentic'}",
            "Confidence score not available for this model.",
            color,
        )

    # 5. Format and return result.
    # prob is indexed by position: index 1 is read as the fraudulent class and
    # index 0 as the authentic class -- assumes the model's classes are ordered
    # [0, 1]; TODO confirm against the trained models.
    # The emoji are written as escapes so they survive re-encoding of the file.
    if is_fraud:
        confidence = prob[1] * 100
        result = "\u26a0\ufe0f FRAUDULENT POSTING DETECTED"  # warning sign
        detail = f"The {algorithm_name} model predicts this post is **Fraudulent** with a **{confidence:.2f}%** confidence."
    else:
        confidence = prob[0] * 100
        result = "\u2705 AUTHENTIC JOB POSTING"  # check mark
        detail = f"The {algorithm_name} model predicts this post is **Authentic** with a **{confidence:.2f}%** confidence."

    # We return the necessary components for the Streamlit UI to display
    return result, detail, color

# --- 4. Streamlit App Interface ---
# Page-level settings: browser-tab title, tab icon and a full-width layout.
st.set_page_config(
    page_title="Fraud Job Post Detector",
    # Detective emoji (U+1F575 U+FE0F), written as escapes so the icon survives
    # re-encoding of the file; the literal had been stored as mojibake.
    page_icon="\U0001F575\uFE0F",
    layout="wide")

# Add custom CSS for white background and black text
# The <style> element is injected through st.markdown; unsafe_allow_html=True
# is required because st.markdown is asked to pass the raw HTML through. The
# string is static (no user input is interpolated into it).
# Rules, in order: app background/text colour, forced black headings and
# labels, light-grey input fields, dark-grey input labels.
st.markdown("""
<style>
.stApp {
    background-color: #FFFFFF; /* White background */
    color: #000000; /* Default text color to black */
}
/* Ensure main content elements also have black text */
h1, h2, h3, h4, .stMarkdown, .stSelectbox label {
    color: #000000 !important;
}
/* Ensure the text area input field background is light */
.stTextArea textarea,
.stTextInput input {
    background-color: #F0F2F6; /* Light gray input background */
    color: #000000;
}
.stTextArea label,
.stTextInput label {
    color: #333333 !important; /* Dark gray label text */
}
</style>
""", unsafe_allow_html=True)


# Page heading. The emoji are written as escapes (detective: U+1F575 U+FE0F,
# light bulb: U+1F4A1) so they survive re-encoding of the file; the literals
# had been stored as mojibake.
st.title("\U0001F575\uFE0F Fraud Job Post Detector")
st.markdown("### A Machine Learning Tool for Analyzing Job Text")

# Initialize session state for the text area if not present
if 'job_text' not in st.session_state:
    st.session_state.job_text = ""

# Add Expander for Examples to drive interaction
with st.expander("\U0001F4A1 Click here to load example job postings"):
    st.markdown("Use these examples to see how the models distinguish between legitimate and suspicious language.")
    col_ex_1, col_ex_2 = st.columns(2)

    # Each button overwrites the shared 'job_text' session entry, which is the
    # same key the job-description text area is bound to.
    if col_ex_1.button("Load Authentic Example", type="secondary", use_container_width=True):
        st.session_state.job_text = GOOD_EXAMPLE
    if col_ex_2.button("Load Fraudulent Example", type="secondary", use_container_width=True):
        st.session_state.job_text = BAD_EXAMPLE

    # Preview of whatever text is currently stored under 'job_text'.
    st.markdown("---")
    st.code(st.session_state.job_text, language='markdown')

# --- Interface Setup ---
# Two columns: a wide one for the job text, a narrow one for the algorithm
# selector, the predict button and the result.
col1, col2 = st.columns([3, 1])
with col1:
    job_text = st.text_area(
        "Paste Job Description Text Here",
        key='job_text', # Link to session state
        height=300,
        placeholder="Paste job title, company profile, description, requirements, and benefits here..."
    )

with col2:
    # Build the list of selectable names once instead of on every use.
    algorithm_names = list(MODEL_FILES.keys())
    algorithm_choice = st.selectbox(
        "Select Algorithm",
        options=algorithm_names,
        index=algorithm_names.index('XGBoost (Scaled)') if 'XGBoost (Scaled)' in algorithm_names else 0, # Default to XGBoost if available
        help="Choose the trained model you want to use for prediction."
    )
    st.markdown("### Prediction Result")

    # Single Button Logic:
    if st.button('Predict Fraudulence', type="primary", key='predict_main_btn'):
        # Treat whitespace-only input as empty: there is nothing to classify,
        # so warn instead of running the model on a blank string.
        if job_text and job_text.strip():
            # Perform prediction
            with st.spinner('Analyzing text and calculating prediction...'):
                result, detail, color = predict_fraud(job_text, algorithm_choice)

            # Display results with dynamic styling: red palette when color is
            # 'red', green palette otherwise. Only the values returned by
            # predict_fraud are interpolated into this HTML.
            st.markdown(f"""
            <div style='background-color: {'#f8d7da' if color == 'red' else '#d4edda'};
                         color: {'#721c24' if color == 'red' else '#155724'};
                         padding: 15px;
                         border-radius: 10px;
                         border: 2px solid {'#f5c6cb' if color == 'red' else '#c3e6cb'};'>
                <h3>{result}</h3>
            </div>
            """, unsafe_allow_html=True)

            # Display the confidence detail
            st.markdown(f"**{detail}**")
        else:
            st.warning("Please paste some job text to analyze.")

# --- Footer Information ---
# Horizontal rule followed by an info box. The message is two adjacent string
# literals that Python concatenates; only the second is an f-string, so that
# the current MODEL_DIR value is interpolated into the note.
st.markdown("---")
st.info(
    "**Technical Note:** The models (Logistic Regression, Random Forest, Gradient Boosting, XGBoost) and the TF-IDF Vectorizer are loaded from the "
    f"**`{MODEL_DIR}`** directory using `joblib.load()` for fast deployment. The models are trained on textual features extracted from job posting datasets.")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



FATAL ERROR: Model file not found. Please ensure the 'trained_models' directory with all .joblib files exists.
Missing file: trained_models/logistic_regression_balanced.joblib
Traceback (most recent call last):
  File "/tmp/ipython-input-3855799344.py", line 60, in load_assets
    loaded_models[name] = joblib.load(path)
                          ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/joblib/numpy_pickle.py", line 735, in load
    with open(filename, "rb") as f:
         ^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'trained_models/logistic_regression_balanced.joblib'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-3855799344.py", line 73, in <cell line: 0>
    vectorizer, loaded_models = l

TypeError: object of type 'NoneType' has no len()