In [17]:
from flask import Flask, request, jsonify
from flask_cors import CORS
import joblib
import numpy as np
import pandas as pd
from io import StringIO
import threading
import time
import nest_asyncio
import sys

In [18]:
# Location of the serialized model, relative to the notebook's working directory.
# (The constant name is misspelled, but other cells reference it under this name.)
MODLE_FILE_PATH = './random_forest_model.pkl'

# Columns selected, in this order, as the model input in batch_predict.
EXPECTED_FEATURES = [
    'default_profile_image', 'statuses_count', 'followers_count',
    'friends_count', 'protected', 'name_length', 'follower_ratio',
]

# Raw CSV columns the upload is expected to contain, mapped to the name used
# internally (currently an identity mapping). Insertion order is the order in
# which missing columns are reported.
RAW_COLUMNS_NEEDED = {
    column: column
    for column in (
        'statuses_count',
        'followers_count',
        'friends_count',
        'default_profile_image',
        'protected',
        'name',
    )
}

In [19]:
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code. Only load model artifacts that come from a trusted source.
#
# MODEL is bound to None up front so that a failed load leaves a well-defined
# value: later cells can test for it, and re-running this cell after the file
# has gone missing does not silently keep a stale model from a previous run.
MODEL = None
try:
    MODEL = joblib.load(MODLE_FILE_PATH)
    print(f"Model {MODLE_FILE_PATH} loaded succesfully.")
except FileNotFoundError:
    print(f"Missing model file: {MODLE_FILE_PATH}")

Model ./random_forest_model.pkl loaded succesfully.


In [20]:
# Create the Flask application object that the route decorators in later cells attach to.
# Re-running this cell replaces `app` with a fresh instance that has none of those routes.
app = Flask(__name__)
# Wrap the app with flask_cors using its default settings.
CORS(app)

<flask_cors.extension.CORS at 0x25be014ff20>

In [21]:
def assign_risk_category(score):
    """Map a suspicion score to a risk label and a display colour.

    Bands (checked from the highest down, first match wins):
        score >= 91  -> CRITICAL RISK / red
        score >= 66  -> BOT / amber
        score >  30  -> MODERATE / yellow
        otherwise    -> GENUINE / green

    The lowest boundary is ``> 30`` rather than ``>= 31`` so that it agrees
    with the ``Suspicion_Score > 30`` cut-off the endpoints use to decide
    which profiles are flagged. Scores are rounded to two decimals, so values
    such as 30.5 occur; with ``>= 31`` they were listed as suspicious yet
    labelled GENUINE. Whole-number scores map exactly as before.

    Args:
        score: numeric suspicion score (the callers pass values in 0-100).

    Returns:
        dict with the keys ``category`` and ``color``.
    """
    if score >= 91:
        return {"category": "CRITICAL RISK", "color": "red"}
    if score >= 66:
        return {"category": "BOT", "color": "amber"}
    if score > 30:
        return {"category": "MODERATE", "color": "yellow"}
    return {"category": "GENUINE", "color": "green"}

In [22]:
# Re-running this cell would register 'health' and 'batch_predict' a second time.
# Drop any view function already stored under those endpoint names first so the
# decorators below do not fail with a duplicate-registration error.
for _ep in ('health', 'batch_predict'):
    app.view_functions.pop(_ep, None)

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe: always answers 200 with ``{"status": "ok"}``."""
    payload = {'status': 'ok'}
    return jsonify(payload), 200

@app.route('/batch-predict', methods=['POST'])
def batch_predict():
    """Score every profile in an uploaded CSV and return the suspicious ones.

    Expects a multipart upload with a ``file`` part whose name ends in
    ``.csv`` (case-insensitive). Column names are stripped and lower-cased
    before use. Columns listed in ``RAW_COLUMNS_NEEDED`` that are absent are
    filled with defaults (0 for numeric columns, ``'unknown'`` for the name)
    and reported in the completeness message instead of failing the request.

    Returns:
        JSON body with
            completness_score / completness_total / completness_message:
                how many of the required raw columns the upload contained,
            profiles:
                rows with ``Suspicion_Score`` > 30, highest score first
                (``Profile_ID`` / ``Handle`` are null when the CSV has no
                ``id`` / ``screen_name`` column),
            suspicion_score_average:
                mean score of those rows, rounded; 0 when there are none.

        Errors are JSON ``{'error': ..., 'details': ...}`` with status 400
        for a bad upload or unparseable data and 500 for transformation or
        prediction failures.

    Side effect:
        Stores a copy of the fully scored frame in the module global
        ``LAST_VIZ_DF``.
    """
    numeric_columns = (
        'followers_count',
        'friends_count',
        'statuses_count',
        'default_profile_image',
        'protected',
    )
    table_columns = ['id', 'name_raw', 'screen_name', 'Suspicion_Score', 'Risk_Category', 'Risk_Color']

    # --- request validation ---
    if 'file' not in request.files:
        return jsonify({'error': 'No file part in the request'}), 400

    file = request.files['file']
    # filename can be None or empty when nothing was selected; accept ".CSV" as well
    if not (file.filename or '').lower().endswith('.csv'):
        return jsonify({'error': 'No selected file or file is not a CSV'}), 400

    raw = file.read()
    try:
        # utf-8-sig also drops a leading BOM, which would otherwise be glued
        # onto the first header name and make that column look missing
        file_content = raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        file_content = raw.decode("latin1", errors="replace")

    # --- parsing: malformed or empty uploads are client errors, not crashes ---
    try:
        df = pd.read_csv(StringIO(file_content))
    except Exception as e:
        return jsonify({'error': 'CSV Parsing Failed', 'details': str(e)}), 400

    if df.empty:
        return jsonify({'error': 'CSV file contains no data rows'}), 400

    # --- normalization and type coercion ---
    try:
        # lowercase the incoming column names to avoid case-mismatch with model features
        df.columns = [str(c).strip().lower() for c in df.columns]

        # Completeness must be measured BEFORE defaults are filled in,
        # otherwise every required column would appear to be present.
        missing_features_list = [
            col for col in RAW_COLUMNS_NEEDED.values() if col not in df.columns
        ]
        feature_present_count = len(RAW_COLUMNS_NEEDED) - len(missing_features_list)

        for col in numeric_columns:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
            else:
                # absent column: fall back to 0 for every row
                df[col] = 0
    except Exception as e:
        return jsonify({'error': 'Data Pre-cleaning Failed', 'details': str(e)}), 400

    # --- derived features ---
    try:
        # The name stays textual: missing values become 'unknown' rather than 0.
        # NOTE(review): confirm the model was trained with the same placeholder.
        if 'name' in df.columns:
            df['name_raw'] = df['name'].fillna('unknown')
        else:
            df['name_raw'] = 'unknown'
        df['name_length'] = df['name_raw'].apply(lambda x: len(str(x)))
        # +1 keeps the ratio defined for accounts that follow nobody
        df['follower_ratio'] = df['followers_count'] / (df['friends_count'] + 1)
        df['default_profile_image'] = df['default_profile_image'].astype(int)
        df['protected'] = df['protected'].astype(int)
        df['statuses_count'] = df['statuses_count'].astype(int)
    except Exception as e:
        return jsonify({'error': 'Data Transformation Failed', 'details': str(e)}), 500

    # --- prediction; any failure is reported as JSON ---
    try:
        model = globals().get('MODEL')
        if model is None:
            raise RuntimeError('Model is not loaded')
        X_predict = df[EXPECTED_FEATURES]
        # column 1 of predict_proba is used as the suspicion probability
        prediction_probas = model.predict_proba(X_predict)[:, 1]
        df['Suspicion_Score'] = np.round(prediction_probas * 100, 2)
        df['Risk_Analysis'] = df['Suspicion_Score'].apply(assign_risk_category)
        df['Risk_Category'] = df['Risk_Analysis'].apply(lambda x: x['category'])
        df['Risk_Color'] = df['Risk_Analysis'].apply(lambda x: x['color'])
    except Exception as e:
        return jsonify({'error': 'Prediction Failed', 'details': str(e)}), 500

    # --- response assembly ---
    completness_message = f"{feature_present_count} of {len(RAW_COLUMNS_NEEDED)} critical raw columns present. Missing : {', '.join(missing_features_list) if missing_features_list else 'None' }"
    df_suspicious = df[df['Suspicion_Score'] > 30].sort_values(by='Suspicion_Score', ascending=False)
    avg_suspicion = df_suspicious['Suspicion_Score'].mean() if not df_suspicious.empty else 0

    # reindex (instead of [...] selection) tolerates uploads without 'id' / 'screen_name'
    final_table_data = df_suspicious.reindex(columns=table_columns).rename(
        columns={'id': 'Profile_ID', 'name_raw': 'Name', 'screen_name': 'Handle'}
    )
    # NaN is not valid JSON; emit null for missing cells instead
    final_table_data = final_table_data.astype(object).where(final_table_data.notna(), None)

    globals()['LAST_VIZ_DF'] = df.copy()

    response_data = {
        'completness_score': feature_present_count,
        'completness_total': len(RAW_COLUMNS_NEEDED),
        'completness_message': completness_message,
        'profiles': final_table_data.to_dict('records'),
        'suspicion_score_average': round(avg_suspicion, 0)
    }
    return jsonify(response_data)

In [23]:
@app.route('/viz-data', methods=['GET'])
def viz_data():
    """Return chart-ready aggregates for the most recently scored upload.

    Reads the module global ``LAST_VIZ_DF``; answers 404 with an error body
    when it is unset or None. Otherwise the frame is split at
    ``Suspicion_Score`` 30 (<= 30 genuine, > 30 fake) and the response holds:

        genuineData / fakeData      followers vs. friends points (max 1000 each)
        profilePicData              has-picture / no-picture counts per group
        genuinePrivacy / fakePrivacy  protected vs. public counts
        featureImportance           per-feature importances, when the model exposes them
    """
    snapshot = globals().get('LAST_VIZ_DF')
    if snapshot is None:
        return jsonify({'error': 'No visualization data available yet.'}), 404

    frame = snapshot.copy()

    # Same cut-off as batch_predict: scores above 30 count as flagged.
    if 'Suspicion_Score' in frame.columns:
        genuine = frame[frame['Suspicion_Score'] <= 30]
        fake = frame[frame['Suspicion_Score'] > 30]
    else:
        genuine = frame.iloc[0:0]
        fake = frame.iloc[0:0]

    max_points = 1000  # cap per group to keep the payload reasonable

    def scatter_points(subset):
        """First ``max_points`` complete (followers, friends) pairs as records."""
        wanted = ['followers_count', 'friends_count']
        if not all(col in subset.columns for col in wanted):
            return []
        return subset[wanted].dropna().head(max_points).to_dict(orient='records')

    def picture_split(subset):
        """(has picture, no picture); default_profile_image == 0 is taken as a custom picture."""
        if 'default_profile_image' not in subset.columns:
            return 0, len(subset)
        flag = subset['default_profile_image']
        return int((flag == 0).sum()), int((flag != 0).sum())

    def privacy_split(subset):
        """Protected vs. public counts in the shape the pie chart expects."""
        if 'protected' in subset.columns:
            n_protected = int(subset['protected'].sum())
            n_public = int(len(subset) - n_protected)
        else:
            n_protected, n_public = 0, len(subset)
        return [
            {'name': 'Protected', 'value': n_protected},
            {'name': 'Public', 'value': n_public},
        ]

    genuine_has, genuine_no = picture_split(genuine)
    fake_has, fake_no = picture_split(fake)

    # Feature importance is best-effort: any problem yields an empty list.
    featureImportance = []
    try:
        if 'MODEL' in globals() and hasattr(MODEL, 'feature_importances_'):
            weights = np.asarray(MODEL.feature_importances_).tolist()
            featureImportance = [
                {'feature': feature, 'importance': float(weight)}
                for feature, weight in zip(EXPECTED_FEATURES, weights)
            ]
    except Exception:
        featureImportance = []

    return jsonify({
        'genuineData': scatter_points(genuine),
        'fakeData': scatter_points(fake),
        'profilePicData': [
            {'label': 'Has Picture', 'Genuine': genuine_has, 'Fake': fake_has},
            {'label': 'No Picture', 'Genuine': genuine_no, 'Fake': fake_no},
        ],
        'genuinePrivacy': privacy_split(genuine),
        'fakePrivacy': privacy_split(fake),
        'featureImportance': featureImportance,
    }), 200

In [None]:
# Start the Flask server in a background thread with robust logging, and provide a small readiness endpoint.
# Added by Copilot for robustness.
import traceback
# NOTE(review): presumably applied because the notebook kernel already runs an
# event loop; confirm it is actually required for this server.
nest_asyncio.apply()
# Thread object left in the namespace by an earlier run of this cell, or None on the first run.
st = globals().get('server_thread', None)

def _is_thread_alive(t):
    try:
        return bool(t and getattr(t, 'is_alive', lambda: False)())
    except Exception:
        return False

if _is_thread_alive(st):
    # An earlier run of this cell already started the server; do not start a second one.
    print("[flask] server_thread already running:", _is_thread_alive(st))
else:
    def run_flask_app():
        """Thread target: serve `app` on port 8000 and log any exception instead of dying silently."""
        try:
            print("[flask] run_flask_app starting on port 8000")
            # the reloader is disabled because this runs outside the main thread
            app.run(debug=False, use_reloader=False, port=8000)
        except Exception as exc:
            print("[flask] run_flask_app exception:", exc)
            traceback.print_exc()

    # daemon=True so the server thread does not keep the kernel process alive on shutdown
    server_thread = threading.Thread(target=run_flask_app, daemon=True)
    server_thread.start()
    # short pause so the liveness print below reflects a thread that has actually started
    time.sleep(0.2)
    print("[flask] server_thread started:", _is_thread_alive(server_thread))

def viz_ready():
    """Readiness probe for the visualisation endpoints.

    Returns 200 with a JSON body holding:
        server_thread_alive: whether the module global ``server_thread``
            exists and reports itself alive,
        has_last_viz_df: whether the module global ``LAST_VIZ_DF`` is set
            to something other than None.
    """
    st = globals().get('server_thread', None)
    alive = bool(st and getattr(st, 'is_alive', lambda: False)())
    has_df = globals().get('LAST_VIZ_DF') is not None
    return jsonify({'server_thread_alive': alive, 'has_last_viz_df': bool(has_df)}), 200


# Register the route only once. This cell is routinely re-run while the server
# is already up; registering the endpoint again would make Flask raise, either
# because the endpoint name is already mapped to a different function object or
# because the app has already started handling requests.
if 'viz_ready' not in app.view_functions:
    app.add_url_rule('/viz-ready', endpoint='viz_ready', view_func=viz_ready, methods=['GET'])

[flask] server_thread already running: True


127.0.0.1 - - [23/Oct/2025 11:43:03] "GET /health HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:03] "POST /batch-predict HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:03] "POST /batch-predict HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:07] "GET /viz-data HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:07] "GET /viz-data HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:08] "GET /viz-data HTTP/1.1" 200 -
127.0.0.1 - - [23/Oct/2025 11:43:08] "GET /viz-data HTTP/1.1" 200 -
