In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [10]:
# Filesystem locations used by this notebook (both relative to the working directory).
# DATA_FILE is the CSV that gets loaded with pd.read_csv for training.
DATA_FILE = "./output-files/processed_users.csv"
# MODEL_OUTPUT_FILE is where the fitted model is serialized with joblib.
MODEL_OUTPUT_FILE = "./random_forest_model.pkl"

In [11]:
# Name of the label column in the dataset.
# NOTE(review): the classification report labels suggest 0 = genuine, 1 = fake — confirm encoding.
TARGET = "is_fake"

# Columns of the dataset that are fed to the classifier as input features.
# The order here is the column order of the feature matrix.
FEATURES = [
    "default_profile_image",
    "statuses_count",
    "followers_count",
    "friends_count",
    "protected",
    "name_length",
    "follower_ratio",
]

In [12]:
# Train a random-forest classifier on the processed user dataset, report its
# hold-out performance, and persist the fitted model with joblib.
#
# Only the dataset load and column selection are guarded by the try block, so
# the "file not found" / "column not found" messages are printed solely for
# those causes. Errors raised later (training, evaluation, saving the model)
# propagate with their real traceback instead of being mislabelled.

# Tunable settings for the split and the model.
TEST_SIZE = 0.2      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # shared seed so the split and the forest are reproducible
N_ESTIMATORS = 100   # number of trees in the forest

try:
    data = pd.read_csv(DATA_FILE)

    # Selecting a column that is absent from the CSV raises KeyError (handled below).
    X = data[FEATURES]
    Y = data[TARGET]
except FileNotFoundError:
    print(f"Error: The dataset file {DATA_FILE} was not found.")
except KeyError as e:
    print(f"Error: One of the defined columns was not found in the CSV. Check feature names: {e}")
else:
    # stratify=Y keeps the class proportions the same in the train and test splits.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=Y
    )

    print(f"Training Model on : {len(X_train)} profiles")
    print(f"Testing Model on : {len(X_test)} profiles")

    print("Starting Model Training...")
    model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_STATE)
    model.fit(X_train, Y_train)
    print("Model Training Completed")
    print("----------------------------------------------------")

    print("Evaluating Model...")
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)

    print("-------PoC Performance Report-------")
    # Plain ".2f" (no space sign flag) avoids a stray extra space before the number.
    print(f"Overall Accuracy: {accuracy * 100:.2f}%")
    print("\nDetailed Classification Report:")
    # target_names are matched to the labels in sorted order, so this assumes the
    # target is encoded as 0 = genuine, 1 = fake — TODO confirm against the dataset.
    print(classification_report(Y_test, Y_pred, target_names=['Genuine(0)', 'Fake(1)']))

    # Persist the fitted model so it can be reloaded later with joblib.load.
    joblib.dump(model, MODEL_OUTPUT_FILE)
    print(f"Trained Model saved at {MODEL_OUTPUT_FILE}")

Training Model on : 5460 profiles
Testing Model on : 1365 profiles
Starting Model Training...
Model Training Completed
----------------------------------------------------
Evaluating Model...
-------PoC Performance Report-------
Overall Accuracy:  98.39%

Detailed Classification Report:
              precision    recall  f1-score   support

  Genuine(0)       0.99      0.98      0.98       695
     Fake(1)       0.98      0.99      0.98       670

    accuracy                           0.98      1365
   macro avg       0.98      0.98      0.98      1365
weighted avg       0.98      0.98      0.98      1365

Trained Model saved at ./random_forest_model.pkl
