In [4]:
# packages
import pandas as pd
from mod02_build_bot_predictor import train_model

In [3]:
import sys
# Run pip through the interpreter backing this kernel (sys.executable) so the
# packages are installed into the environment the notebook actually imports from.
# NOTE(review): scikit-learn is installed unpinned; the captured output of this
# cell shows 1.8.0 being resolved — consider pinning for reproducibility.
!"{sys.executable}" -m pip install -U pip
!"{sys.executable}" -m pip install scikit-learn



Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp311-cp311-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   --------------- ------------------------ 3.1/8.1 MB 23.1 MB/s eta 0:00:01
   ---------------------------------------- 8.1/8.1 MB 25.1 MB/s  0:00:00
Using cached joblib-1.5.3-py3-none-any.whl (309 kB)
Downloading scipy-1.17.0-cp311-cp311-win_amd64.whl (36.4 MB)
   ---------------------------------------- 0.0/36.4 MB ? eta -:--:--
   --------- ------------------------------ 8.7/36.4 MB 41.3 MB/s eta 0:00:01
   ------

### Define a function to extract predictions from the model

In [5]:
def predict_bot(df, model=None):
    """
    Label every account in ``df`` as a bot (1) or a human (0).

    Args:
        df: Feature table passed unchanged to ``model.predict``; its index is
            reused for the returned Series.
        model: Object exposing ``predict(df)``. When left as ``None``, a model
            is obtained by calling ``train_model()`` with no arguments.

    Returns:
        pd.Series holding the model's predictions, indexed like ``df``.
    """
    # NOTE(review): elsewhere in this notebook train_model is called as
    # train_model(X_train, y_train); confirm the zero-argument fallback works.
    classifier = train_model() if model is None else model
    return pd.Series(classifier.predict(df), index=df.index)

### Define a function to evaluate model error

In [6]:
def confusion_matrix_and_metrics(y_true, y_pred):
    """
    Computes confusion matrix and common error rates for binary classification.

    Assumes labels:
      0 = negative class
      1 = positive class

    Args:
      y_true: iterable of actual labels.
      y_pred: iterable of predicted labels, paired positionally with y_true.

    Returns:
      dict with:
        tp, tn, fp, fn            -- counts of each outcome
        misclassification_rate    -- (fp + fn) / total
        false_positive_rate       -- fp / (fp + tn)
        false_negative_rate       -- fn / (fn + tp)
      Any rate whose denominator is zero is reported as 0.0.

    Raises:
      ValueError: if a label is not 0 or 1, or if y_true and y_pred do not
        contain the same number of items.
    """
    # Local import keeps this cell runnable without touching the imports cell.
    from itertools import zip_longest

    # Unique sentinel: plain zip() would silently drop the unmatched tail of the
    # longer input and report metrics for only part of the data.
    missing = object()

    tn = fp = fn = tp = 0

    for yt, yp in zip_longest(y_true, y_pred, fillvalue=missing):
        if yt is missing or yp is missing:
            raise ValueError("y_true and y_pred must have the same length")

        if yt == 0 and yp == 0:
            tn += 1
        elif yt == 0 and yp == 1:
            fp += 1
        elif yt == 1 and yp == 0:
            fn += 1
        elif yt == 1 and yp == 1:
            tp += 1
        else:
            raise ValueError("Labels must be 0 or 1")

    total = tn + fp + fn + tp
    actual_negatives = fp + tn
    actual_positives = fn + tp

    # Guard each division so empty input (or a missing class) yields 0.0
    # instead of ZeroDivisionError.
    misclassification_rate = (fp + fn) / total if total > 0 else 0.0
    false_positive_rate = fp / actual_negatives if actual_negatives > 0 else 0.0
    false_negative_rate = fn / actual_positives if actual_positives > 0 else 0.0

    return {
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "misclassification_rate": misclassification_rate,
        "false_positive_rate": false_positive_rate,
        "false_negative_rate": false_negative_rate,
    }


### Load the data

In [9]:
# Both paths are relative, so they resolve against the notebook's working directory.
TRAIN_PATH, TEST_PATH = "mod02_data/train.csv", "mod02_data/test.csv"

# Read each split into its own DataFrame.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

### Format the data by independent vs. dependent variables

In [10]:
# "is_bot" is the dependent variable; every remaining column is kept as a feature.
X_train, y_train = train.drop(columns=["is_bot"]), train["is_bot"]
X_test, y_test = test.drop(columns=["is_bot"]), test["is_bot"]

### Build the model on training data

In [11]:
# Build the model from the training split only; the test split is not passed in.
model = train_model(X_train, y_train)

### Get the model predictions on training and test data

In [12]:
# Score both splits with the same model so their error rates are comparable.
y_pred_train, y_pred_test = (
    predict_bot(features, model) for features in (X_train, X_test)
)

### Check results on the training set (data used to build the model)

In [13]:
# Training-set metrics: these labels belong to the rows used to build the model.
confusion_matrix_and_metrics(y_train, y_pred_train)

{'tp': 68,
 'tn': 2613,
 'fp': 24,
 'fn': 295,
 'misclassification_rate': 0.10633333333333334,
 'false_positive_rate': 0.009101251422070534,
 'false_negative_rate': 0.8126721763085399}

### Check results on the test set (new data not yet seen by the model)

In [14]:
# Test-set metrics: these labels were not passed to train_model.
confusion_matrix_and_metrics(y_test, y_pred_test)

{'tp': 17,
 'tn': 871,
 'fp': 3,
 'fn': 109,
 'misclassification_rate': 0.112,
 'false_positive_rate': 0.003432494279176201,
 'false_negative_rate': 0.8650793650793651}

# Discussion Questions

### Based on the misclassification rate of your model, discuss your confidence in the ability to predict a bot. 

I would have low confidence in the model's ability to predict a bot. The overall misclassification rate (about 11% on the test set) looks acceptable only because there are far more humans than bots in the dataset. However, the false negative rate is very high (about 87% on the test set), which means the model plays it safe by labelling almost everything as human and misses the majority of the bots.

### What are potential ramifications of false positives from the model?

The ramifications of false positives are that real users could be banned or have their accounts flagged, which would increase the volume of customer support requests. Although the false positive rate is small, it could still affect a lot of users.

### What are potential ramifications of false negatives from the model?

False negatives could result in many spam accounts going unflagged, leading to more misinformation, fake engagement, botted followers, and distorted metrics. Unfortunately, apps such as Instagram have often struggled to remove the majority of bots from their platforms, so it is difficult to fully stop the problem.