# AI Tango

## Model Experiments

First, load in dependencies, as well as the ModelBuilder which is meant for rapidly iterating on various options for our datasets and models.

In [14]:
from pose_parser.pose_parser.learning import model_builder as mb
from scipy.stats import randint

model_builder = mb.ModelBuilder()

value_map = {
    "weight_transfer_type": {
        "Failure Weight Transfer": 0,
        "Successful Weight Transfer": 1,
    },
    "step_type": {
        "Left Step": 0,
        "Right Step": 1,
    },
}
drop_list = ["video_id"]

There are a variety of datasets to choose from and more can be generated. Here is a list:


In [None]:
# this one is the original "avg dataset used"
# data_file = "./data/annotated_videos/dataset_1678732901064497000.csv"

# this one includes more pooled stats (max)
# data_file = "./data/annotated_videos/dataset_1679002854718304000.csv"

# this one is 45 frame window pooled
# data_file = "./data/annotated_videos/dataset_1679015606654767000.csv"

# this one is 25 frame window pooled
# data_file = "./data/annotated_videos/dataset_1679016147487099000.csv"

# this one is a flat column representation frame by frame angles of a labeled 10 frame window
# data_file = "./data/annotated_videos/dataset_1679087888313443000.csv"

# this one is a flat column representation frame by frame angles of a labeled 25 frame window
# data_file = "./data/annotated_videos/dataset_1679103956737220000.csv"

# Model / Dataset comparisons

We're looking predict the target variable "weight_transfer_type. Our dataset is pretty imbalanced, with most examples being a "successful weight transfer".


## Dataset

### Flat column representation, frame by frame angles of labeled 25 frame window

In [9]:
data_file = "./data/annotated_videos/dataset_1679103956737220000.csv"

column_whitelist = [
    # "angles_max.line_5_6__line_6_7_angle_2d_degrees",
    # "angles_std.line_5_6__line_25_26_angle_2d_degrees",
    # "angles_avg.line_5_6__line_6_7_angle_2d_degrees",
    # "angles_avg.line_8_9__line_9_10_angle_2d_degrees",
    # "angles_max.line_5_6__line_25_26_angle_2d_degrees",
    # "angles_max.line_2_3__line_25_26_angle_2d_degrees",
    # "angles_avg.line_1_5__line_5_6_angle_2d_degrees",
    # "angles_avg.line_2_3__line_25_26_angle_2d_degrees",
    # "angles_std.line_1_5__line_5_6_angle_2d_degrees",
]
drop_list = ["video_id"]
model_builder.load_and_prep_dataset_from_csv(
    path=data_file,
    target="weight_transfer_type",
    value_map=value_map,
    column_whitelist=column_whitelist,
    drop_list=drop_list,
)

True


First, let's upsample the minority class by just randomly oversampling. Then train the data on a random forest classifier:

Spec:
* Num estimators 600
* Max Depth 9
* Max Leaf Notes: 63

These features are compared to a well-performing AutoML model - SparseNormalizer, XGBoostClassifier


In [10]:
# mb.run_pca(num_components=5)
model_builder.set_train_test_split(
    balance_off_target=True,
    upsample_minority=True,
    downsample_majority=False,
    use_SMOTE=False,
    random_state=123,
)

param_dist = {
    "n_estimators": randint(50, 500),
    "max_depth": randint(1, 20),
    "max_features": randint(3, 20),
}
rf_params = {
    # "class_weight": "balanced_subsample",
    # "class_weight": "balanced",
    "n_estimators": 600,
    "max_depth": 9,
    "max_leaf_nodes": 63,
}

model_builder.train_random_forest(
    use_random_search=False, params=rf_params, param_dist=param_dist
)
model_builder.report()


Training Balance for weight_transfer_type:
1    965
0    965
Name: weight_transfer_type, dtype: int64
Mean ROC AUC from cross validation: 0.963
Min ROC AUC from cross validation: 0.939
Max ROC AUC from cross validation: 0.993
Confusion matrix:
[[ 12  29]
 [ 11 231]]
Accuracy: 0.8586572438162544
Precision: 0.8884615384615384
Recall: 0.9545454545454546
              precision    recall  f1-score   support

           0       0.52      0.29      0.38        41
           1       0.89      0.95      0.92       242

    accuracy                           0.86       283
   macro avg       0.71      0.62      0.65       283
weighted avg       0.84      0.86      0.84       283

Type Random Forest
Data_file ./data/annotated_videos/dataset_1679103956737220000.csv
AUC 0.6236141906873615
Accuracy 0.8586572438162544
Precision 0.8884615384615384
Recall 0.9545454545454546
Confusion_matrix [[ 12  29]
 [ 11 231]]
Feature_importances [0.         0.00014901 0.00039894 ... 0.         0.         0.       

These results show a fairly low AUC (Area under curve).

### Next try SMOTE

In [15]:
# mb.run_pca(num_components=5)
model_builder.set_train_test_split(
    balance_off_target=True,
    upsample_minority=True,
    downsample_majority=False,
    use_SMOTE=True,
    random_state=123,
)

param_dist = {
    "n_estimators": randint(50, 500),
    "max_depth": randint(1, 20),
    "max_features": randint(3, 20),
}
rf_params = {
    # "class_weight": "balanced_subsample",
    # "class_weight": "balanced",
    "n_estimators": 600,
    "max_depth": 9,
    "max_leaf_nodes": 63,
}

model_builder.train_random_forest(
    use_random_search=False, params=rf_params, param_dist=param_dist
)
model_builder.report()


AttributeError: 'ModelBuilder' object has no attribute 'X'

In [None]:
""" WRITE NOTES ON THIS RUN HERE """
notes = """
    Dataset notes:
    Flat column representation of 10 windows of frame data angles

    Model notes:
    PCA on a rand search Random Forest. 
    """
if False:
    model_builder.save_model_and_datasets(notes=notes)