**SUMMARY**

Fits a simple Random Forest classifier using transformers for preprocessing.

The model is first fitted 'manually' (each transformer applied by hand), and the result is then checked against an equivalent scikit-learn `Pipeline`.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score, recall_score

from transformers import *
from sklearn.pipeline import Pipeline

In [2]:
# Seed shared by the train/test split and by every RandomForestClassifier below.
RANDOM_SEED = 932024

# Tokens passed to FlagIfStrContains as `tokens_to_match` when building the
# "pronouns_in_title" feature from the "title" column.
female_pronouns = [
    "she",
    "her",
    "her's",
    "women",
    "woman",
    "lady",
    "lady's",
]

# Load Data

In [3]:
# Column names used throughout the notebook.
y_col = "bt_pass"    # target column passed to the classifier as y
key_col = "imdbid"   # row identifier column; not a model feature

In [4]:
# Read the raw dataset (path is relative to the notebook's working directory)
# and print its (rows, columns) shape as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="../data/raw.csv")
print(f"{df.shape}")

(7271, 22)


In [5]:
# Split into train and test: 30% of the rows are held out for testing, and the
# shuffle is seeded so the partition is reproducible.
train, test = train_test_split(
    df,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

# Train Manually

In [6]:
# Feature columns for the manual fit. The list order is the column order the
# classifier sees, so keep it stable when editing.
X_cols = [
    # Columns taken directly from the raw data
    "vote_average",
    "vote_count",
    "cast_female_representation",
    "crew_female_representation",
    "year",
    # One-hot genre flags derived from the "genres" column
    "genres__War",
    "genres__Science Fiction",
    "genres__Mystery",
    "genres__Drama",
    "genres__Action",
    "genres__Animation",
    "genres__Horror",
    "genres__Romance",
    "genres__Fantasy",
    "genres__Family",
    "genres__Western",
    "genres__Music",
    "genres__Documentary",
    "genres__History",
    "genres__Thriller",
    "genres__Adventure",
    "genres__Crime",
    "genres__Comedy",
    "genres__TV Movie",
    # Flag derived from the "title" column
    "pronouns_in_title",
    # Log-transformed columns
    "revenue__log",
    "popularity__log",
    "budget__log",
]

# Sanity check on the number of features.
print(len(X_cols))

28


In [7]:
# Manual baseline: apply every preprocessing step by hand, then fit and score a
# random forest on the columns listed in `X_cols`.
#
# The transformed frames are stored under their own names and are built from
# copies, so `train` and `test` keep holding the raw rows. Cells further down
# that run the same preprocessing themselves then start from raw data, and
# re-running this cell does not re-transform its own output.

# Build each transformer once; the same instances are used for train and test.
tag_trans = OneHotEncodeFromTags("genres")
pronoun_trans = FlagIfStrContains("title", "pronouns_in_title", tokens_to_match=female_pronouns)
log_trans = LogAfterZeroReplacement(["revenue", "popularity", "budget"], zero_replacement=1)

# Transform train (the tag encoder is the only step that is fitted)
train_manual = tag_trans.fit_transform(train.copy())
train_manual = pronoun_trans.transform(train_manual)
train_manual = log_trans.transform(train_manual)

print(train_manual.shape)
print(train_manual.columns)


# Transform test, reusing the tag encoder fitted on train
test_manual = tag_trans.transform(test.copy())
test_manual = pronoun_trans.transform(test_manual)
test_manual = log_trans.transform(test_manual)

print(test_manual.shape)
print(test_manual.columns)

# Fit model
rf = RandomForestClassifier(random_state = RANDOM_SEED)
rf.fit(train_manual[X_cols], train_manual[y_col])

# Predict: column 1 of predict_proba is used as the score, predict() gives labels
preds_bin = rf.predict_proba(test_manual[X_cols])[:, 1]
labels_bin = rf.predict(test_manual[X_cols])

# Evaluate: threshold-free metrics use the scores, precision/recall use the labels
print("AUC: {0:.3f}".format(roc_auc_score(test_manual[y_col], preds_bin, average="weighted")))
print("Average Precision: {0:.3f}".format(average_precision_score(test_manual[y_col], preds_bin)))
print("Precision: {0:.3f}".format(precision_score(test_manual[y_col], labels_bin, average="weighted")))
print("Recall: {0:.3f}".format(recall_score(test_manual[y_col], labels_bin, average="weighted")))

(5089, 45)
Index(['title', 'year', 'bt_score', 'dubious', 'imdbid', 'tmdbId', 'genres',
       'popularity', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'vote_average', 'vote_count', 'cast', 'crew',
       'budget', 'cast_gender', 'crew_gender', 'cast_female_representation',
       'crew_female_representation', 'bt_pass', 'genres__Romance',
       'genres__Family', 'genres__Drama', 'genres__TV Movie',
       'genres__Documentary', 'genres__Horror', 'genres__Crime', 'genres__War',
       'genres__History', 'genres__Western', 'genres__Animation',
       'genres__Action', 'genres__Music', 'genres__Adventure',
       'genres__Science Fiction', 'genres__Thriller', 'genres__Mystery',
       'genres__Comedy', 'genres__Fantasy', 'pronouns_in_title',
       'revenue__log', 'popularity__log', 'budget__log'],
      dtype='object')
(2182, 45)
Index(['title', 'year', 'bt_score', 'dubious', 'imdbid', 'tmdbId', 'genres',
       'popularity', 'production_companies

# Train with Pipeline

In [8]:
# Define Pipeline
# Chains the same three preprocessing transformers used in the manual run, then
# drops the target, the identifier and the unused/raw columns before the forest.
# NOTE(review): the model is trained on whatever columns the "drop" step leaves,
# presumably in frame order rather than `X_cols` order - confirm if exact parity
# with the manual fit is required.
pipeline = Pipeline(
    steps=[
        ("genre_tagging", OneHotEncodeFromTags("genres")),
        ("pronoun_matching", FlagIfStrContains("title", "pronouns_in_title",
                                            tokens_to_match=female_pronouns)),
        ("log", LogAfterZeroReplacement(["revenue", "popularity", "budget"], 
                                        zero_replacement=1)),
        ("drop", ColumnDropper([
             y_col,      # target column must not reach the model as a feature
             "title",
             "bt_score",
             "dubious",
             key_col,    # row identifier ("imdbid"); uses the constant defined next to y_col
             "tmdbId",
             "genres",
             "popularity",
             "production_companies",
             "production_countries",
             "release_date",
             "revenue",
             "cast",
             "crew",
             "budget",
             "cast_gender",
             "crew_gender",
        ])),
        ("model", RandomForestClassifier(random_state=RANDOM_SEED))
    ]
)

In [9]:
# Fit every pipeline step on a copy of the training frame; the target is passed
# separately as y.
# NOTE(review): the pipeline does its own preprocessing, so `train` should hold
# raw (untransformed) rows at this point - confirm no earlier cell overwrote it.
pipeline.fit(X=train.copy(), y=train[y_col])

In [10]:
# Predict
# Each pipeline call receives its own copy of `test`.
y_true = test[y_col]
preds_bin = pipeline.predict_proba(test.copy())[:, 1]  # column 1 of the class probabilities
labels_bin = pipeline.predict(test.copy())

# Evaluate
# Score-based metrics first, then label-based metrics.
auc = roc_auc_score(y_true, preds_bin, average="weighted")
avg_precision = average_precision_score(y_true, preds_bin)
precision = precision_score(y_true, labels_bin, average="weighted")
recall = recall_score(y_true, labels_bin, average="weighted")

print("AUC: {0:.3f}".format(auc))
print("Average Precision: {0:.3f}".format(avg_precision))
print("Precision: {0:.3f}".format(precision))
print("Recall: {0:.3f}".format(recall))

AUC: 0.761
Average Precision: 0.842
Precision: 0.708
Recall: 0.720


In [11]:
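# TODO: why are the numbers slightly off?
# Hypothesis (unverified): the manual fit selects features in `X_cols` order, while the
# pipeline trains on whatever columns remain after its "drop" step, in frame order.
# With a fixed random_state, a random forest can give different results when the same
# features arrive in a different column order.
# To check: compare the column order each model was trained on, and confirm that the
# pipeline is fed raw (untransformed) rows.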