In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

##  Predicting Booster
This notebook builds a classification model that predicts whether or not a university mandates a booster. It reuses the preprocessing from the "Covid Model Creation" notebook.

First, I'll train a binary model that separates the universities that required the vaccine from those that didn't. Then, if the data I find are rich enough, I may try a multi-class approach with three classes: no mandate, a regular mandate, and a booster mandate.

In [4]:
# Load the pickled feature table and target for the booster model
# (presumably written by the "Covid Model Creation" notebook -- confirm).
# NOTE: unpickling can execute arbitrary code; only load files you produced or trust.
features_booster, target_booster = map(
    pd.read_pickle, ('features_booster.pkl', 'target_booster.pkl')
)

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# One-hot encode the categorical features; dropping the first level of each
# feature avoids perfectly collinear dummy columns.
categorical_preprocessor = OneHotEncoder(drop='first')

# Standardize the numeric features so they share a comparable scale,
# which makes them easier for sklearn models to handle.
numerical_preprocessor = StandardScaler()

In [7]:
from sklearn.compose import ColumnTransformer # splits the column, transforms each subset differently, then concatenates
from sklearn.model_selection import train_test_split

# Hold out a test split; the fixed random_state keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(features_booster, target_booster, random_state=42)

# Columns that get one-hot encoded; every remaining column is treated as numeric.
categorical_columns = ['ranking', 'Type', 'political_control_state', 'Region']

# Preserve the DataFrame's own column order. A set difference iterates in hash
# order, which for strings changes between kernel sessions, so the order of the
# scaled features (and the displayed pipeline) would not be reproducible.
numerical_columns = [col for col in features_booster.columns if col not in categorical_columns]

# Send each column group through its own transformer, then concatenate the results.
preprocessor = ColumnTransformer([('one-hot-encoder', categorical_preprocessor, categorical_columns),
                                  ('standard_scaler', numerical_preprocessor, numerical_columns)])

Now train the models, starting with logistic regression.

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
# Chain the shared column preprocessing with a default-configured logistic regression.
pipe_log = make_pipeline(
    preprocessor,
    LogisticRegression(),
)

# Fit on the training split; as the cell's last expression this also displays the fitted pipeline.
pipe_log.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(drop='first'),
                                                  ['ranking', 'Type',
                                                   'political_control_state',
                                                   'Region']),
                                                 ('standard_scaler',
                                                  StandardScaler(),
                                                  ['total_population',
                                                   '2020.student.size',
                                                   'county_vote_diff',
                                                   'announce_date',
                                                   'avg_community_level',
                                                   'median_income',
                                          

In [10]:
# Mean accuracy of the fitted logistic-regression pipeline on the held-out test split.
pipe_log.score(X_test, y_test)

0.7142857142857143

In [11]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
def show_metrics_classification(model, X_test, y_test, average='binary'):
    """Print accuracy, F1 score and a per-class report for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``score``; ``score`` is reported as
        accuracy (which is what scikit-learn classifiers return).
    X_test : array-like
        Features to evaluate on.
    y_test : array-like
        True labels corresponding to ``X_test``.
    average : str or None, default 'binary'
        Forwarded to ``f1_score``. The default reproduces the binary F1 of the
        positive class; pass e.g. ``'macro'`` or ``'weighted'`` for targets
        with more than two classes, where ``'binary'`` raises a ValueError.

    Returns
    -------
    None
        The metrics are printed rather than returned.
    """
    y_pred = model.predict(X_test)
    print(f'accuracy: {model.score(X_test, y_test)}')
    # F1 gives equal weight to precision and recall.
    print(f'F1 Score: {f1_score(y_test, y_pred, average=average)}')
    print(classification_report(y_test, y_pred))

In [12]:
# Print accuracy, F1 and the per-class report for the logistic-regression pipeline on the test split.
show_metrics_classification(pipe_log, X_test, y_test)

accuracy: 0.7142857142857143
F1 Score: 0.375
              precision    recall  f1-score   support

           0       0.71      0.96      0.81        23
           1       0.75      0.25      0.38        12

    accuracy                           0.71        35
   macro avg       0.73      0.60      0.59        35
weighted avg       0.72      0.71      0.66        35



Notes:
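- Can use a stratified split (e.g. `train_test_split(..., stratify=target_booster)`) or stratified cross-validation to preserve the class ratios in the train and test datasets.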