# Package Import and Load Dataset
The file 'sample_data.csv' is the FishEggs dataset from Rdatasets. A description of the data can be found here:  
https://vincentarelbundock.github.io/Rdatasets/doc/Stat2Data/FishEggs.html

In [1]:
# Import the Python package 'pandas'.
# It is used below to load the data file and to view the loaded data.
import pandas

# Type your file name in the variable named 'filename' below.
# Note: This file must be in the same folder as this program OR you can give the relative path from this program to your file.
filename = 'sample_data.csv'

# Read the CSV file with pandas and store the result in the variable named 'data'.
data = pandas.read_csv(filename)

# 'data' is a pandas DataFrame. DataFrames can show their head (first rows) or tail (last rows);
# as the last expression of a notebook cell, head() displays the first rows of the data.
data.head()

Unnamed: 0.1,Unnamed: 0,Age,PctDM,Month,Sept
0,0,7,37.35,Nov,0
1,1,8,38.05,Nov,0
2,2,8,37.45,Nov,0
3,3,9,38.95,Nov,0
4,4,9,37.9,Nov,0


# Main Algorithm(s)
Stepwise Regression: 
https://en.wikipedia.org/wiki/Stepwise_regression

In [2]:
def get_formula(outcome: str, predictors: list) -> str:
    '''
    Build a model formula string of the form 'outcome~predictor1+predictor2+...'.

    This is a function description block.
    If you type 'get_formula?' in a code cell and execute it, this description will pop up.
    You can tell users (or remind yourself) how to use your code with this simple documentation. I also like to track changes.
    
    Last Edited: 10/27/2021
    
    Input
        outcome
            string name of outcome variable
        predictors
            list (or any iterable) of string names of predictor variables
            
    Output
        formula
            string formula to be used in a model, e.g. 'y~a+b'
            if predictors is empty, the result is just outcome followed by '~'
    '''
    # str.join puts a '+' between the names only, so no special case is needed for the last predictor
    return outcome + '~' + '+'.join(predictors)


def find_important_predictors(data: pandas.DataFrame, outcome: str, predictors: list, importance_measure: str, importance_threshold: float=None):
    '''
    Last Edited: 10/27/2021
    
    Find important predictor variables through forward stepwise linear regression (OLS).
    https://en.wikipedia.org/wiki/Stepwise_regression
    
    Each round, every remaining predictor is added (one at a time) to the model of the predictors
    kept so far. The best candidate of the round is kept if it passes the importance_threshold;
    otherwise the search stops. Progress is printed as the rounds run.
    
    Supported Importance Measures w/Default Importance Thresholds:
        p-value:           0.05  p-value of the newly added predictor; lower is better,
                                 the predictor is kept unless its p-value is above the threshold
        adjusted_rsquared: 0     change in adjusted R-squared versus the baseline model; higher is better,
                                 the predictor is kept unless the change is below the threshold
        aic:               0     change in AIC versus the baseline model; lower is better,
                                 the predictor is kept unless the change is above the threshold
        bic:               0     change in BIC versus the baseline model; lower is better (same rule as aic)
        mse:               0     change in residual mean squared error versus the baseline model;
                                 lower is better (same rule as aic)
    
    The baseline model is the model with the predictors kept so far (an intercept-only model in round 1).
    
    Input
        data
            pandas DataFrame containing the data
        outcome
            string name of outcome variable
        predictors
            list of string names of predictor variables (this list is not modified)
        importance_measure
            string name of importance measure to use
        importance_threshold
            None or float threshold used to determine if a variable is important or not
            if None, then algorithm will use default threshold based on importance_measure
            
    Output
        list of string names of important variables, in the order they were selected
        
    Raises
        KeyError if importance_measure is not one of the supported measures
    '''
    
    #-----------------------------------------SETUP-------------------------------------
    
    # For every supported measure: (default threshold, side of the threshold on which a variable is important)
    measure_settings = {
        'p-value': (0.05, 'lower'),
        'adjusted_rsquared': (0, 'upper'),
        'aic': (0, 'lower'),
        'bic': (0, 'lower'),
        'mse': (0, 'lower'),
    }
    # Fail early, with a readable message, before any model is fit
    if importance_measure not in measure_settings:
        raise KeyError(f'Importance measure {importance_measure!r} is not supported. Supported measures: {list(measure_settings)}')
    default_threshold, keep_side = measure_settings[importance_measure]
    if importance_threshold is None:
        importance_threshold = default_threshold
    
    # Name of the fitted-model attribute that holds each model-level measure.
    # 'mse' reads mse_resid (residual mean square): it is the one that shrinks as the fit improves,
    # whereas mse_model is the explained mean square.
    model_attributes = {'adjusted_rsquared': 'rsquared_adj', 'aic': 'aic', 'bic': 'bic', 'mse': 'mse_resid'}
    
    # Import the statsmodels package to create models
    from statsmodels.api import OLS
    
    # We want to track which predictor variables we have left to test (a copy, so the caller's list is untouched)
    predictors_to_test = list(predictors)
    # and which predictors are deemed as important and retained in the model
    important_predictors = []
    # We can also track which round we are on
    round_number = 0
    
    #-----------------------------------------MAIN ALGORITHM-------------------------------------
    # Keep looking for more important variables until the list of predictors_to_test is empty
    while predictors_to_test:
        # Print which round we are on
        round_number += 1
        print(f'Round: {round_number}')
        
        # Find the baseline value of the measure for the model before adding this round's predictors.
        # A p-value belongs to a single predictor, so it has no baseline and no baseline model is fit.
        baseline_value = None
        if importance_measure != 'p-value':
            if important_predictors:
                baseline_formula = get_formula(outcome=outcome, predictors=important_predictors)
            else:
                # Round 1: nothing has been kept yet, so compare against the intercept-only model
                baseline_formula = outcome + '~1'
            baseline_model = OLS.from_formula(formula=baseline_formula, data=data).fit()
            baseline_value = float(getattr(baseline_model, model_attributes[importance_measure]))
        print(f'Baseline {importance_measure} Value: {baseline_value}\n')

        # Now add each predictor in predictors_to_test to the model and test its importance
        importance_values = {}
        for predictor in predictors_to_test:
            print(f'Currently Testing Predictor: {predictor}')

            # Create function formula with one additional predictor
            formula = get_formula(outcome=outcome, predictors=important_predictors + [predictor])
            print(f'Current Formula: {formula}\n')
            
            # Fit a linear model with the above formula
            current_model = OLS.from_formula(formula=formula, data=data).fit()
            
            # Record the importance measure of the new predictor
            if importance_measure == 'p-value':
                # The new predictor is the last term of the formula, so its coefficient comes last.
                # NOTE: this assumes each predictor adds exactly one coefficient (numerical predictors).
                importance_values[predictor] = float(current_model.pvalues.iloc[-1])
            else:
                # Change in the measure caused by adding this predictor to the baseline model
                current_value = float(getattr(current_model, model_attributes[importance_measure]))
                importance_values[predictor] = current_value - baseline_value
        
        print(f'{importance_measure} Values Found:')
        print(importance_values)
        
        # Find the most important new predictor of this round (on ties, the first one tested wins)
        # and check whether it fails the importance_threshold
        if keep_side == 'lower':
            most_important_predictor = min(importance_values, key=importance_values.get)
            not_important_enough = importance_values[most_important_predictor] > importance_threshold
        else:
            most_important_predictor = max(importance_values, key=importance_values.get)
            not_important_enough = importance_values[most_important_predictor] < importance_threshold
        
        # If the most important new predictor is not important enough, then stop looking
        if not_important_enough:
            print('No more important variables, exiting.')
            return important_predictors
            
        print(f'Most Important Predictor this Round: {most_important_predictor}\n')
        # Add most important predictor to the important_predictors
        important_predictors.append(most_important_predictor)
        # Remove most important predictor from predictors_to_test
        predictors_to_test.remove(most_important_predictor)
        
    return important_predictors

# Set Parameters and Run
Record which variable is the outcome variable and which are the predictor variables you want to investigate.
All of these must be numerical variables, not text.

In [3]:
# Set Parameters
# outcome: name of the column to model; predictors: names of the candidate columns to test
outcome = 'PctDM'
predictors = ['Age', 'Sept']
# importance_measure: how candidate predictors are judged; importance_threshold: cutoff for keeping one
importance_measure = 'p-value'
importance_threshold = 0.05

# Run Algorithm
# importance_threshold is passed explicitly so that changing it above actually changes the run
important_predictors = find_important_predictors(
    data = data, 
    outcome = outcome, 
    predictors = predictors, 
    importance_measure = importance_measure,
    importance_threshold = importance_threshold
)

Round: 1
Baseline p-value Value: None

Currently Testing Predictor: Age
Current Formula: PctDM~Age

Currently Testing Predictor: Sept
Current Formula: PctDM~Sept

p-value Values Found:
{'Age': 0.007001273350771227, 'Sept': 0.008018895566107404}
Most Important Predictor this Round: Age

Round: 2
Baseline p-value Value: None

Currently Testing Predictor: Sept
Current Formula: PctDM~Age+Sept

p-value Values Found:
{'Sept': 0.0010962525075049604}
Most Important Predictor this Round: Sept



In [4]:
# Display the list of predictor names returned by find_important_predictors in the previous cell
important_predictors

['Age', 'Sept']