In [1]:
# Turn on IPython's 'greedy' tab-completion option. This only configures the completer used while typing in the notebook.
%config IPCompleter.greedy=True

# Package Imports and Load Dataset
The file 'sample_data.csv' comes from the Rdatasets collection (the FishEggs dataset from the Stat2Data package). A description of the data can be found here:  
https://vincentarelbundock.github.io/Rdatasets/doc/Stat2Data/FishEggs.html

In [2]:
# Import the Python package 'pandas'
# It is used to load, view, and manipulate the data as a table (a DataFrame)
import pandas
# The package 'statsmodels' is used further down to build the regression model
import statsmodels.api as sm

# Type your file name in the variable named 'filename' below
# The path is relative, so the file is looked up relative to the notebook's working directory
filename = 'sample_data.csv'

# Read the CSV file with pandas and store the resulting DataFrame in the variable named 'data'
data = pandas.read_csv(filename)

# Display only the first rows of the DataFrame (use data.tail() for the last rows) as a quick check that the file loaded as expected
# NOTE(review): the displayed table contains 'Unnamed: 0' columns, which suggests the CSV was saved with its index columns - confirm whether they are needed
data.head()

Unnamed: 0.1,Unnamed: 0,Age,PctDM,Month,Sept
0,0,7,37.35,Nov,0
1,1,8,38.05,Nov,0
2,2,8,37.45,Nov,0
3,3,9,38.95,Nov,0
4,4,9,37.9,Nov,0


# Set Variables
Record which variable is the outcome variable and which variables are the predictors you want to investigate.  
All of them must be numerical variables, not text.

In [3]:
# Column name of the outcome (dependent) variable in 'data'
outcome = 'PctDM'
# Column names of the candidate predictor (independent) variables in 'data'
predictors = ['Age', 'Sept']

In [4]:
# Build the model formula from every entry in 'predictors', e.g. 'PctDM~Age+Sept'.
# Joining the whole list (instead of indexing predictors[0] and predictors[1]) keeps this cell
# working when 'predictors' holds one variable or more than two.
formula = outcome + '~' + '+'.join(predictors)
print(formula)
# Fit an ordinary least squares (OLS) linear model described by the formula to the data
fit = sm.OLS.from_formula(formula, data).fit()
# Display the regression summary table (coefficients, p-values, R-squared, ...)
fit.summary()

PctDM~Age+Sept


0,1,2,3
Dep. Variable:,PctDM,R-squared:,0.43
Model:,OLS,Adj. R-squared:,0.394
Method:,Least Squares,F-statistic:,12.06
Date:,"Tue, 26 Oct 2021",Prob (F-statistic):,0.000125
Time:,16:42:11,Log-Likelihood:,-55.144
No. Observations:,35,AIC:,116.3
Df Residuals:,32,BIC:,121.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,39.5192,0.778,50.778,0.000,37.934,41.105
Age,-0.2287,0.063,-3.635,0.001,-0.357,-0.101
Sept,-1.5193,0.423,-3.588,0.001,-2.382,-0.657

0,1,2,3
Omnibus:,1.371,Durbin-Watson:,2.485
Prob(Omnibus):,0.504,Jarque-Bera (JB):,1.067
Skew:,-0.421,Prob(JB):,0.587
Kurtosis:,2.848,Cond. No.,45.7


In [54]:
# R-squared of the fitted model (the unrounded value behind the 'R-squared' entry of the summary table)
fit.rsquared

0.4298313140440605

In [55]:
# Fitted coefficients (intercept and one value per predictor) as a pandas Series.
# The original cell contained the incomplete expression 'fit.', which is a syntax error on re-run;
# the recorded output of this cell matches the coefficient column of the summary table.
fit.params

Intercept    39.519219
Age          -0.228703
Sept         -1.519290
dtype: float64

# Main Algorithms

In [46]:
def get_formula(outcome: str, predictors: list) -> str:
    '''
    Build a model formula string of the form 'outcome~predictor1+predictor2+...'.

    This is a function description block.
    If you type 'get_formula?' in a code cell and execute it, this description will pop up.
    You can tell users (or remind yourself) how to use your code with this simple documentation. I also like to track changes.
    
    Last Edited: 10/26/2021
    
    input
        outcome
            string name of outcome variable
        predictors
            list of string names of predictor variables
            (if empty, the intercept-only formula 'outcome~1' is returned)
            
    output
        formula
            string formula to be used in a model
    '''
    # Join the predictors with '+' so that no stray leading '+' ends up after the '~'.
    # With no predictors at all, fall back to '1' (intercept only) so the formula stays valid.
    right_hand_side = '+'.join(predictors) or '1'
    return outcome + '~' + right_hand_side


def find_important_predictors(data: pandas.DataFrame, outcome: str, predictors: list, importance_measure: str = 'pvalue', importance_threshold: float = 0.05) -> list:
    '''
    Last Edited: 10/26/2021
    
    So far this function finds important variables through the process of forward stepwise linear regression
    https://en.wikipedia.org/wiki/Stepwise_regression
    
    In every round, each remaining candidate predictor is added on its own to the model made of the
    predictors retained so far, and an OLS model is fit. The candidate with the best importance value
    is retained if it passes the importance_threshold; otherwise the search stops and that candidate
    is NOT retained.
    
    Important variables are determined by their 'importance_measure.' The following importance measures are supported:
        'pvalue'
            p-value of the candidate's coefficient; smaller is more important, and a candidate
            passes when its p-value is less than or equal to importance_threshold
    
    input
        data
            pandas DataFrame containing the data
        outcome
            string name of outcome variable
        predictors
            list of string names of predictor variables (this list is not modified)
        importance_measure
            string name of importance measure to use
        importance_threshold
            float threshold used to determine if a variable is important or not
            
    output
        list of string names of important variables, in the order they were added to the model
        (empty if no predictor passes the threshold)
        
    raises
        ValueError
            if importance_measure is not one of the supported measures
    '''
    # Import the statsmodels package to create models
    import statsmodels.api as sm
    
    # Fail early with a clear message instead of an obscure error deep inside the loop
    supported_measures = ('pvalue',)
    if importance_measure not in supported_measures:
        raise ValueError(f'Unsupported importance_measure {importance_measure!r}; supported measures: {supported_measures}')
    
    # We want to track which predictor variables we have left to test (a copy, so the caller's list is untouched)
    predictors_to_test = list(predictors)
    # and which predictors are deemed as important and retained in the model
    important_predictors = []
    # We can also track which round we are on
    round_number = 0
    
    # Keep looking for more important variables until the list of predictors_to_test is empty
    while predictors_to_test:
        # Print which round we are on
        round_number += 1
        print(f'Round: {round_number}')
        
        # Importance measure found for each candidate predictor in this round
        importance_values = {}

        # Now add each predictor in predictors_to_test to the model and test its importance
        for predictor in predictors_to_test:
            print(f'Currently Testing Predictor: {predictor}')

            # Create function formula with one additional predictor
            formula = get_formula(outcome=outcome, predictors=important_predictors + [predictor])
            print(f'Current Formula: {formula}\n')
            
            # Fit a linear model with the above formula
            model_fit = sm.OLS.from_formula(formula=formula, data=data).fit()
            
            # Record the importance measure of the new predictor.
            # Position 0 is the intercept, followed by the retained predictors, then the candidate.
            # This assumes every predictor contributes exactly one coefficient (numerical columns).
            # .iloc makes the positional lookup explicit on the label-indexed Series.
            importance_values[predictor] = model_fit.pvalues.iloc[len(important_predictors) + 1]
        
        print('Importance Values Found:')
        print(importance_values)
        
        # Find the most important new predictor: for p-values, smaller is better
        # (on ties, the candidate that appears first in predictors_to_test wins)
        most_important_predictor = min(importance_values, key=importance_values.get)
        most_important_value = importance_values[most_important_predictor]
        
        # If the most important new predictor is not important enough (does not pass the importance_threshold),
        # then stop looking without retaining it
        if most_important_value > importance_threshold:
            print('No more important variables, exiting.')
            break
        
        # Otherwise, add it to the important_predictors, and remove it from predictors_to_test
        print(f'Most Important Predictor this Round: {most_important_predictor}\n')
        important_predictors.append(most_important_predictor)
        predictors_to_test.remove(most_important_predictor)
        
    return important_predictors

# Run

In [56]:
# Run the forward stepwise selection with the default importance measure ('pvalue') and threshold (0.05)
important_predictors = find_important_predictors(data=data, outcome=outcome, predictors=predictors)

Round: 1
Currently Testing Predictor: Age
Current Formula: PctDM~+Age

Currently Testing Predictor: Sept
Current Formula: PctDM~+Sept

Importance Values Found:
{'Age': 0.007001273350771227, 'Sept': 0.008018895566107404}
Most Important Predictor this Round: Age

Round: 2
Currently Testing Predictor: Sept
Current Formula: PctDM~+Age+Sept

Importance Values Found:
{'Sept': 0.0010962525075049604}
Most Important Predictor this Round: Sept



In [57]:
# Display the list of predictors returned by the selection above
important_predictors

['Age', 'Sept']