# Importances and Coefficients - Core

Nena Esaw

In [None]:
# Run the following on your local computer to check the Python and sklearn versions
import sklearn
# NOTE: the leading `!` is IPython/Jupyter shell syntax, so this line only runs inside a notebook
!python --version
print(f"sklearn version: {sklearn.__version__}")



In [None]:
## Core data & serialization libraries
import joblib
import numpy as np
import pandas as pd
## Preprocessing tools
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
## Models & evaluation metrics
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
## seed NumPy's global random generator so repeated runs give the same results
SEED = 321
np.random.seed(SEED)
## let pandas show up to 50 columns when displaying a DataFrame
pd.set_option('display.max_columns', 50)


### Load in the Data

In [None]:
# Load the sales CSV into a DataFrame and preview its first rows
df = pd.read_csv("Data/sales_predictions_2023.csv")
df.head()

### Preprocessing

In [None]:
## Map each inconsistent spelling of a fat-content label to one canonical label
fat_content_map = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_map)
## Verify the category labels that remain after the replacement
df['Item_Fat_Content'].value_counts()


In [None]:
## Separate the target from the features
y = df['Item_Outlet_Sales'].copy()
## Drop the target and the unwanted/inappropriate columns from the features.
## X is built from df here: the earlier `X = X.drop(...)` read X before it
## had ever been assigned.
bad_cols = ['Outlet_Identifier','Outlet_Establishment_Year']
X = df.drop(columns=['Item_Outlet_Sales', *bad_cols])
## Perform a train-test-split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

#### Making a Preprocessing Pipeline

In [None]:
## Create categorical pipeline
cat_selector = make_column_selector(dtype_include='object')
# impute missing categories with the most frequent value, then one-hot encode
impute_most_freq = SimpleImputer(strategy='most_frequent')
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new name
# first and fall back to the old one on older installs.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
cat_pipe = make_pipeline(impute_most_freq,encoder)
## Create numeric pipeline
num_selector = make_column_selector(dtype_include='number')
# impute missing numbers with the column mean, then standardize
impute_mean = SimpleImputer(strategy='mean')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_mean, scaler)
## Combine into 1 column transformer
## (verbose_feature_names_out=False keeps the output names free of step prefixes)
preprocessor = make_column_transformer( (cat_pipe,cat_selector),
                                       (num_pipe,num_selector),
                                      verbose_feature_names_out=False)
preprocessor

In [None]:
## Fit the column transformer on the training features, then collect the
## output column names it produces (fit returns the fitted transformer itself)
feature_names = preprocessor.fit(X_train).get_feature_names_out()
feature_names

### Project 1 Revisited - Part 1: Remaking, Saving, and Explaining Your Models

* Remake your X_train and X_test as DataFrames with the feature names extracted from the column transformer instead of combining your preprocessor and model into 1 pipeline.

In [None]:
# Transform the training features, then wrap the result in a DataFrame that
# keeps X_train's row index and uses the extracted feature names as columns
X_train_tf = preprocessor.transform(X_train)
X_train_df = pd.DataFrame(X_train_tf, index=X_train.index,
                          columns=feature_names)
X_train_df.head(3)

In [None]:
# Transform the test features, then wrap the result in a DataFrame that
# keeps X_test's row index and uses the extracted feature names as columns
X_test_tf = preprocessor.transform(X_test)
X_test_df = pd.DataFrame(X_test_tf, index=X_test.index,
                         columns=feature_names)
X_test_df.head(3)

In [None]:
## Show the first 3 rows of y_test so their index labels can be compared
## against the X_test_df preview above
y_test.head(3)


Custom Function 

In [None]:
def evaluate_regression(model, X_train, y_train, X_test, y_test):
    """Print R-squared and RMSE of a fitted regression model on both splits.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_train, y_train : training features and target
    X_test, y_test : test features and target

    Returns
    -------
    None
        One tab-separated line per split is printed (training first).

    Notes
    -----
    RMSE is taken as the square root of ``mean_squared_error`` instead of
    passing ``squared=False``: that keyword was deprecated in scikit-learn
    1.4 and removed in 1.6. For a single target column both give the same
    value.
    """
    splits = (("Training Data", X_train, y_train),
              ("Test Data", X_test, y_test))
    for label, X_split, y_split in splits:
        y_pred = model.predict(X_split)
        r2 = metrics.r2_score(y_split, y_pred)
        rmse = metrics.mean_squared_error(y_split, y_pred) ** 0.5
        print(f"{label}:\tR^2= {r2:.2f}\tRMSE= {rmse:.2f}")

### LinearRegression

   * Fit and evaluate your LinearRegression model using your dataframe X_train and X_test data.
   
   * Extract and visualize the coefficients that your model determined.
   
        * Select the top 3 most impactful features and interpret their coefficients in plain English.
        
   * Save your figure as a .png file inside your repository (you will need this for the final piece of this assignment - Update Your README).

#### Fitting a linear regression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit an ordinary least-squares model on the processed training data
# (fit returns the fitted estimator), then report R^2 / RMSE on both splits
lin_reg = LinearRegression().fit(X_train_df, y_train)
evaluate_regression(lin_reg, X_train_df, y_train, X_test_df, y_test)

In [None]:
# Raw coefficient array of the fitted linear model (no feature labels yet)
lin_reg.coef_

In [None]:
## Save the coefficients as a Series labelled with the feature names
coeffs = pd.Series(lin_reg.coef_, index= feature_names)
coeffs

In [None]:
# use .loc to add the intercept to the series under the label 'intercept'
coeffs.loc['intercept'] = lin_reg.intercept_
coeffs


In [None]:
# Display floats with thousands separators and 2 decimals; this changes how
# pandas prints numbers, not the stored values
pd.set_option('display.float_format', lambda x: f"{x:,.2f}")
coeffs

In [None]:
def annotate_hbars(ax, ha='left', va='center', size=12, xytext=(4,0),
                  textcoords='offset points'):
    """Label every bar in ``ax.patches`` with its width, to 3 decimals.

    Each label is anchored at the bar's vertical midpoint. Bars with a
    negative width are anchored at x=0 instead of at their own width. The
    remaining arguments (``ha``, ``va``, ``size``, ``xytext``,
    ``textcoords``) are passed straight through to ``ax.annotate``.
    """
    for patch in ax.patches:
        # vertical midpoint of this bar
        y_mid = patch.get_y() + patch.get_height() / 2
        # the bar's width is the value being labelled
        width = patch.get_width()
        # negative bars get their label anchored at x=0
        x_anchor = 0 if width < 0 else width
        ax.annotate(f"{width:.3f}", (x_anchor, y_mid),
                    ha=ha, va=va, size=size,
                    xytext=xytext, textcoords=textcoords)


In [None]:
def plot_coeffs(coeffs, top_n=None, figsize=(4,5), 
                intercept=False, intercept_name="intercept", 
                annotate=False, ha='left', va='center', size=12, 
                xytext=(4,0), textcoords='offset points'):
    """Plot coefficients from a Series as a horizontal bar chart.

    Parameters
    ----------
    coeffs : pandas.Series
        Coefficient values indexed by feature name.
    top_n : int or None
        If None, every coefficient is plotted. Otherwise only the ``top_n``
        coefficients with the largest absolute value are plotted.
    figsize : tuple
        Figure size forwarded to ``Series.plot``.
    intercept : bool
        When falsy, the entry labelled ``intercept_name`` (if present) is
        dropped before plotting.
    intercept_name : str
        Index label of the intercept entry.
    annotate : bool
        When truthy, each bar is labelled via ``annotate_hbars``.
    ha, va, size, xytext, textcoords
        Forwarded to ``annotate_hbars``.

    Returns
    -------
    The axes object returned by ``Series.plot``.
    """
    # Leave the intercept out of the chart unless it was explicitly requested
    if not intercept and intercept_name in coeffs.index:
        coeffs = coeffs.drop(intercept_name)
    if top_n is None:
        ## sort all features and set title
        ## NOTE(review): bars are ordered by signed value, not by absolute
        ## magnitude as the title wording suggests
        plot_vals = coeffs.sort_values()
        title = "All Coefficients - Ranked by Magnitude"
    else:
        ## pick the top_n features by absolute size of their coefficient
        largest = coeffs.abs().sort_values(ascending=False).head(top_n)
        ## plot their signed values in ascending order and set title
        plot_vals = coeffs.loc[largest.index].sort_values()
        title = f"Top {top_n} Largest Coefficients"
    ## plotting the selected coefficients
    ax = plot_vals.plot(kind='barh', figsize=figsize)
    ax.set(xlabel='Coefficient', 
            ylabel='Feature Names', 
            title=title)
    # vertical reference line at zero separates negative from positive bars
    ax.axvline(0, color='k')
    if annotate:
        annotate_hbars(ax, ha=ha, va=va, size=size, xytext=xytext, textcoords=textcoords)
    return ax


In [None]:
# Plot the 15 largest coefficients with value labels; the trailing `;`
# keeps the notebook from echoing the returned axes object
plot_coeffs(coeffs, top_n=15, annotate=True);

Select the top 3 most impactful features and interpret their coefficients in plain English

* 
*
*

### Tree-Based Model

   * Fit and evaluate your tree-based regression model using your dataframe X_train and X_test data.
   
   * Extract and visualize the feature importances that your model determined.
   
        * Identify the top 5 most important features.
       
   * Save your figure as a .png file inside your repository  (you will need this for the final piece of this assignment - Update Your README).

#### RandomForestRegressor

In [None]:
# Fit a random forest on the processed training data (seeded with SEED;
# fit returns the fitted estimator), then report R^2 / RMSE on both splits
reg = RandomForestRegressor(random_state=SEED).fit(X_train_df, y_train)
evaluate_regression(reg, X_train_df, y_train, X_test_df, y_test)

In [None]:
# Extract the fitted forest's raw feature importances (no feature labels yet)
reg.feature_importances_

In [None]:
### Saving the importances as a Pandas Series
# label each importance with its feature name so it can be sorted and plotted
importances = pd.Series(reg.feature_importances_, index=feature_names,
                       name = 'Feature Importance')
importances

In [None]:
# sort the feature importances from largest to smallest (ascending=False),
# so the most important features come first in the series
sorted_importance = importances.sort_values(ascending=False)
sorted_importance

In [None]:
## Keep the 10 largest importances and plot them. sorted_importance is in
## descending order, so .tail(10) selected the 10 LEAST important features;
## nlargest picks the top 10 regardless of the series' order, and the
## ascending sort puts the largest bar at the top of the horizontal chart.
ax = sorted_importance.nlargest(10).sort_values().plot(kind='barh')

Identify the top 5 most important features

*
*
*
*
*

### Serialize Your Best Models with Joblib
   * Once you've finished updating and explaining your models, you must save the following key: value pairs as a dictionary in a joblib file named "best-models.joblib":
   
        * "preprocessor": your preprocessing  column transformer
        
        * "X_train": your training features.
        
        * "X_test": your test features.
        
        * "y_train": your training target.
        
        * "y_test": your test target.
        
        * "LinearRegression": your best linear regression
        
        * Your tree-based model's class name: your best tree-based model.
            * e.g. "RandomForestRegressor"/"DecisionTreeRegressor"
        
        * Save your joblib file inside your repository. (You will work with these models again in the next core assignment.)

In [None]:
## saving variables for next lesson/notebook
import joblib
## Dictionary of all of the variables to save for later. Each model is stored
## under its class name, as the assignment requires (the tree-based model was
## previously saved under 'RandomForest').
export = {'preprocessor': preprocessor,
          'X_train': X_train_df,
          'y_train': y_train,
          'X_test': X_test_df,
          'y_test': y_test,
          'LinearRegression': lin_reg,
          'RandomForestRegressor': reg}
joblib.dump(export, 'best-models.joblib')

