<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center><a href="https://medium.com/@dima806/ratings-for-900-free-coursera-courses-shap-values-for-providing-institutions-skills-and-more-4fb75105a21f">See my paper on Medium.com for more details</a></center></h1> 

<a id='top'></a>
<div class="list-group" id="list-tab" role="tablist">
    
<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center> Table of contents </center></h1>

### [**1. Importing libraries and loading data**](#section-load)

### [**2. Data transformation**](#section-transform)
    
### [**3. Machine learning**](#section-model)

### [**4. Explanations with SHAP values**](#section-explain)

</div>

<a id="section-load"></a>
<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center>Importing libraries and loading data</center></h1>

In [None]:
# Install feature_engine, which provides the RareLabelEncoder imported below (-q keeps pip quiet).
!pip install -q feature_engine

In [None]:
# import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import shap
import matplotlib.pyplot as plt
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from feature_engine.encoding import RareLabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import re

import warnings 
# Silence every warning for the rest of the session; note that this also hides
# deprecation notices emitted by pandas / scikit-learn.
warnings.filterwarnings("ignore")

# Allow up to 1000 rows to be shown when a frame or series is displayed.
pd.set_option('display.max_rows', 1000)

In [None]:
# Read the raw course table, report its size and preview five random rows
# (transposed so that each sampled course is shown as a column).
dataset_path = '/kaggle/input/coursera-free-courses-dataset/courserafree.csv'
df = pd.read_csv(dataset_path, index_col=False)
print(df.shape)
df.sample(n=5).transpose()

<a id="section-transform"></a>
<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center>Data transformation</center></h1>

In [None]:
# select main label
main_label = 'ratings'
# remove null ratings; take an explicit copy so the column assignments below
# modify this frame rather than a slice of the raw table
df = df[df[main_label].notnull()].copy()
# rename column names (strip non-breaking spaces around the headers)
df = df.rename(columns={c: c.strip('\xa0') for c in df.columns})
# expand level, type and duration, following
# https://www.kaggle.com/code/kazakow/data-visualization-coursera-with-seaborn
df = pd.concat([df, df['level type duration'].str.split(' · ', expand=True)], axis=1)
df = df.rename(columns={0: 'level', 1: 'type', 2: 'duration'})


def _parse_review_count(text):
    """Convert a review label such as '(1.2k reviews)' into a number.

    The first number found in the label is used; an attached 'k' or 'm'
    suffix multiplies it by 1e3 or 1e6. The label is parsed with a regular
    expression instead of eval(), so scraped text is never executed and
    variants such as '(1 review)' or '1,234 reviews' are handled too.

    Raises:
        ValueError: if the label contains no number at all.
    """
    match = re.search(r'(\d[\d,]*(?:\.\d+)?)([kKmM]?)', str(text))
    if match is None:
        raise ValueError(f"Cannot parse a review count from {text!r}")
    number = float(match.group(1).replace(',', ''))
    multiplier = {'': 1, 'k': 1000, 'm': 1000000}[match.group(2).lower()]
    return number * multiplier


# extract number of reviews (missing labels count as zero reviews);
# assign the result back instead of using a chained in-place fillna, which
# does not update the frame under pandas copy-on-write
df['reviews'] = df['reviews'].fillna('(0 reviews)').map(_parse_review_count)
# drop courses with < 20 reviews
df = df[df['reviews'] >= 20].copy()
# set up the rare label encoder limiting number of categories to max_n_categories
for col in ['institution', 'duration']:
    df[col] = df[col].fillna('None')
    # tol is a frequency, so 20/n_rows means "seen in at least 20 courses"
    encoder = RareLabelEncoder(n_categories=1, max_n_categories=50, replace_with='Other', tol=20.0/df.shape[0])
    df[col] = encoder.fit_transform(df[[col]])
# vectorize skills columns
df = df.rename(columns={'skills you will gain': 'skills'})
def vectorize_column(df, col_name, sep=', ', min_df=20):
    """Replace a delimited text column by per-token count columns.

    Every cell of ``df[col_name]`` is split on ``sep`` (missing cells are
    treated as the single entry ``'none'``). Each entry is trimmed and its
    punctuation/spaces are turned into underscores so that it forms one
    token, and the entries of a row are joined with spaces into a document.
    The documents are counted with ``CountVectorizer(min_df=min_df,
    lowercase=False)``.

    Returns:
        A new frame with a fresh 0..n-1 index in which ``col_name`` is
        dropped and one ``<col_name>_<token>`` count column is appended for
        every token kept in the vectorizer's vocabulary.
    """
    # The substitutions never produce a character that another substitution
    # consumes, so one translate pass equals replacing them one after another.
    substitutions = str.maketrans({
        '.': '_',
        "'": '_',
        ' ': '_',
        '&': '_and_',
        '-': '_',
        '(': '_',
        ')': '_',
    })

    def to_token(entry):
        return entry.rstrip(', ').strip(' ').translate(substitutions)

    entries_per_row = df[col_name].fillna('none').str.split(sep).to_list()
    # a missing cell becomes the one-entry row ['none'], which joins to 'none'
    documents = [' '.join(to_token(entry) for entry in entries) for entries in entries_per_row]

    vectorizer = CountVectorizer(min_df=min_df, lowercase=False)
    counts = vectorizer.fit_transform(documents)

    # vocabulary_ maps token -> matrix column position; invert it for renaming
    column_names = {position: col_name + '_' + token for token, position in vectorizer.vocabulary_.items()}
    counts_frame = pd.DataFrame(counts.toarray()).rename(columns=column_names)

    combined = pd.concat([df.reset_index(drop=True), counts_frame.reset_index(drop=True)], axis=1)
    return combined.drop([col_name], axis=1)
# turn each delimited text column (currently only the skills list) into count columns
text_columns = ['skills']
for col in text_columns:
    df = vectorize_column(df, col, sep=', ', min_df=20)
# drop unused columns
cols2drop = ['css-1qajodb', 'title', 'level type duration', 'reviews', 'url', 'price']
df = df.drop(columns=cols2drop)
print(df.shape)
df.sample(n=5).transpose()

In [None]:
# summary statistics of the target column after the filtering above
ratings_summary = df['ratings'].describe()
ratings_summary

<a id="section-model"></a>
<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center>Machine learning</center></h1>

In [None]:
# initialize data: target vector, feature frame and positions of the
# categorical (object-typed) columns inside the feature frame
y = df[main_label].values.reshape(-1,)
X = df.drop(columns=[main_label])
cat_cols = df.select_dtypes(include=['object']).columns
feature_names = list(X.columns)
cat_cols_idx = [feature_names.index(name) for name in cat_cols]
# 50/50 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# initialize Pool (CatBoost's data container; categorical columns are passed by position)
train_pool = Pool(X_train,
                  y_train,
                  cat_features=cat_cols_idx)
test_pool = Pool(X_test,
                 y_test,
                 cat_features=cat_cols_idx)
# specify the training parameters
model = CatBoostRegressor(iterations=500,
                          depth=5,
                          verbose=0,
                          learning_rate=0.02,
                          loss_function='RMSE')
# train the model
model.fit(train_pool)
# make the prediction using the resulting model
y_train_pred = model.predict(train_pool)
y_test_pred = model.predict(test_pool)

# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn
# releases, whereas this form works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print(f"RMSE score for train {round(rmse_train,3)} points, and for test {round(rmse_test,3)} points")

In [None]:
# Baseline scores (assuming the same prediction for all data samples).
# The constant is the mean of the TRAIN targets for both splits, so the test
# baseline does not peek at the test labels.
baseline_prediction = np.mean(y_train)
# sqrt(MSE) is used instead of `squared=False`, which is deprecated and
# removed in recent scikit-learn releases.
rmse_bs_train = np.sqrt(mean_squared_error(y_train, np.full(len(y_train), baseline_prediction)))
rmse_bs_test = np.sqrt(mean_squared_error(y_test, np.full(len(y_test), baseline_prediction)))
print(f"RMSE baseline score for train {round(rmse_bs_train,3)} points, and for test {round(rmse_bs_test,3)} points")

<a id="section-explain"></a>
<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center>Explanations with SHAP values</center></h1>

In [None]:
# Set up SHAP's notebook JavaScript support for its interactive plots.
shap.initjs()
# Build a tree explainer for the fitted CatBoost model and compute SHAP values
# for the test features; `ex` and `shap_values` are reused by the cells below.
ex = shap.TreeExplainer(model)
shap_values = ex.shap_values(X_test)
# Summary plot of the per-feature SHAP values over the test rows.
shap.summary_plot(shap_values, X_test)

In [None]:
# Base value of the explainer, reported as the average predicted rating and
# compared with the mean of the actual test ratings.
# NOTE(review): presumably a scalar for this single-output regressor, since it is passed to round() — confirm.
expected_values = ex.expected_value
print(f"Average predicted rating is {round(expected_values,3)} points")
print(f"Average actual rating is {round(np.mean(y_test),3)} points")

In [None]:
def show_shap(col):
    """Estimate the SHAP gain of a 0/1 indicator column on the test set.

    Reads the notebook-level ``X_test`` and ``shap_values``. The SHAP values
    of ``col`` are grouped by the value of the column and the group with
    value 1 is compared against the group with value 0.

    Args:
        col: name of a column of ``X_test`` that takes the values 0 and 1.

    Returns:
        ``(gain, gain_std)`` rounded to 4 decimals, where ``gain`` is the
        difference between the mean SHAP value of the two groups and
        ``gain_std`` combines the two group standard deviations in quadrature.

    Raises:
        KeyError: if the value 0 or the value 1 never occurs in ``X_test[col]``.
    """
    column_position = X_test.columns.tolist().index(col)
    # Aggregate only the SHAP column: averaging the whole frame would also try
    # to average the text columns, which raises TypeError in pandas >= 2.0.
    shap_by_value = (
        X_test[[col]]
        .assign(shap_=shap_values[:, column_position])
        .groupby(col)['shap_']
        .agg(['mean', 'std'])
    )
    gain = shap_by_value.loc[1, 'mean'] - shap_by_value.loc[0, 'mean']
    gain_std = (shap_by_value.loc[1, 'std']**2 + shap_by_value.loc[0, 'std']**2)**0.5
    return round(gain, 4), round(gain_std, 4)
def show_variants(start_str):
    """Plot and tabulate the SHAP gain of every ``<start_str>_*`` indicator column.

    Reads the notebook-level ``X_test`` and calls ``show_shap(col)`` for each
    column whose name starts with ``start_str + '_'``, expecting a
    ``(gain, gain_std)`` pair back.

    Columns for which the gain cannot be computed are skipped and listed in a
    printed note rather than being dropped silently.

    Args:
        start_str: column-name prefix (without the trailing underscore).

    Returns:
        A frame indexed by the column name without its prefix, with columns
        ``gain`` and ``gain_std``, sorted by descending gain.
    """
    prefix = start_str + '_'
    records = []
    skipped = {}
    for col in X_test.columns:
        if not col.startswith(prefix):
            continue
        try:
            gain, gain_std = show_shap(col)
        except (KeyError, TypeError) as err:
            # KeyError: the column lacks one of the two groups (0 or 1) in the test set.
            # TypeError: the helper could not aggregate or did not return a pair.
            skipped[col] = f'{type(err).__name__}: {err}'
            continue
        # strip only the leading prefix; str.replace would also remove later
        # occurrences of the prefix inside the name
        records.append({'col': col[len(prefix):], 'gain': gain, 'gain_std': gain_std})

    if skipped:
        print(f'Skipped {len(skipped)} column(s) for {start_str}:')
        for name, reason in skipped.items():
            print(f'  {name}: {reason}')

    df_res = (
        pd.DataFrame(records, columns=['col', 'gain', 'gain_std'])
        .sort_values('gain', ascending=False)
        .set_index('col')
    )
    if df_res.empty:
        print(f'No SHAP gains could be computed for columns starting with {prefix!r}')
        return df_res

    fig, ax = plt.subplots(figsize=(15, 10))
    ax.errorbar(df_res.index, df_res['gain'], yerr=df_res['gain_std'], fmt="o", color="r")
    ax.bar(x=df_res.index, height=df_res['gain'])
    ax.set_title(f'SHAP values for {start_str}, rating')
    ax.set_ylabel('points')
    ax.tick_params(axis="x", rotation=90)
    plt.show()
    return df_res

In [None]:
show_variants(start_str='skills')

In [None]:
def show_shap(col, shap_values=shap_values, label=main_label, X_test=X_test, ylabel='points'):
    """Plot and print the SHAP values of one feature grouped by the feature's value.

    NOTE: this definition replaces the earlier one-argument ``show_shap`` of
    this notebook once the cell has run. The defaults for ``shap_values``,
    ``label`` and ``X_test`` are bound to the notebook-level objects that
    exist when this cell is executed.

    Args:
        col: name of the column of ``X_test`` to analyse.
        shap_values: array of SHAP values indexed as ``[row, column position]``.
        label: name of the target, used in the plot title only.
        X_test: feature frame the SHAP values were computed for.
        ylabel: y-axis label of the plot.

    Returns:
        None. The per-value table (``gain`` = mean SHAP value, ``gain_std`` =
        its standard deviation, both rounded to 4 decimals, and ``count`` =
        number of rows), sorted by descending gain, is printed below the plot.
    """
    column_position = X_test.columns.tolist().index(col)
    # Aggregate only the SHAP column: averaging the whole frame would also try
    # to average the other text columns, which raises TypeError in pandas >= 2.0.
    shap_by_value = (
        X_test[[col]]
        .assign(shap_=shap_values[:, column_position])
        .groupby(col)['shap_']
        .agg(['mean', 'std', 'count'])
    )
    df_res = (
        shap_by_value
        .rename(columns={'mean': 'gain', 'std': 'gain_std'})
        .round({'gain': 4, 'gain_std': 4})
        .sort_values('gain', ascending=False)
        .rename_axis('col')
    )

    fig, ax = plt.subplots(figsize=(12, 9))
    ax.errorbar(df_res.index, df_res['gain'], yerr=df_res['gain_std'], fmt="o", color="r")
    ax.set_title(f'SHAP values for column {col}, label {label}')
    ax.set_ylabel(ylabel)
    ax.tick_params(axis="x", rotation=90)
    plt.show()
    print(df_res)
    return

# Show the grouped SHAP summary for every feature that is not one of the
# vectorized skills columns (those are named 'skills_<token>').
non_skill_columns = [name for name in X_test.columns if name.split('_')[0] not in ['skills']]
for col in non_skill_columns:
    # column name framed by one blank line above and below
    print(f"\n{col}\n")
    show_shap(col, shap_values, label=main_label, X_test=X_test)

<h1 style='background:#afd873; border:10; border-radius: 25px; font-size:250%; font-weight: bold; color:black'><center><a href="https://medium.com/@dima806/ratings-for-900-free-coursera-courses-shap-values-for-providing-institutions-skills-and-more-4fb75105a21f">See my paper on Medium.com for more details</a></center></h1> 

In [None]:
nan