In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Load Data

In [3]:
from pathlib import Path
import pandas as pd
import joblib

In [4]:

data_folder_path = Path("./gdrive/MyDrive/Colab Notebooks/MO436/data")
new_format_data_path = Path("./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20-new-format/")
old_format_data_path = Path("./gdrive/MyDrive/Colab Notebooks/MO436/data/Twibot-20-old-format/ETL_Twi20")
embeddings_data_folder_path = data_folder_path/"text_embeddings"


In [5]:


graph_embs_names = joblib.load(data_folder_path/"graph_emb_names.pkl")
numerical_features = joblib.load(data_folder_path/"numerical_features_names.pkl")
categorical_features = joblib.load(data_folder_path/"categorical_features_names.pkl")
tweet_emb_names = joblib.load(data_folder_path/"tweet_emb_names.pkl")
description_emb_names = joblib.load(data_folder_path/"description_emb_names.pkl")
name_emb_names = joblib.load(data_folder_path/"profile_name_emb_names.pkl")
screen_name_emb_names = joblib.load(data_folder_path/"screen_name_emb_names.pkl")

raw_text_embs_names = tweet_emb_names + description_emb_names + name_emb_names + screen_name_emb_names

tsvd_name_embs = [f"tsvd_name_embs{i}" for i in range(20)]
tsvd_screen_name_embs = [f"tsvd_screen_name_embs{i}" for i in range(20)]
tsvd_tweet_embs = [f"tsvd_tweets_embs{i}" for i in range(20)]
tsvd_description_embs = [f"tsvd_description_embs{i}" for i in range(20)]
all_tsvd_embs = tsvd_name_embs + tsvd_screen_name_embs + tsvd_tweet_embs + tsvd_description_embs


ssvd_name_embs = [f"ssvd_name_embs{i}" for i in range(20)]
ssvd_screen_name_embs = [f"ssvd_screen_name_embs{i}" for i in range(20)]
ssvd_tweet_embs = [f"ssvd_tweets_embs{i}" for i in range(20)]
ssvd_description_embs = [f"ssvd_description_embs{i}" for i in range(20)]
all_ssvd_embs = ssvd_name_embs + ssvd_screen_name_embs + ssvd_tweet_embs + ssvd_description_embs

In [6]:
profile_features = numerical_features + categorical_features
graph_features = graph_embs_names

all_features_ssvd = profile_features + graph_features + all_ssvd_embs
all_features_tsvd = profile_features + graph_features + all_tsvd_embs
all_features_raw = profile_features + graph_features + raw_text_embs_names

In [24]:
for i in numerical_features:
  print(f"-{i}")


-followers_follow_proportion
-tenure
-tweet_time_density
-follower_time_density
-followers_count
-friends_count
-listed_count
-favourites_count
-statuses_count
-tweet_count
-following_count
-has_location


In [25]:
for i in categorical_features:
  print(f"-{i}")


-protected
-geo_enabled
-verified
-lang
-contributors_enabled
-is_translator
-is_translation_enabled
-profile_background_tile
-profile_use_background_image
-has_extended_profile
-default_profile
-default_profile_image


In [12]:
df = pd.read_parquet(data_folder_path/"preprocessed_profile_and_text_features.parquet")


In [8]:

# fs_svd = pd.read_parquet(data_folder_path/"feature_selection_tsvd_logs.parquet")
# fs_supervised_svd = pd.read_parquet(data_folder_path/"feature_selection_ssvd_logs.parquet")

# Modelling

In [7]:
train_mask = lambda d: (d["split"] == "train") & (d["random_number"] >= 0.2)
test_mask = lambda d: d["split"] == "test"
support_mask = lambda d: d["split"] == "support"
val_mask = lambda d: (d["split"] == "train") & (d["random_number"] < 0.2)

target = "label"


In [8]:
!pip install shap -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/532.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/532.9 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m532.5/532.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
from sklearn.base import clone
import numpy as np
import pandas as pd
import shap
from tqdm import tqdm

from toolz import curry
from sklearn.metrics import roc_auc_score

@curry
def fast_metric_with_ci_(data, *, n_samples=100, ci_level=0.95,
                     prediction='prediction', target='target', weight='weight', metric_fn = roc_auc_score):

    data = data.assign(weight__=lambda df: df[weight] if weight is not None else 1)
    summary = (
        data
        .assign(
            prediction=lambda df: (1000 * df[prediction]).round(),
        )
        .groupby(["weight__", 'prediction', target])
        .size().to_frame("sample_size")
        .reset_index()
    )

    estimate = (
        summary
        .assign(weight__=lambda df: df["weight__"] * df['sample_size'])
        .pipe(lambda df: metric_fn(df[target], df['prediction'], sample_weight=df['weight__']))
    )

    bs_values = [
        summary
        .assign(weight__=lambda df: df["weight__"] * np.random.poisson(df['sample_size']))
        .pipe(lambda df: metric_fn(df[target], df['prediction'], sample_weight=df['weight__']))
    for _ in range(n_samples)]

    lo, hi = bootstrap_ci(estimate, bs_values, ci_level=ci_level)

    return pd.Series(dict(
        estimate=estimate,
        ci_upper=hi,
        ci_lower=lo,
        model=prediction
    ))


def bootstrap_ci(sample_estimate, bootstrap_estimates, ci_level=0.95):
    lo = 2 * sample_estimate - np.quantile(bootstrap_estimates, (1 + ci_level) / 2)
    hi = 2 * sample_estimate - np.quantile(bootstrap_estimates, (1 - ci_level) / 2)
    return lo, hi



@curry
def fast_delta_metric_with_ci_(data, baseline, challenger, *, n_samples=100, ci_level=0.95,
                           target='target', weight='weight', metric_fn = roc_auc_score):

    data = data.assign(weight__=lambda df: df[weight] if weight is not None else 1)

    summary = (
        data
        .assign(**{
            baseline: lambda df: (1000 * df[baseline]).round(),
            challenger: lambda df: (1000 * df[challenger]).round(),
        })
        .groupby(["weight__", baseline, challenger, target])
        .size().to_frame("sample_size")
        .reset_index()
    )


    def delta_auc(df):
        challenger_auc = metric_fn(df[target], df[challenger], sample_weight=df['weight__'])
        baseline_auc = metric_fn(df[target], df[baseline], sample_weight=df['weight__'])
        return challenger_auc - baseline_auc

    estimate = (
        summary
        .assign(weight__=lambda df: df["weight__"] * df['sample_size'])
        .pipe(delta_auc)
    )

    bs_values = [
        summary
        .assign(weight__=lambda df: df["weight__"] * np.random.poisson(df['sample_size']))
        .pipe(delta_auc)
    for _ in range(n_samples)]

    lo, hi = bootstrap_ci(estimate, bs_values, ci_level=ci_level)

    return pd.Series(dict(
        estimate=estimate,
        ci_upper=hi,
        ci_lower=lo,
        model=challenger
    ))


@curry
def fast_delta_metric_with_ci(data, baseline, challengers, target, *, n_samples=100, ci_level=0.95, weight='weight', metric_fn = roc_auc_score):

    fn = fast_delta_metric_with_ci_(
        baseline=baseline,
        n_samples=n_samples,
        ci_level=ci_level,
        target=target,
        weight=weight,
        metric_fn=metric_fn
      )

    all_values = [fn(data=data,challenger=c) for c in challengers]

    return pd.DataFrame(all_values)

@curry
def fast_metric_with_ci(data, predictions, target, *, n_samples=100, ci_level=0.95, weight='weight', metric_fn = roc_auc_score):

    fn = fast_metric_with_ci_(
        target=target,
        n_samples=n_samples,
        ci_level=ci_level,
        weight=weight,
        metric_fn=metric_fn
      )

    all_values = [fn(data=data,prediction=p) for p in predictions]

    return pd.DataFrame(all_values)

def log_odds_to_proba(x):
  return 1/(1+np.exp(-x))

def proba_to_log_odds(p):
  return np.log(p/(1-p))


def backwards_shap_feature_selection(
    model,
    df_train,
    df_val,
    candidate_features_for_removal,
    target,
    null_hypothesis = "feature_is_good",
    fixed_features=[],
    n_features_sample=None,
    extra_validation_sets = {},
    sample_weight=None,
    metric_fn = roc_auc_score,
    bootstrap_samples=20,
    ci_level=0.8,
    max_iter = 10,
    patience=0,
    max_removals_per_run=None
):

  """
  """
  #TODO: implement two null hypothesis strategies. currently only "all_features_are_good"


  #check key names
  valid_nulls = ["feature_is_good","feature_is_bad"]
  if not null_hypothesis in valid_nulls:
      raise(ValueError(f"null_hypothesis should be one of {valid_nulls}, got {null_hypothesis}"))

  keys_intersections = set(extra_validation_sets.keys()) & set(candidate_features_for_removal + fixed_features)
  if keys_intersections:
    raise ValueError(f"extra_validation_sets names should not match names of features. Found {keys_intersections}")

  keys_intersections = keys_intersections & set(["metric", "error-contribution"])
  if keys_intersections:
    raise ValueError(f"extra_validation_sets names or feature names should not be 'metric' or 'error-contribution'. Found {keys_intersections}")

  all_logs = []
  p=0
  for i in tqdm(range(max_iter)):

    #set all features
    all_features = candidate_features_for_removal + fixed_features

    if len(all_features) == 0:
      break

    if (n_features_sample is None) or (len(all_features) <= n_features_sample):
      features_to_use = all_features
    else:
      features_to_use = np.random.choice(all_features, n_features_sample, replace=False)

    run_logs = _backwards_shap_feature_selection(
        model=clone(model),
        df_train=df_train,
        df_val=df_val,
        all_features=features_to_use,
        extra_validation_sets=extra_validation_sets,
        target=target,
        sample_weight=sample_weight,
        metric_fn=metric_fn,
        bootstrap_samples=bootstrap_samples,
        ci_level=ci_level,
    )

    if null_hypothesis == "feature_is_good":
      features_to_remove = (
          run_logs
          [lambda d: d["ci_lower"] > 0]
          [lambda d: d["metric"] == "error-contribution"]
          [lambda d: ~d["model"].isin(fixed_features)]
          .sort_values(by = "ci_lower", ascending=False)
      )
    else:
      features_to_remove = (
          run_logs
          [lambda d: d["ci_upper"] > 0]
          [lambda d: d["metric"] == "error-contribution"]
          [lambda d: ~d["model"].isin(fixed_features)]
          .sort_values(by = "ci_upper", ascending=False)
      )


    if max_removals_per_run is not None:
      features_to_remove = features_to_remove.iloc[:max_removals_per_run]

    features_to_remove = features_to_remove["model"].values.tolist() #model means the model without the feature

    run_logs["run_index"] = i
    run_logs["n_features"] = (run_logs["metric"] == "error-contribution").sum()
    run_logs["removed_features"] = str(features_to_remove)
    run_logs["n_features_removed"] = len(features_to_remove)
    all_logs.append(run_logs)

    if len(features_to_remove) == 0:
      if patience:
        if p >= patience:
          break
        else:
          p+=1
          continue
      else:
        break

    #update features for the next iteration
    candidate_features_for_removal = [i for i in candidate_features_for_removal if not i in features_to_remove]

    #update counters
    p=0

  #calculate fs logs for full set of features in case of sub sampling
  if (n_features_sample is not None) and (len(all_features) > n_features_sample):
      run_logs = _backwards_shap_feature_selection(
          model=clone(model),
          df_train=df_train,
          df_val=df_val,
          all_features=all_features,
          extra_validation_sets=extra_validation_sets,
          target=target,
          sample_weight=sample_weight,
          metric_fn=metric_fn,
          bootstrap_samples=bootstrap_samples,
          ci_level=ci_level,
      )
      run_logs["run_index"] = i + 1
      run_logs["n_features"] = len(all_features)
      run_logs["removed_features"] = str([])
      run_logs["n_features_removed"] = 0
      all_logs.append(run_logs)

  return pd.concat(all_logs, ignore_index=True)


def _backwards_shap_feature_selection(
    model,
    df_train,
    df_val,
    all_features,
    extra_validation_sets,
    target,
    sample_weight,
    metric_fn,
    bootstrap_samples,
    ci_level,
):

  #train model
  model.fit(
      df_train[all_features],
      df_train[target],
      sample_weight=sample_weight
  )


  #calculate shap
  explainer = shap.TreeExplainer(model)
  shap_values_val = explainer.shap_values(df_val[all_features])[-1]

  #make raw preds
  raw_preds_val = proba_to_log_odds(model.predict_proba(df_val[all_features])[:,-1])

  #score without feature
  scores_df = pd.DataFrame(
      log_odds_to_proba(raw_preds_val.reshape(-1,1) - shap_values_val),
      columns = all_features
  )

  #add extra columns
  scores_df["val_set"] = raw_preds_val
  scores_df[target] = df_val[target].values
  if sample_weight is not None:
    df_val[sample_weight].values


  #deltas
  error_contributions_with_ci = fast_delta_metric_with_ci(
      scores_df,
      baseline="val_set",
      challengers=all_features,
      n_samples=bootstrap_samples,
      ci_level=ci_level,
      target=target,
      weight=sample_weight,
      metric_fn = metric_fn
    ).assign(metric="error-contribution")

  #current setup metric
  metric = fast_metric_with_ci(
      scores_df,
      predictions=["val_set"],
      n_samples=bootstrap_samples,
      ci_level=ci_level,
      target=target,
      weight=sample_weight,
      metric_fn = metric_fn
    ).assign(metric="metric", used_features=str(all_features))

  extra_val_logs = []
  for k,d in extra_validation_sets.items():
    extra_val_logs.append(
        fast_metric_with_ci(
          d.assign(**{k:lambda d: model.predict_proba(d[all_features])[:,-1], "weight__":lambda d: d[sample_weight] if sample_weight is not None else 1}),
          predictions=[k],
          n_samples=bootstrap_samples,
          ci_level=ci_level,
          target=target,
          weight="weight__",
          metric_fn = metric_fn
      ).assign(metric="metric", used_features=str(all_features))
    )



  return pd.concat([error_contributions_with_ci, metric, *extra_val_logs], ignore_index = True)

# Feature Selection

In [10]:
def with_feature_group(df):
  df["feature_group"] = np.nan
  df.loc[lambda d: d.model.isin(tweet_emb_names + tsvd_tweet_embs + ssvd_tweet_embs), "feature_group"] = "tweet_embbeding"
  df.loc[lambda d: d.model.isin(description_emb_names + tsvd_description_embs + ssvd_description_embs), "feature_group"] = "description_embbeding"
  df.loc[lambda d: d.model.isin(name_emb_names + tsvd_name_embs + ssvd_name_embs), "feature_group"] = "name_embbeding"
  df.loc[lambda d: d.model.isin(screen_name_emb_names + tsvd_screen_name_embs + ssvd_screen_name_embs), "feature_group"] = "screen_name_embbeding"
  df.loc[lambda d: d.model.isin(categorical_features), "feature_group"] = "categorical"
  df.loc[lambda d: d.model.isin(numerical_features), "feature_group"] = "numerical"
  df.loc[lambda d: d.model.isin(graph_features), "feature_group"] = "network_features"
  return df


## Supervised SVD

### with verified

In [20]:
# from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score

ssvd_fs_logs = backwards_shap_feature_selection(
    LGBMClassifier(n_jobs = -1, min_samples_leaf = 5, verbose = -1),
    df[train_mask],
    df[val_mask],
    candidate_features_for_removal = all_features_ssvd,
    target=target,
    null_hypothesis="feature_is_good",
    # n_features_sample=40,
    extra_validation_sets={"test_set": df[test_mask]},
    fixed_features=[],
    sample_weight=None,
    metric_fn = f1_score,
    bootstrap_samples=30,
    ci_level=0.8,
    max_iter=50,
    patience=2,
    max_removals_per_run=None
)


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray



In [25]:


ssvd_fs_logs.to_parquet(data_folder_path/"feature_selection_ssvd_logs.parquet")
ssvd_fs_logs[lambda d: d.model == "test_set"]

Unnamed: 0,estimate,ci_upper,ci_lower,model,metric,used_features,run_index,n_features,removed_features,n_features_removed,feature_group
135,0.930449,0.940052,0.922639,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",0,134,"['ssvd_screen_name_embs3', 'ssvd_name_embs10',...",6,
265,0.932315,0.942063,0.921715,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",1,128,"['ssvd_name_embs19', 'ssvd_description_embs3',...",5,
390,0.93063,0.945141,0.919867,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",2,123,"['ssvd_description_embs12', 'ssvd_screen_name_...",17,
498,0.936025,0.942457,0.928213,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",3,106,[],0,
606,0.936025,0.944712,0.928344,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",4,106,['ssvd_tweets_embs1'],1,
713,0.932623,0.939849,0.923839,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",5,105,"['ssvd_name_embs5', 'ssvd_name_embs6', 'ssvd_d...",5,
815,0.933427,0.940732,0.92432,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",6,100,"['ssvd_screen_name_embs2', 'ssvd_name_embs18',...",20,
897,0.930233,0.936674,0.923345,test_set,metric,"['tenure', 'tweet_time_density', 'follower_tim...",7,80,"['followed_embs4', 'followed_embs8', 'ssvd_twe...",6,
973,0.93227,0.941052,0.926429,test_set,metric,"['tenure', 'tweet_time_density', 'follower_tim...",8,74,"['ssvd_tweets_embs11', 'ssvd_name_embs15', 'ss...",10,
1039,0.931069,0.94108,0.920509,test_set,metric,"['tenure', 'tweet_time_density', 'follower_tim...",9,64,"['ssvd_screen_name_embs16', 'ssvd_name_embs11'...",3,


In [22]:
pd.options.plotting.backend = 'plotly'

(
    ssvd_fs_logs.pipe(with_feature_group)
    .groupby(["run_index", "feature_group"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="feature_group", y="estimate", title="error contribution in Feature Selection runs (3)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

In [23]:
(
    ssvd_fs_logs.pipe(with_feature_group)
    [lambda d: d.metric=="metric"]
    .groupby(["run_index", "model"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="model", y="estimate", title="AUC in Feature Selection runs (3)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

### w/out verified

In [48]:
# from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

all_features_ssvd_no_verified = [i for i in all_features_ssvd if not i == "verified"]

ssvd_no_verified_fs_logs = backwards_shap_feature_selection(
    LGBMClassifier(n_jobs = -1, min_samples_leaf = 5, verbose = -1),
    df[train_mask],
    df[val_mask],
    candidate_features_for_removal = all_features_ssvd_no_verified,
    target=target,
    null_hypothesis="feature_is_good",
    # n_features_sample=40,
    extra_validation_sets={"test_set": df[test_mask]},
    fixed_features=[],
    sample_weight=None,
    metric_fn = roc_auc_score,
    bootstrap_samples=30,
    ci_level=0.8,
    max_iter=50,
    patience=2,
    max_removals_per_run=None
)


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray



In [49]:


ssvd_no_verified_fs_logs.to_parquet(data_folder_path/"feature_selection_ssvd_no_verified_logs.parquet")
ssvd_no_verified_fs_logs[lambda d: d.model == "test_set"]

Unnamed: 0,estimate,ci_upper,ci_lower,model,metric,used_features,run_index,n_features,removed_features,n_features_removed
134,0.888127,0.897475,0.881544,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",0,133,"['followed_embs3', 'ssvd_screen_name_embs13', ...",8
261,0.891285,0.903114,0.878342,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",1,125,"['follow_embs3', 'follow_embs2', 'ssvd_screen_...",10
378,0.889185,0.901237,0.877202,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",2,115,"['ssvd_name_embs10', 'follow_embs1', 'ssvd_des...",5
490,0.890178,0.899401,0.881794,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",3,110,"['ssvd_screen_name_embs8', 'ssvd_screen_name_e...",8
594,0.892744,0.901638,0.887853,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",4,102,"['ssvd_screen_name_embs5', 'ssvd_name_embs14',...",8
690,0.890026,0.902539,0.8814,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",5,94,"['ssvd_screen_name_embs1', 'ssvd_screen_name_e...",2
784,0.890841,0.906088,0.876916,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",6,92,['ssvd_tweets_embs18'],1
877,0.888745,0.897212,0.876432,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",7,91,"['friend_embs2', 'ssvd_name_embs0']",2
968,0.893406,0.901245,0.878945,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",8,89,"['friend_embs7', 'followed_embs5']",2
1057,0.890107,0.89755,0.880674,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",9,87,[],0


In [50]:
pd.options.plotting.backend = 'plotly'

(
    ssvd_no_verified_fs_logs.pipe(with_feature_group)
    .groupby(["run_index", "feature_group"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="feature_group", y="estimate", title="error contribution in Feature Selection runs (3)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

In [51]:
(
    ssvd_no_verified_fs_logs.pipe(with_feature_group)
    [lambda d: d.metric=="metric"]
    .groupby(["run_index", "model"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="model", y="estimate", title="AUC in Feature Selection runs (3)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

## Truncated SVD

### with verified

In [14]:
# from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

tsvd_fs_logs = backwards_shap_feature_selection(
    LGBMClassifier(n_jobs = -1, min_samples_leaf = 5, verbose = -1),
    df[train_mask],
    df[val_mask],
    candidate_features_for_removal = all_features_tsvd,
    target=target,
    null_hypothesis="feature_is_good",
    n_features_sample=40,
    extra_validation_sets={"test_set": df[test_mask]},
    fixed_features=[],
    sample_weight=None,
    metric_fn = roc_auc_score,
    bootstrap_samples=30,
    ci_level=0.8,
    max_iter=50,
    patience=2,
    max_removals_per_run=None
)

  0%|          | 0/50 [00:00<?, ?it/s]



LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray




  2%|▏         | 1/50 [00:53<43:26, 53.19s/it]



LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray




  4%|▍         | 2/50 [01:36<37:52, 47.35s/it]



LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray




  6%|▌         | 3/50 [02:16<34:22, 43.89s/it]



LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray




  8%|▊         | 4/50 [02:55<32:06, 41.89s/it]



LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray




  8%|▊         | 4/50 [03:35<41:15, 53.81s/it]


In [20]:
tsvd_fs_logs.to_parquet(data_folder_path/"feature_selection_tsvd_logs.parquet")
tsvd_fs_logs[lambda d: d.model == "val_set"]

Unnamed: 0,estimate,ci_upper,ci_lower,model,metric,used_features,run_index,n_features,removed_features,n_features_removed
134,0.91579,0.924993,0.90483,val_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",0,134,"['follow_embs2', 'tsvd_screen_name_embs3', 'fo...",29
241,0.917836,0.924983,0.909066,val_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",1,105,"['followed_embs0', 'tsvd_description_embs12', ...",8
340,0.918871,0.925051,0.911026,val_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",2,97,[],0
439,0.918871,0.928316,0.911697,val_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",3,97,[],0
538,0.918871,0.924443,0.910295,val_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",4,97,[],0


In [119]:
(
    tsvd_fs_logs.pipe(with_feature_group)
    .groupby(["run_index", "feature_group"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="feature_group", y="estimate", title="error contribution in Feature Selection runs (2)" , width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

In [120]:
(
    tsvd_fs_logs.pipe(with_feature_group)
    [lambda d: d.metric=="metric"]
    .groupby(["run_index", "model"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="model", y="estimate", title="AUC in Feature Selection runs (2)" , width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

### w/out verified

In [52]:
# from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

all_features_tsvd_no_verified = [i for i in all_features_tsvd if not i == "verified"]

tsvd_no_verified_fs_logs = backwards_shap_feature_selection(
    LGBMClassifier(n_jobs = -1, min_samples_leaf = 5, verbose = -1),
    df[train_mask],
    df[val_mask],
    candidate_features_for_removal = all_features_tsvd_no_verified,
    target=target,
    null_hypothesis="feature_is_good",
    # n_features_sample=40,
    extra_validation_sets={"test_set": df[test_mask]},
    fixed_features=[],
    sample_weight=None,
    metric_fn = roc_auc_score,
    bootstrap_samples=30,
    ci_level=0.8,
    max_iter=50,
    patience=2,
    max_removals_per_run=None
)


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray


LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray



In [53]:


tsvd_no_verified_fs_logs.to_parquet(data_folder_path/"feature_selection_tsvd_no_verified_logs.parquet")
tsvd_no_verified_fs_logs[lambda d: d.model == "test_set"]

Unnamed: 0,estimate,ci_upper,ci_lower,model,metric,used_features,run_index,n_features,removed_features,n_features_removed
134,0.886018,0.896752,0.875134,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",0,133,"['tsvd_tweets_embs1', 'tsvd_screen_name_embs5'...",16
253,0.887169,0.896376,0.874027,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",1,117,['tsvd_screen_name_embs6'],1
371,0.889109,0.899753,0.876407,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",2,116,"['tsvd_name_embs10', 'tsvd_name_embs5', 'tsvd_...",3
486,0.883313,0.894184,0.867417,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",3,113,"['tsvd_description_embs12', 'followed_embs4', ...",7
594,0.890655,0.905072,0.879426,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",4,106,"['tsvd_name_embs11', 'followed_embs6']",2
700,0.889255,0.89974,0.880957,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",5,104,"['follow_embs1', 'tsvd_screen_name_embs10', 't...",6
800,0.888837,0.902204,0.880914,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",6,98,"['tsvd_screen_name_embs2', 'tsvd_screen_name_e...",6
894,0.89155,0.902443,0.883132,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",7,92,"['tsvd_name_embs0', 'friend_embs8']",2
986,0.889052,0.899761,0.878922,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",8,90,"['tsvd_tweets_embs0', 'friend_embs1', 'tsvd_tw...",3
1075,0.886316,0.900555,0.875365,test_set,metric,"['followers_follow_proportion', 'tenure', 'twe...",9,87,['follow_embs8'],1


In [56]:
pd.options.plotting.backend = 'plotly'

(
    tsvd_no_verified_fs_logs.pipe(with_feature_group)
    .groupby(["run_index", "feature_group"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="feature_group", y="estimate", title="error contribution in Feature Selection runs (2)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

In [57]:
(
    tsvd_no_verified_fs_logs.pipe(with_feature_group)
    [lambda d: d.metric=="metric"]
    .groupby(["run_index", "model"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="model", y="estimate", title="AUC in Feature Selection runs (2)", width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

## Raw embbedings

In [31]:
# from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

raw_embs_logs = backwards_shap_feature_selection(
    LGBMClassifier(n_jobs = -1, min_samples_leaf = 5, verbose = -1),
    df[train_mask],
    df[val_mask],
    candidate_features_for_removal = all_features_raw,
    target=target,
    null_hypothesis="feature_is_good",
    n_features_sample=80,
    extra_validation_sets={"test_set": df[test_mask]},
    fixed_features=[],
    sample_weight=None,
    metric_fn = roc_auc_score,
    bootstrap_samples=30,
    ci_level=0.8,
    max_iter=100,
    patience=2,
    max_removals_per_run=None
)

  0%|          | 0/100 [00:00<?, ?it/s]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  1%|          | 1/100 [00:34<56:54, 34.49s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  2%|▏         | 2/100 [01:09<56:21, 34.51s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  3%|▎         | 3/100 [01:43<55:27, 34.31s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  4%|▍         | 4/100 [02:17<55:14, 34.53s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  5%|▌         | 5/100 [02:55<56:30, 35.69s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray
  6%|▌         | 6/100 [03:29<55:04, 35.16s/it]LightGBM binary classifier with TreeExplainer shap values output has changed to a list of n

In [33]:
raw_embs_logs.to_parquet(data_folder_path/"feature_selection_raw_embs_logs.parquet")
raw_embs_logs[lambda d: d.model == "test_set"]

Unnamed: 0,estimate,ci_upper,ci_lower,model,metric,used_features,run_index,n_features,removed_features,n_features_removed
81,0.752538,0.766280,0.737075,test_set,metric,['screen_name_emb_216' 'descrpition_emb_135' '...,0,80,"['screen_name_emb_182', 'screen_name_emb_476',...",5
163,0.756562,0.770199,0.738641,test_set,metric,['tweet_avg_emb_137' 'name_emb_594' 'tweet_avg...,1,80,"['screen_name_emb_550', 'name_emb_594', 'scree...",5
245,0.758693,0.779101,0.740530,test_set,metric,['screen_name_emb_426' 'tweet_avg_emb_476' 'sc...,2,80,"['name_emb_302', 'descrpition_emb_672', 'scree...",6
327,0.765713,0.783310,0.749457,test_set,metric,['name_emb_176' 'descrpition_emb_72' 'tweet_av...,3,80,"['descrpition_emb_246', 'screen_name_emb_592',...",8
409,0.749817,0.771482,0.738143,test_set,metric,['tweet_avg_emb_307' 'screen_name_emb_186' 'de...,4,80,"['screen_name_emb_4', 'screen_name_emb_538', '...",6
...,...,...,...,...,...,...,...,...,...,...
7953,0.852712,0.868094,0.837254,test_set,metric,['descrpition_emb_294' 'descrpition_emb_238' '...,96,80,"['name_emb_159', 'screen_name_emb_262', 'descr...",3
8035,0.776245,0.795323,0.757601,test_set,metric,['tweet_avg_emb_608' 'descrpition_emb_436' 'sc...,97,80,"['name_emb_287', 'screen_name_emb_113', 'scree...",3
8117,0.762228,0.782770,0.743146,test_set,metric,['name_emb_674' 'screen_name_emb_507' 'screen_...,98,80,"['screen_name_emb_629', 'tweet_avg_emb_51', 't...",9
8199,0.750873,0.780972,0.730102,test_set,metric,['descrpition_emb_731' 'descrpition_emb_364' '...,99,80,"['screen_name_emb_420', 'name_emb_572', 'tweet...",6


In [121]:
(
    raw_embs_logs.pipe(with_feature_group)
    .groupby(["run_index", "feature_group"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="feature_group", y="estimate", title="error contribution in Feature Selection runs (1)" , width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

In [122]:
(
    raw_embs_logs.pipe(with_feature_group)
    [lambda d: d.metric=="metric"]
    .groupby(["run_index", "model"])
    [["estimate","ci_lower","ci_upper","n_features"]].sum()
    .reset_index()
    .plot.line(x="run_index", color="model", y="estimate", title="AUC in Feature Selection runs (1)" , width=1200, height=600)
    # .pipe(sns.lineplot, x="run_index", hue="feature_group", y="estimate")
)

# Train Final models

In [58]:
ssvd_fs_logs = pd.read_parquet(data_folder_path/"feature_selection_ssvd_logs.parquet")
tsvd_fs_logs = pd.read_parquet(data_folder_path/"feature_selection_tsvd_logs.parquet")
raw_fs_logs = pd.read_parquet(data_folder_path/"feature_selection_raw_embs_logs.parquet")

ssvd_no_verified_fs_logs = pd.read_parquet(data_folder_path/"feature_selection_ssvd_no_verified_logs.parquet")
tsvd_no_verified_fs_logs = pd.read_parquet(data_folder_path/"feature_selection_tsvd_no_verified_logs.parquet")

In [59]:
model_3_features = ssvd_fs_logs[lambda d: d.model == "test_set"][lambda d: d.run_index == d.run_index.max()].used_features.apply(eval).iloc[0]
model_2_features = tsvd_fs_logs[lambda d: d.model == "test_set"][lambda d: d.run_index == d.run_index.max()].used_features.apply(eval).iloc[0]
model_1_features = raw_fs_logs[lambda d: d.model == "test_set"][lambda d: d.run_index == d.run_index.max()].used_features.apply(eval).iloc[0]

model_3a_features = ssvd_no_verified_fs_logs[lambda d: d.model == "test_set"][lambda d: d.run_index == d.run_index.max()].used_features.apply(eval).iloc[0]
model_2a_features = tsvd_no_verified_fs_logs[lambda d: d.model == "test_set"][lambda d: d.run_index == d.run_index.max()].used_features.apply(eval).iloc[0]


In [60]:
from lightgbm import early_stopping

lgbm_params = {
    "n_estimators":400,
    "learning_rate":0.01,
    "n_jobs":-1
}

model_1 = LGBMClassifier(**lgbm_params)
model_2 = LGBMClassifier(**lgbm_params)
model_3 = LGBMClassifier(**lgbm_params)
model_2a = LGBMClassifier(**lgbm_params)
model_3a = LGBMClassifier(**lgbm_params)

In [47]:
import joblib
full_train_mask = lambda d: train_mask(d)|val_mask(d)

In [127]:
model_1.fit(
    df[full_train_mask][model_1_features],
    df[full_train_mask][target],
    eval_set = [
        (df[val_mask][model_1_features], df[val_mask][target]),
        ],
    callbacks = [early_stopping(3)]
)

[LightGBM] [Info] Number of positive: 3632, number of negative: 4646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.685695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 644146
[LightGBM] [Info] Number of data points in the train set: 8278, number of used features: 2534
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438753 -> initscore=-0.246223
[LightGBM] [Info] Start training from score -0.246223
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.177464


In [128]:
joblib.dump(model_1, data_folder_path/"final_model_1.pkl")

['gdrive/MyDrive/Colab Notebooks/MO436/data/final_model_1.pkl']

In [129]:
model_2.fit(
    df[full_train_mask][model_2_features],
    df[full_train_mask][target],
    eval_set = [
        (df[val_mask][model_2_features], df[val_mask][target]),
        ],
    callbacks = [early_stopping(3)]
)

[LightGBM] [Info] Number of positive: 3632, number of negative: 4646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009549 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 22452
[LightGBM] [Info] Number of data points in the train set: 8278, number of used features: 94
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438753 -> initscore=-0.246223
[LightGBM] [Info] Start training from score -0.246223
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.226234


In [130]:
joblib.dump(model_2, data_folder_path/"final_model_2.pkl")

['gdrive/MyDrive/Colab Notebooks/MO436/data/final_model_2.pkl']

In [38]:
model_3.fit(
    df[full_train_mask][model_3_features],
    df[full_train_mask][target],
    eval_set = [
        (df[val_mask][model_3_features], df[val_mask][target]),
        ],
    callbacks = [early_stopping(3)]
)

[LightGBM] [Info] Number of positive: 3632, number of negative: 4646
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13529
[LightGBM] [Info] Number of data points in the train set: 8278, number of used features: 60
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438753 -> initscore=-0.246223
[LightGBM] [Info] Start training from score -0.246223
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.229847


In [44]:
joblib.dump(model_3, data_folder_path/"final_model_3.pkl")

['gdrive/MyDrive/Colab Notebooks/MO436/data/final_model_3.pkl']

In [61]:
model_3a.fit(
    df[full_train_mask][model_3a_features],
    df[full_train_mask][target],
    eval_set = [
        (df[val_mask][model_3a_features], df[val_mask][target]),
        ],
    callbacks = [early_stopping(3)]
)
joblib.dump(model_3a, data_folder_path/"final_model_3a.pkl")

[LightGBM] [Info] Number of positive: 3632, number of negative: 4646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.173479 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17603
[LightGBM] [Info] Number of data points in the train set: 8278, number of used features: 73
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438753 -> initscore=-0.246223
[LightGBM] [Info] Start training from score -0.246223
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.319368


['gdrive/MyDrive/Colab Notebooks/MO436/data/final_model_3a.pkl']

In [63]:
model_2a.fit(
    df[full_train_mask][model_2a_features],
    df[full_train_mask][target],
    eval_set = [
        (df[val_mask][model_2a_features], df[val_mask][target]),
        ],
    callbacks = [early_stopping(3)]
)
joblib.dump(model_2a, data_folder_path/"final_model_2a.pkl")

[LightGBM] [Info] Number of positive: 3632, number of negative: 4646
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15312
[LightGBM] [Info] Number of data points in the train set: 8278, number of used features: 66
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.438753 -> initscore=-0.246223
[LightGBM] [Info] Start training from score -0.246223
Training until validation scores don't improve for 3 rounds
Did not meet early stopping. Best iteration is:
[400]	valid_0's binary_logloss: 0.318828


['gdrive/MyDrive/Colab Notebooks/MO436/data/final_model_2a.pkl']

In [65]:
df["pipeline_3a_score"] = model_3a.predict_proba(df[model_3a_features])[:,-1]
df["pipeline_2a_score"] = model_2a.predict_proba(df[model_2a_features])[:,-1]

df["pipeline_3_score"] = model_3.predict_proba(df[model_3_features])[:,-1]
df["pipeline_2_score"] = model_2.predict_proba(df[model_2_features])[:,-1]
df["pipeline_1_score"] = model_1.predict_proba(df[model_1_features])[:,-1]

In [66]:
df.to_parquet(data_folder_path/"final_scored_dataset.parquet")