# **CS282br Final Project**
Varshini Reddy,
Michael Cheng,
Matthew Nazari

In [1]:
import numpy as np
import numpy.random as npr
import pickle
import pandas as pd
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# import matplotlib.pyplot as plt
from tqdm import tqdm


**Datasets**
1. Cost: $13226 \times 2$ (rows $\times$ columns)
2. Pneumonia: $5856 \times 13228$
3. Pneumonia base: $1171 \times 13228$

In [2]:
# Datasets: every table used by the notebook, keyed by a short name.
dfs = {
  # Currently disabled inputs, kept for reference:
  # 'pneumonia': pd.read_csv('data/pneumonia.csv', low_memory=False),
  # 'pneumonia_base': pd.read_csv('data/pneumonia_base_train.csv'),
  # 'unbalanced': pd.read_csv('data/total_data.csv'),
  'balanced': pd.read_csv('data/balanced_data.csv'),
  'cost': pd.read_csv('data/cost.csv'),
  'mean_features': pd.read_csv('data/mean_features.csv'),
}

In [3]:
# Dataframe utilities

## convert feature/label dataframes into plain numpy arrays for model fitting/scoring
def datafy(x: pd.DataFrame, y: pd.DataFrame):
  """Return `(features, labels)` as numpy arrays.

  `x.values` is returned as-is; `y.values` is flattened to 1-D with `ravel()`.
  """
  features = x.values
  labels = y.values.ravel()
  return features, labels

def split_df(df: pd.DataFrame, ignore_cols: list[str] = [], ret_df: bool = True):
  """Split `df` into a feature frame and a label frame.

  The label frame holds the column named 'class'. The feature frame holds
  every other column except those listed in `ignore_cols`.

  Returns `(x, y)` as DataFrames when `ret_df` is true, otherwise the
  numpy pair produced by `datafy(x, y)`.
  """
  excluded = ['class', *ignore_cols]
  feature_mask = ~df.columns.isin(excluded)
  label_mask = df.columns == 'class'
  features = df.loc[:, feature_mask]
  labels = df.loc[:, label_mask]
  if ret_df:
    return features, labels
  return datafy(features, labels)

def random_rows(x: np.ndarray, y: np.ndarray, p: float):
  """Return a random fraction `p` of the rows of `x` with the matching entries of `y`.

  `int(x.shape[0] * p)` distinct row indices are drawn (no replacement) and
  the same indices are applied to both arrays, so rows stay paired.
  Asserts that `p` lies in [0, 1] and that `x` and `y` have the same length.
  """
  assert 0 <= p <= 1, "random_rows: invalid percentage"
  n_rows = x.shape[0]
  assert n_rows == y.shape[0], "random_rows: x.shape[0] != y.shape[0]"
  sample_size = int(n_rows * p)
  picked = npr.choice(n_rows, sample_size, replace=False)
  return x[picked, :], y[picked]

# def logreg_score(x: np.ndarray, y: np.ndarray, p: float = 0.05):
#   x_train, y_train = random_rows(x, y, p)
#   lr = LogisticRegression().fit(x_train, y_train)
#   return lr.score(x, y)

**Rewards**

In [4]:
def get_updated_data(df: pd.DataFrame, features: list[str]):
    """Return a copy of `df` in which each column in `features` is set to a constant.

    The constant for column `f` is the single value stored in
    `dfs['mean_features'][f]` (read with `.item()`, so that column must
    hold exactly one element). The input frame is not modified.
    """
    patched = df.copy()
    for name in features:
        mean_value = dfs['mean_features'][name].item()
        patched[name] = mean_value
    return patched

# Pre-trained model used by get_reward to score feature subsets.
lr_filename="data/logistic_model.sav"
# SECURITY: unpickling executes arbitrary code from the file, so only load
# model files that come from a trusted source.
# The context manager closes the file handle once the model is loaded
# (the previous `pickle.load(open(...))` left it open).
with open(lr_filename, 'rb') as model_file:
    lr_model = pickle.load(model_file)

def get_reward(features, df, model=lr_model):
    """Score a feature list as prediction accuracy divided by a damped cost.

    Steps:
      1. Split `df` into inputs and the 'class' label via `split_df`.
      2. Overwrite the columns named in `features` using `get_updated_data`.
      3. Predict with `model` and compute the accuracy against the labels.
      4. Divide `accuracy * 100` by `(summed cost) ** 0.05`, where the summed
         cost is `dfs['cost'][features].values.sum()`.

    `model` must expose `predict`; it defaults to the module-level `lr_model`.
    NOTE(review): a summed cost of 0 (e.g. empty `features`) makes the
    denominator 0 - confirm callers never pass that.
    """
    scale = 0.05  # exponent that flattens the cost term

    inputs, labels = split_df(df)
    inputs = get_updated_data(inputs, features)
    inputs, labels = datafy(inputs, labels)

    accuracy = accuracy_score(labels, model.predict(inputs))
    damped_cost = dfs['cost'][features].values.sum() ** scale

    return (accuracy * 100) / damped_cost


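Note: the model above is loaded with `pickle`. See scikit-learn's notes on the security and maintainability limitations of pickled models: https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations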


**Feature Selection Algorithms**
- All features
- Random drop 50%
- Random drop 75%
- CMAB-GFS [(epubs.siam.org/doi/pdf/10.1137/1.9781611976700.36)](https://epubs.siam.org/doi/pdf/10.1137/1.9781611976700.36)

In [5]:
# Feature Selection Algorithms

## All features
def all_features(df):
  """Return the column index of every feature column of `df` (as given by `split_df`)."""
  feature_frame, _ = split_df(df)
  return feature_frame.columns

## Random drop 50% and 75%
def random_drop(df: pd.DataFrame, p: float):
  """Randomly sample a fraction `p` of the feature columns of `df`.

  Returns a numpy array of `int(n_features * p)` distinct column names,
  where `n_features` is the number of feature columns from `split_df`.

  Fixes over the previous version:
    * the sample size is based on the number of columns (`shape[1]`),
      not the number of rows (`shape[0]`);
    * the size is cast to `int` (`npr.choice` rejects a float size);
    * sampling is without replacement, so no column appears twice.

  NOTE(review): despite the name, this returns the *sampled* columns; whether
  they are treated as dropped or kept is up to the caller - confirm.
  """
  assert 0 <= p <= 1, "random_drop: invalid percentage"
  x, _ = split_df(df)
  n_features = x.shape[1]
  return npr.choice(x.columns, int(n_features * p), replace=False)

def random_drop_50(df):
  """Call `random_drop` on `df` with a fraction of 0.50."""
  return random_drop(df, 0.50)

def random_drop_75(df):
  """Call `random_drop` on `df` with a fraction of 0.75."""
  return random_drop(df, 0.75)

## CMAB-GFS
def generative_oracle (M, K, betas, eps):
  """Propose a subset of K features (epsilon-greedy over Beta samples).

  Args:
    M: collection of all candidate feature names.
    K: number of features to return.
    betas: dict mapping feature name -> (a, b) Beta parameters.
    eps: probability of exploring instead of exploiting.

  Returns:
    With probability `eps`: a numpy array of K distinct features drawn
    uniformly from `M`. Otherwise: a list of the K features in `betas`
    whose freshly drawn Beta samples are largest, best first.

  Raises:
    ValueError: in the exploration branch, if K exceeds the size of `M`
      (a subset of distinct features cannot be larger than the pool).
  """
  if npr.rand() < eps:
    # Explore. replace=False keeps the subset free of duplicates; the
    # default (with replacement) could return the same feature several
    # times and therefore fewer than K distinct features.
    return npr.choice(M, K, replace=False)
  # Exploit: draw one sample per feature from its Beta posterior and keep
  # the K highest. sorted() is stable, so ties keep the order of `betas`.
  samples = {c: npr.beta(*params) for c, params in betas.items()}
  return sorted(samples, key=samples.get, reverse=True)[:K]

def cmab_gfs(df, K, T, R, eps):
  """Run the CMAB-GFS feature-subset search for T rounds.

  Args:
    df: dataframe whose feature columns (per `split_df`) are the arms.
    K: size of each proposed feature subset.
    T: number of rounds.
    R: callable mapping a feature subset to a scalar reward.
    eps: exploration probability passed to `generative_oracle`; must be in [0, 1].

  Returns:
    `(S_new, history)`: the last proposed subset (the initial random subset
    when T == 0) and the run history. For each round the history receives
    the previous reward (a bare number) followed by a dict with keys
    'features' and 'reward' for the new proposal.

  Each round, every feature in the union of the previous and the new subset
  has its Beta parameters updated: a+1 if the new reward is strictly higher
  than the previous one, otherwise b+1.
  """
  assert 0 <= eps <= 1, "cmab_gfs: eps must be in [0, 1]"
  x, _ = split_df(df)
  M = x.columns
  betas = {c: (0.5, 0.5) for c in M}
  # replace=False: the starting subset must hold K *distinct* features
  # (sampling with replacement could repeat a feature).
  S_old = npr.choice(M, K, replace=False)
  r_old = R(S_old)
  # Bind S_new up front so that T == 0 returns the initial subset instead of
  # raising UnboundLocalError at the return statement.
  S_new = S_old
  history = []
  for _ in tqdm(range(T)):
    # NOTE(review): this stores a bare reward next to the dict entries
    # appended below, so `history` mixes numbers and dicts. Kept as-is because
    # previously pickled results and their readers may rely on this layout.
    history.append(r_old)
    S_new = generative_oracle(M, K, betas, eps)
    r_new = R(S_new)
    improved = r_new > r_old
    for i in np.union1d(S_old, S_new):
      a, b = betas[i]
      betas[i] = (a+1, b) if improved else (a, b+1)
    S_old, r_old = S_new, r_new
    history.append(dict(features=S_new, reward=r_new))
  return S_new, history
    

In [6]:
# Run CMAB-GFS algorithm for 10% .. 100% of features and save to './results/'
# The recorded output below shows roughly 20-26 minutes per completed run (500 rounds each).
# NOTE: open() does not create the 'results' directory; it must already exist.
df = dfs['balanced']
x, _ = split_df(df)
total_features = len(x.columns)
for i in range(1, 11):
  # subset size for this run: i/10 of the feature columns, truncated by int()
  K = int(i/10 * total_features)
  # only the history is kept; the final subset returned by cmab_gfs is discarded
  _, history = cmab_gfs(df, K=K, T=500, R=lambda f: get_reward(f, df), eps=0.10)
  # one pickle per fraction, named e.g. 'results/0.1_features_balanced.pickle'
  with open(f'results/{i/10:.1f}_features_balanced.pickle', 'wb') as f:
    pickle.dump(history, f)
    

100%|██████████| 500/500 [20:27<00:00,  2.46s/it]
100%|██████████| 500/500 [22:56<00:00,  2.75s/it]
100%|██████████| 500/500 [25:18<00:00,  3.04s/it]
100%|██████████| 500/500 [23:57<00:00,  2.88s/it]
100%|██████████| 500/500 [23:49<00:00,  2.86s/it]
100%|██████████| 500/500 [24:25<00:00,  2.93s/it]
100%|██████████| 500/500 [25:05<00:00,  3.01s/it]
100%|██████████| 500/500 [26:19<00:00,  3.16s/it]
 59%|█████▊    | 293/500 [15:01<10:58,  3.18s/it]

In [None]:
# Load results from './results/'
# The file name must match what the CMAB-GFS run cell writes
# ('<fraction>_features_balanced.pickle'); the previous pattern
# ('<fraction>_features.pickle') is not produced anywhere in this notebook.
# results[i] holds the history of the run that used i/10 of the features.
results = {}
for i in range(1, 11):
  with open(f'results/{i/10:.1f}_features_balanced.pickle', 'rb') as handle:
    results[i] = pickle.load(handle)
