In [None]:
!nvidia-smi

In [None]:
%tensorflow_version 1.x  # needed on Google Colab to use tf1

In [None]:
TRAIN_SENTIMENT = False  # to train the sentiment model instead of the selector model, set to True and re-run notebook

In [None]:
from google.colab import auth  # needed on Google Colab to use gsutil
auth.authenticate_user()

project_id = ""  # INSERT YOUR GOOGLE CLOUD PROJECT ID HERE
!gcloud config set project {project_id}

In [None]:
%cd /
!git clone https://github.com/nostalgebraist/nostalgebraist-autoresponder.git
    
%cd /nostalgebraist-autoresponder/

`config_file_gs_uri` should be a GS path to the private file `config.json` loaded by the `BotSpecificConstants` class.

I store it in GS, but in principle you can store it wherever you like.

In [None]:
config_file_gs_uri = ""

!gsutil cp {config_file_gs_uri} .

In [None]:
import os
import subprocess
from autoresponder_config import BUCKET_NAME, gs_command_get_model, gs_command_get_encoder, model_path

# Directory component of model_path (everything before the last "/"), rooted at "/".
ckpt_dir = "/" + model_path.rpartition("/")[0]

# Download only when the model file is missing.
# NOTE(review): the existence check uses model_path as given, while ckpt_dir is
# forced to start at "/" -- confirm both refer to the same location.
if not os.path.exists(model_path):
  os.makedirs(ckpt_dir, exist_ok=True)

  # The two commands are strings taken from autoresponder_config and run through the shell;
  # check_output raises CalledProcessError on a non-zero exit status.
  subprocess.check_output(gs_command_get_encoder, shell=True)
  subprocess.check_output(gs_command_get_model, shell=True)

In [None]:
import os, sys

sys.path.append("gpt-2/")
sys.path.append("gpt-2/src")
sys.path.append("selector_model/")

In [None]:
import encoder
enc = encoder.get_encoder_from_path(ckpt_dir, eot_workaround=True)

In [None]:
!gsutil cp gs://{BUCKET_NAME}/selector_training_data.pkl.gz .

In [None]:
import pickle
import pandas as pd

data_path = "selector_training_data.pkl.gz"
# NOTE(review): the file name ends in .gz but it is read with plain open(), not gzip.open();
# presumably the copy on disk is already decompressed -- confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files from a bucket you control.
with open(data_path, "rb") as f:
    selector_training_data = pickle.load(f)

Data prep

TODO: clean up this part of the code (and move out of notebook)

In [None]:
import re
from autoresponder_static import *

def inverse_format_post_for_api(post):
    """Turn paragraph-style HTML back into newline-separated text.

    One leading ``<p>`` and one trailing ``</p>`` are removed if present, then
    every ``</p><p>`` boundary and every ``<br>`` becomes a newline.
    """
    opening, closing = "<p>", "</p>"
    if post.startswith(opening):
        post = post[len(opening):]
    if post.endswith(closing):
        post = post[:len(post) - len(closing)]
    # order matters only in that both markers map to the same "\n"
    for html_break in (r"</p><p>", r"<br>"):
        post = re.sub(html_break, "\n", post)
    return post

def make_train_data(ids_to_reward_data,
                    continuation_only=True,
                    prompt_as_col=True,
                    v8_ts=True,
                    v10_ts=True,
                    nov_2020_new_way=True,
                    min_tok_len=None,
                    exclude_unames={"bukbot", "enriquebot"},
                    exclude_substrings={"graph of my mood",
                                        T_CHAR+"#quotes<|",
                                        }
                    ):
    """Flatten the reward-data mapping into a training DataFrame.

    Parameters
    ----------
    ids_to_reward_data : dict
        Maps a post id to a dict. Entries whose ``"note_count"`` or
        ``"continuation"`` is missing/None are skipped. The other keys read are
        ``"timestamp"``, ``"prompt"``, ``"v8_timestamp"``, ``"v10_timestamp"``
        and the four ``is_*`` flags, depending on the switches below.
    continuation_only : bool
        If True the ``text`` column is the continuation alone; otherwise it is
        the last 64 space-separated pieces of the prompt followed by the
        continuation.
    prompt_as_col, v8_ts, v10_ts, nov_2020_new_way : bool
        Each switch adds its column(s): ``prompt``; ``v8_timestamp``;
        ``v10_timestamp``; ``is_orig``/``is_ask``/``is_reply``/``is_reblog``.
    min_tok_len : int or None
        When given, rows whose text encodes (with the global ``enc``) to fewer
        tokens are dropped.
    exclude_unames : iterable of str
        Rows whose prompt contains ``uname + Q_CHAR`` are dropped. This reads
        the ``prompt`` column, so it needs ``prompt_as_col=True``.
    exclude_substrings : iterable of str
        Rows whose text contains any of these substrings are dropped.

    Returns
    -------
    pandas.DataFrame
        One row per retained post, with the columns selected above.
    """
    post_type_cols = ["is_orig", "is_ask", "is_reply", "is_reblog"]

    cols = ["id", "timestamp", "text", "note_count"]
    if prompt_as_col:
        cols.append("prompt")
    if v8_ts:
        cols.append("v8_timestamp")
    if v10_ts:
        cols.append("v10_timestamp")
    if nov_2020_new_way:
        cols.extend(post_type_cols)

    rows = []
    for k, v in ids_to_reward_data.items():
        if v.get("note_count") is None or v.get("continuation") is None:
            continue

        if continuation_only:
            text = v["continuation"]
        else:
            text = " ".join(v["prompt"].split(" ")[-64:]) + v["continuation"]

        row = [k, v["timestamp"], text, v["note_count"]]
        if prompt_as_col:
            row.append(v["prompt"])
        if v8_ts:
            row.append(v["v8_timestamp"])
        # Fixed: this guard used to test v8_ts, so the row width disagreed with
        # `cols` whenever v8_ts != v10_ts.
        if v10_ts:
            row.append(v["v10_timestamp"])
        if nov_2020_new_way:
            row.extend(v[c] for c in post_type_cols)
        rows.append(row)

    train_data = pd.DataFrame(rows, columns=cols)

    train_data.text = train_data.text.apply(inverse_format_post_for_api)
    train_data.text = train_data.text.apply(lambda s: s.lstrip("\n"))

    if min_tok_len is not None:
        tok_len = train_data.text.apply(lambda s: len(enc.encode(s)))
        train_data = train_data[tok_len >= min_tok_len]

    for uname in exclude_unames:
        nbefore = len(train_data)
        train_data = train_data[train_data.prompt.apply(lambda s: uname+Q_CHAR not in s)]
        nafter = len(train_data)
        print(f"excluding {nbefore-nafter} with uname {repr(uname)}")

    for subs in exclude_substrings:
        nbefore = len(train_data)
        train_data = train_data[train_data.text.apply(lambda s: subs not in s)]
        nafter = len(train_data)
        print(f"excluding {nbefore-nafter} with substring {repr(subs)}")
    return train_data

In [None]:
# vs 16929 | vs 16910
train_data = make_train_data(selector_training_data, continuation_only=True, )
train_data = train_data[train_data.is_reblog == False]   # 2/7/21
train_data.note_count.describe()

In [None]:
import re
V10_ASK_CHAR = "要"

def find_all_control_chars_chinese(text,
                               incl_number=True  # ignored
                               ):
    """Find every control segment in ``text``.

    A segment is one of: ``UNAME_CHAR`` followed by one or more non-``Q_CHAR``
    characters and a closing ``Q_CHAR``; ``A_CHAR``; ``ORIG_POST_CHAR_CHINESE``;
    or ``V10_ASK_CHAR``. ``T_CHAR`` is deliberately not matched.

    Returns a list of ``(matched_text, start_offset)`` tuples in the order
    ``re.finditer`` yields them. ``incl_number`` is accepted but not used.
    """
    segment_rx = f"({UNAME_CHAR}[^{Q_CHAR}]+{Q_CHAR}|{A_CHAR}|{ORIG_POST_CHAR_CHINESE}|{V10_ASK_CHAR})"
    return [(hit.group(1), hit.start(1)) for hit in re.finditer(segment_rx, text)]



In [None]:
def count_control_char_gaps(doc, forumlike=False):
    """Return the character distance between each pair of consecutive control segments.

    Segments are ``(text, start_offset)`` pairs found by
    ``find_control_chars_forumlike`` when ``forumlike`` is true, otherwise by
    ``find_all_control_chars_chinese``. Each gap runs from the end of one
    segment to the start of the next.
    """
    locate = find_control_chars_forumlike if forumlike else find_all_control_chars_chinese
    found = locate(doc)

    return [
        following[1] - (previous[1] + len(previous[0]))
        for previous, following in zip(found[:-1], found[1:])
    ]

def count_small_control_char_gaps(doc, forumlike=False):
    """Count gaps between consecutive control segments that are at most the cutoff.

    The cutoff is 0 characters in forumlike mode and 2 characters otherwise.
    """
    if forumlike:
        max_gap = 0
    else:
        max_gap = 2
    return sum(width <= max_gap for width in count_control_char_gaps(doc, forumlike))

In [None]:
# Per prompt, count adjacent control segments separated by at most 2 characters
# (the non-forumlike cutoff in count_small_control_char_gaps).
train_data["n_small_gaps"] = train_data.prompt.apply(lambda p: count_small_control_char_gaps(p, forumlike=False))
train_data["has_small_gaps"] = (train_data.n_small_gaps > 0)

# Row count / number flagged / fraction flagged, before filtering.
display(train_data.has_small_gaps.agg(["count", "sum", "mean"]))

# Keep only prompts with no such small gaps.
train_data = train_data[train_data.has_small_gaps==False]

# Same summary after filtering; sum and mean are 0 by construction.
display(train_data.has_small_gaps.agg(["count", "sum", "mean"]))


In [None]:
SEPARATE_REBLOGS_IN_FINALCHAR = True

# For every prompt: the matched control segments, ordered by their offset in the prompt.
cchars_prompt = train_data.prompt.apply(lambda s: [tup[0] for tup in sorted(find_all_control_chars_chinese(s), key=lambda tup_: tup_[1])])

# Category from the first segment (and the segment count):
#   first == ORIG_POST_CHAR_CHINESE                                   -> "orig"
#   first == A_CHAR                                                   -> "reblog_orig"
#   first starts with UNAME_CHAR or V10_ASK_CHAR, exactly 2 segments  -> "ask"
#   anything else                                                     -> "reblog_ask"
# NOTE: l[0] raises IndexError for a prompt with no control segment at all.
cats=cchars_prompt.apply(
    lambda l: "orig" if l[0]==ORIG_POST_CHAR_CHINESE else 
    ("reblog_orig" if l[0]==A_CHAR else 
     ("ask" if (l[0].startswith(UNAME_CHAR) or l[0].startswith(V10_ASK_CHAR)) and len(l)==2 else "reblog_ask"))
    )

train_data["control_cat"] = cats

# Collapse the categories into a three-way label: ORIG_POST_CHAR_CHINESE for "orig",
# A_CHAR + "a" for "ask", and A_CHAR + "r" for both reblog categories.
# With the switch off, the label is simply the last character of the prompt.
if SEPARATE_REBLOGS_IN_FINALCHAR:
  prompt_finalchar = cats.apply(lambda s: ORIG_POST_CHAR_CHINESE if s=="orig" else (f"{A_CHAR}a" if s=="ask" else f"{A_CHAR}r"))
else:
  prompt_finalchar = train_data.prompt.apply(lambda s: s[-1])

train_data["prompt_finalchar"] = prompt_finalchar

# Class balance, as fractions and as raw counts.
display(train_data.prompt_finalchar.value_counts(normalize=True))
display(train_data.prompt_finalchar.value_counts(normalize=False))

In [None]:
# NOTE(review): these two expressions repeat, verbatim, the cchars_prompt / cats computation
# from the cell that sets SEPARATE_REBLOGS_IN_FINALCHAR; train_data is not modified in
# between, so the recomputation looks redundant -- confirm before removing.
cchars_prompt = train_data.prompt.apply(lambda s: [tup[0] for tup in sorted(find_all_control_chars_chinese(s), key=lambda tup_: tup_[1])])

cats=cchars_prompt.apply(
    lambda l: "orig" if l[0]==ORIG_POST_CHAR_CHINESE else 
    ("reblog_orig" if l[0]==A_CHAR else 
     ("ask" if (l[0].startswith(UNAME_CHAR) or l[0].startswith(V10_ASK_CHAR)) and len(l)==2 else "reblog_ask"))
    )

train_data["control_cat"] = cats

# Note-count distribution for posts with at most 50 notes, by three-way label ...
display(train_data[train_data.note_count<=50].groupby("prompt_finalchar").note_count.describe())

# ... and by the finer four-way category.
train_data[train_data.note_count<=50].groupby("control_cat").note_count.describe()

In [None]:
train_data.groupby("prompt_finalchar")[['is_orig', 'is_ask', 'is_reply', 'is_reblog']].mean()

In [None]:
# drop some malformed stuff -- 2/8/21

filt_reblog_dash_miscategorized_as_orig = (train_data.prompt_finalchar=="翰") & (train_data.is_orig==False)
train_data = train_data[~filt_reblog_dash_miscategorized_as_orig]

In [None]:
# Natural log of the post id, then a decile bin index (labels=False yields integer bin codes).
# NOTE(review): `np` is not imported in any visible cell; presumably it arrives through
# `from autoresponder_static import *` -- confirm, or import numpy explicitly.
train_data["log_id"] = train_data["id"].apply(np.log)
train_data["log_id_qcut"] = pd.qcut(train_data.log_id, 10, labels=False)

In [None]:
temporally_ordered_train_data = train_data.sort_values("timestamp")

In [None]:
# Split the time-sorted frame by the three-way prompt_finalchar label.
# NOTE(review): the literals below are presumably ORIG_POST_CHAR_CHINESE, A_CHAR+"a" and
# A_CHAR+"r" (the values prompt_finalchar is built from) -- confirm and prefer the constants.
temporally_ordered_train_data_orig = temporally_ordered_train_data[temporally_ordered_train_data.prompt_finalchar == "翰"]# .reset_index()
temporally_ordered_train_data_resp_a = temporally_ordered_train_data[temporally_ordered_train_data.prompt_finalchar == "域a"]# .reset_index()
temporally_ordered_train_data_resp_r = temporally_ordered_train_data[temporally_ordered_train_data.prompt_finalchar == "域r"]# .reset_index()

In [None]:
from copy import deepcopy

def non_overlapping_ma(array, width=31):
  """Average ``array`` over consecutive, non-overlapping chunks of ``width`` items.

  The final chunk may be shorter than ``width``; it is kept only when it covers
  more than half a window. Returns the chunk means as a ``pd.Series``.
  """
  n_total = len(array)
  chunk_means = []
  for start in range(0, n_total, width):
    # a leftover of at most width/2 items is too small to average
    if n_total - start <= width/2:
      continue
    chunk_means.append(np.average(array[start:start+width], ))
  return pd.Series(chunk_means)

def do_rolling_quantiles(temporally_ordered_train_data__, 
                         window_width=140, 
                         skip_n_most_recent=40,
                         allow_partial_windows=False,
                         window_frac_left=0.8,
                         unit="days",
                         ):
  """Score each post's note count against the note counts in a window around it.

  Works on a deep copy of the input frame, which must have ``timestamp``, ``id``
  and ``note_count`` columns and is expected to be sorted by time already
  (the window is taken with a label slice).

  Parameters
  ----------
  temporally_ordered_train_data__ : pandas.DataFrame
  window_width : number
      Total window size, in days when ``unit == "days"`` and in rows otherwise.
  skip_n_most_recent : number
      Points within this many days (or rows) of the end of the data are not scored.
  allow_partial_windows : bool
      If False, only points whose whole window lies between the first index
      value and the last allowed one are scored.
  window_frac_left : float or None
      Fraction of the window placed before the point; None centres the window
      (half the width, integer-divided, on each side).
  unit : str
      ``"days"`` indexes the frame by ``timestamp``; any other value uses a
      fresh positional index from ``reset_index()``.

  Returns
  -------
  tuple of four pandas.Series, each keyed by post ``id``:
      rolling_quantiles  -- fraction of window note counts <= the point's note count
      rolling_advantages -- mean of (point's note count - window note counts)
      rolling_medians    -- median note count in the window
      rolling_counts     -- number of rows in the window
  """
  temporally_ordered_train_data_ = deepcopy(temporally_ordered_train_data__)
  if unit == "days":
    temporally_ordered_train_data_ = temporally_ordered_train_data_.set_index("timestamp")
  else:
    temporally_ordered_train_data_ = temporally_ordered_train_data_.reset_index()

  window_halfw = window_width//2
  rolling_quantiles = {}
  rolling_advantages = {}
  rolling_medians = {}
  rolling_counts = {}

  # Offsets of the window edges relative to the point: left is <= 0, right is >= 0,
  # and right - left == window_width in the asymmetric case.
  if window_frac_left is not None:
    window_shift_left = -1*int(window_frac_left*window_width)
    window_shift_right = window_width + window_shift_left
  else:
    window_shift_left = -window_halfw
    window_shift_right = window_halfw

  # In day mode the offsets and the "most recent" cutoff become Timedeltas on the timestamp index;
  # otherwise the cutoff is a row position.
  if unit == "days":
    day_dt = pd.Timedelta(days=1)
    window_shift_left = window_shift_left * day_dt
    window_shift_right = window_shift_right * day_dt
    last_ix_allowed = temporally_ordered_train_data_.index.max() - (skip_n_most_recent*day_dt)
  else:
    last_ix_allowed = len(temporally_ordered_train_data_) - skip_n_most_recent

  # Index values of the points to score.
  if allow_partial_windows:
    if unit == "days":
      ixs = temporally_ordered_train_data_[:last_ix_allowed].index
    else:
      ixs = temporally_ordered_train_data_.index[:last_ix_allowed]
  else:
    # Shrink the range so that every scored point has its full window inside the data.
    if unit == "days":
      _first = temporally_ordered_train_data_.index.min()
      ixs = temporally_ordered_train_data_.loc[(_first-window_shift_left):(last_ix_allowed-window_shift_right)].index
    else:
      ixs = temporally_ordered_train_data_.index[(-window_shift_left):(last_ix_allowed-window_shift_right)]

  # Report which part of the full index range is actually scored.
  if unit == "days":
    _first = temporally_ordered_train_data_.index.min()
    _last = temporally_ordered_train_data_.index.max()
  else:
    _first = 0
    _last = len(temporally_ordered_train_data_)-1
  print(f"using ({ixs.min()} to {ixs.max()}) of ({_first} to {_last})")

  for ix in ixs:
    # .loc returns a DataFrame when several rows share this index value (e.g. equal
    # timestamps) and a Series when there is exactly one; normalise to a list of rows.
    point_first_try = temporally_ordered_train_data_.loc[ix, :]
    if len(point_first_try.shape)>1:
      points = [point_first_try.iloc[i, :] for i in range(len(point_first_try))]
    else:
      points = [point_first_try]
    # Label slice, so both window edges are inclusive.
    window = temporally_ordered_train_data_.loc[ix+window_shift_left:ix+window_shift_right, 'note_count']
    for point in points:
      try:
        rolling_quantiles[point["id"]] = (point["note_count"]>=window).mean()
      except Exception as e:
        # Show the offending inputs before re-raising, to aid debugging in the notebook.
        display(ix)
        display(point_first_try)
        display(point)
        display(window)
        raise e
      rolling_advantages[point["id"]] = (point["note_count"]-window).mean()
      rolling_medians[point["id"]] = np.median(window)
      rolling_counts[point["id"]] = len(window)

  rolling_quantiles = pd.Series(rolling_quantiles)
  rolling_advantages = pd.Series(rolling_advantages)
  rolling_medians = pd.Series(rolling_medians)
  rolling_counts = pd.Series(rolling_counts)
  return rolling_quantiles, rolling_advantages, rolling_medians, rolling_counts



In [None]:
# Rolling statistics for the "域a" subset (temporally_ordered_train_data_resp_a).
allow_partial_windows = False
window_frac_left = 0.9 # None

# ID BASED

# window_width_resp_a = 100
# skip_n_most_recent_resp_a = 0 # 20

# DAY BASED

# 14-day window with 90% of it before the point; the newest 1/12 day (2 hours) is not scored.
window_width_resp_a = 14
skip_n_most_recent_resp_a = 1/12 # 20

rolling_quantiles_resp_a, rolling_advantages_resp_a, rolling_medians_resp_a, rolling_counts_resp_a = do_rolling_quantiles(
    temporally_ordered_train_data_resp_a,
    window_width=window_width_resp_a, 
    skip_n_most_recent=skip_n_most_recent_resp_a,
    allow_partial_windows=allow_partial_windows,
    window_frac_left=window_frac_left,
    )

# Keep only the posts that were scored (selected by id), then attach the statistics by id.
# The rolling medians are computed but not stored on the frame.
train_data_resp_a_ = temporally_ordered_train_data_resp_a.set_index("id").loc[rolling_quantiles_resp_a.index]
train_data_resp_a_["rolling_quantile"] = rolling_quantiles_resp_a
train_data_resp_a_["rolling_advantage"] = rolling_advantages_resp_a
train_data_resp_a_["rolling_count"] = rolling_counts_resp_a

# non_overlapping_ma(rolling_quantiles_resp_a, width=len(rolling_quantiles_resp_a)//10+1).plot(lw=1, ls='--', marker='.', markersize=5, figsize=(10, 6));

In [None]:
# Rolling statistics for the "域r" subset (temporally_ordered_train_data_resp_r).
allow_partial_windows = False
window_frac_left = 0.9 # None

# ID BASED

# window_width_resp_r = 100
# skip_n_most_recent_resp_r = 0 # 10

# DAY BASED

# 14-day window with 90% of it before the point; the newest 1/12 day (2 hours) is not scored.
window_width_resp_r = 14
skip_n_most_recent_resp_r = 1/12 # 20

rolling_quantiles_resp_r, rolling_advantages_resp_r, rolling_medians_resp_r, rolling_counts_resp_r = do_rolling_quantiles(
    temporally_ordered_train_data_resp_r,
    window_width=window_width_resp_r, 
    skip_n_most_recent=skip_n_most_recent_resp_r,
    allow_partial_windows=allow_partial_windows,
    window_frac_left=window_frac_left,
    )

# Keep only the posts that were scored (selected by id), then attach the statistics by id.
train_data_resp_r_ = temporally_ordered_train_data_resp_r.set_index("id").loc[rolling_quantiles_resp_r.index]
train_data_resp_r_["rolling_quantile"] = rolling_quantiles_resp_r
train_data_resp_r_["rolling_advantage"] = rolling_advantages_resp_r
train_data_resp_r_["rolling_count"] = rolling_counts_resp_r


In [None]:
# Rolling statistics for the "翰" subset (temporally_ordered_train_data_orig).
allow_partial_windows = False
window_frac_left = 0.9 # None

# ID BASED 

# window_width_orig = 100
# skip_n_most_recent_orig = 0 # 10

# DAY BASED

# This subset uses a wider, 42-day window; the newest 1/12 day (2 hours) is not scored.
window_width_orig = 42 # 14
skip_n_most_recent_orig = 1/12 # 10

rolling_quantiles_orig, rolling_advantages_orig, rolling_medians_orig, rolling_counts_orig = do_rolling_quantiles(
    temporally_ordered_train_data_orig,
    window_width=window_width_orig, 
    skip_n_most_recent=skip_n_most_recent_orig,
    allow_partial_windows=allow_partial_windows,
    window_frac_left=window_frac_left,
    )

# Keep only the posts that were scored (selected by id), then attach the statistics by id.
train_data_orig_ = temporally_ordered_train_data_orig.set_index("id").loc[rolling_quantiles_orig.index]
train_data_orig_["rolling_quantile"] = rolling_quantiles_orig
train_data_orig_["rolling_advantage"] = rolling_advantages_orig
train_data_orig_["rolling_count"] = rolling_counts_orig

In [None]:
from copy import deepcopy

# Score column that defines the binary target (alternative: "note_count").
notes_key = "rolling_quantile"
train_data_modified_list = []

drop_midrange = True
smaller_midrange_dropped = False

# Percentiles bounding the discarded middle band: 30/70 for the narrower band, 25/75 otherwise.
_bottom_pct, _top_pct = (30, 70) if smaller_midrange_dropped else (25, 75)

for name, train_data_base in (
    ("resp_a", train_data_resp_a_),
    ("resp_r", train_data_resp_r_),
    ("orig", train_data_orig_),
):
  train_data_ = deepcopy(train_data_base)
  _scores = train_data_[notes_key]

  # Band edges are computed per subset, so each subset is cut at its own percentiles.
  MIDRANGE_BOTTOM = np.percentile(_scores, _bottom_pct)
  MIDRANGE_TOP = np.percentile(_scores, _top_pct)
  print((MIDRANGE_BOTTOM, MIDRANGE_TOP))

  # target = 1 at or above the top edge; rows strictly inside the band are dropped.
  train_data_["target"] = (_scores >= MIDRANGE_TOP).astype(int)
  _outside_band = (_scores <= MIDRANGE_BOTTOM) | (_scores >= MIDRANGE_TOP)
  train_data_ = train_data_[_outside_band]

  train_data_modified_list.append(train_data_)

# ignore_index=True replaces the per-subset "id" index with a fresh 0..n-1 index.
train_data_ = pd.concat(train_data_modified_list, ignore_index=True)

stratify = train_data_["target"]

model_inputs = train_data_[["text", "target"]]

In [None]:
# vs 8066 | vs 8059
display(model_inputs.target.describe())
display(train_data_.groupby("prompt_finalchar").target.agg(["count", "mean"]))

In [None]:
# Sentiment mode only: replace train_data_ with the sentiment dataset and derive its targets.
if TRAIN_SENTIMENT:
  # Fetch the JSON-lines file from the bucket unless a local copy already exists.
  if not os.path.exists("sentiment_train_data.jsonl"):
    !gsutil cp gs://{BUCKET_NAME}/sentiment_train_data.jsonl .
  sentiment_train_data = pd.read_json("sentiment_train_data.jsonl",
                                      orient="records",
                                      lines=True)
  # train_data_ is an alias of sentiment_train_data (no copy), so the column
  # assignments below modify that frame as well.
  train_data_ = sentiment_train_data
  train_data_["prompt"] = train_data_.text

  # Here the "prompt" is the text itself, so prompt_finalchar is the last character of the text.
  prompt_finalchar = train_data_.prompt.apply(lambda s: s[-1])
  train_data_["prompt_finalchar"] = prompt_finalchar

  # Binary target: 1 when the positive logit exceeds the negative one.
  train_data_["logit_diff"] = train_data_.pos_logit - train_data_.neg_logit
  train_data_["target"] = (train_data_.logit_diff > 0).astype(int)
  
  # Secondary target from p75_generated_logit_diff, left as None where that column is null.
  train_data_["target_p75_generated"] = None
  filter_ = train_data_.p75_generated_logit_diff.notnull()
  train_data_.loc[filter_, "target_p75_generated"] = (train_data_[filter_].p75_generated_logit_diff > 0).astype(int)

In [None]:
# Sentiment mode only: normalise the text the same way before it is tokenised for training.
if TRAIN_SENTIMENT:
  import re
  
  # Install the pinned transformers release on demand if the import fails.
  try:
    from transformers.tokenization_roberta import RobertaTokenizer
  except ModuleNotFoundError:
    !pip install transformers==2.11.0
    from transformers.tokenization_roberta import RobertaTokenizer

  rtok = RobertaTokenizer.from_pretrained("roberta-large")
  def _sc_prep(text):
    # Remove every "<...>" span (non-greedy), then keep the first 200 RoBERTa tokens
    # and convert them back to a string.
    sanitized_text = re.sub(r"\<.*?\>", "", text)
    sanitized_text = rtok.convert_tokens_to_string(rtok.tokenize(sanitized_text)[:200])
    return sanitized_text

  # Keep the untouched text in text_raw; the processed text gets a single leading space.
  train_data_["text_raw"] = train_data_.text
  train_data_["text"] = " " + train_data_.text_raw.apply(_sc_prep)

In [None]:
TEST_SIZE = 1/10

if TRAIN_SENTIMENT:
  TEST_SIZE = 0.125

Munging

In [None]:
from autoresponder_static_v8 import *
GLOBAL_DEBUG = False

In [None]:
from datetime import datetime, timedelta

def _addmonth(ts):
  _month = ts.strftime("%B")
  _newmonth = ts.strftime("%B")
  while _newmonth == _month:
    ts = ts + timedelta(days=14)
    _newmonth = ts.strftime("%B")
  return ts

def fake_v10_timestamps(ts_real, start=None, n=3):
  """Build ``n + 1`` variants of ``ts_real`` with substituted month/year suffixes.

  The first two space-separated tokens of ``ts_real`` are kept; to each variant
  is appended ``" <Month> <Year>"`` taken from ``start`` (default: now) and from
  ``n`` further dates obtained by repeatedly applying ``_addmonth``.
  """
  anchor = datetime.now() if start is None else start
  candidate_dates = [anchor]
  for _ in range(n):
    candidate_dates.append(_addmonth(candidate_dates[-1]))

  kept_prefix = " ".join(ts_real.split(" ")[:2])
  return [kept_prefix + when.strftime(" %B %Y") for when in candidate_dates]

def one_fake_v10_timestamp(ts_real, n=3):
  """Pick one of the ``fake_v10_timestamps`` variants uniformly at random.

  Uses NumPy's global random state, so the result varies between runs unless
  that state is seeded.
  """
  candidates = fake_v10_timestamps(ts_real, n=n)
  return np.random.choice(candidates)

In [None]:
from sklearn.model_selection import GroupKFold, train_test_split

# Switches controlling how selector_input is assembled further down in this cell.
V10 = True  #  EXPERIMENTAL
TIMESTAMP_FAKING = True  #  EXPERIMENTAL
V8 = True

TRUNC_DEFER_TO_EST = False

IGNORE_REBLOGS = True # was once EXPERIMENTAL

CONTINUATION_ONLY = False
LAST_USER_INPUT_ONLY = False

NO_TAGS = False
FORCE_T_CHAR = True
INCLUDE_PROMPT_FINALCHAR = True
NORMALIZE = True
TRUNCATE_AT_RIGHT = False  # EXPERIMENTAL
FORUMLIKE = True
FORUMLIKE_MODE = "train" # was once EXPERIMENTAL
EOT_PREPEND = True # was once EXPERIMENTAL

# V8 overrides two of the defaults above.
if V8:
  FORCE_T_CHAR = False
  EOT_PREPEND = True

# Sentiment training overrides: continuation text only, tags removed, no final-char prefix.
if TRAIN_SENTIMENT:
  CONTINUATION_ONLY = True
  NO_TAGS = True
  FORCE_T_CHAR = False
  INCLUDE_PROMPT_FINALCHAR = False
  TRUNCATE_AT_RIGHT = True
  TRUNC_DEFER_TO_EST = False

def strip_uname_tags(s, prompt, verbose=False):
  """Drop tags from ``s`` that name a user mentioned in ``prompt``.

  ``s`` is split at the first ``T_CHAR``; what follows is treated as a run of
  ``#``-prefixed tags. A tag is removed when ``UNAME_CHAR`` + tag (trailing
  spaces stripped), with or without one space in between, occurs in ``prompt``.
  Everything before ``T_CHAR``, and ``T_CHAR`` itself if present, is unchanged.
  """
  body, tchar_if_any, tag_section = s.partition(T_CHAR)

  kept = []
  for tag in tag_section.split("#"):
    if len(tag) == 0:
      continue
    bare = tag.rstrip(" ")
    tight_form = UNAME_CHAR + bare
    spaced_form = UNAME_CHAR + " " + bare
    if tight_form in prompt or spaced_form in prompt:
      if verbose:
        print(f"found {tag} as {tight_form}")
      continue
    kept.append("#" + tag)

  return body + tchar_if_any + "".join(kept)

# Choose the function that converts the stored text format into what the network is fed.
# V10 takes precedence over V8; the fallback uses forumlike substitution directly.
if V10:
  control_seg_config = CONTROL_SEG_CONFIGS["V10"]
  chinese_to_neural_munger = final_munge_before_neural_v10
  chinese_to_neural_munger_prompt = chinese_to_neural_munger
elif V8:
  control_seg_config = CONTROL_SEG_CONFIGS["V9"]
  chinese_to_neural_munger = final_munge_before_neural_v8
  chinese_to_neural_munger_prompt = chinese_to_neural_munger
else:
  chinese_to_neural_munger = lambda s: substitute_forumlike(normalize_for_generator(s), shuffle=False, infer_first=False, mode=FORUMLIKE_MODE)
  chinese_to_neural_munger_prompt = lambda s: substitute_forumlike(normalize_for_generator(s), shuffle=False, infer_first=False, mode="predict")

train_data_for_selection = train_data_.copy()
# Continuation text with a trailing "<|" removed.
selector_input_continuation = train_data_for_selection.text.apply(lambda s: (s[:-2] if s.endswith("<|") else s))
selector_input_prompt = train_data_for_selection.prompt

# Replace the month/year of each v10 timestamp with a randomly chosen one (unseeded).
if TIMESTAMP_FAKING and not TRAIN_SENTIMENT:
  train_data_for_selection['fake_v10_timestamp'] = train_data_for_selection.v10_timestamp.apply(one_fake_v10_timestamp)

# Append the timestamp side channel to the continuation.
if V10 and not TRAIN_SENTIMENT:
  timestamps_to_add = train_data_for_selection.fake_v10_timestamp if TIMESTAMP_FAKING else train_data_for_selection.v10_timestamp
    
  selector_input_continuation = selector_input_continuation + TIME_SIDECHANNEL_CHAR + timestamps_to_add
elif V8 and not TRAIN_SENTIMENT:
  selector_input_continuation = selector_input_continuation + TIME_SIDECHANNEL_CHAR + train_data_for_selection.v8_timestamp

# Assemble the raw selector input: tail from the second-to-last forumlike control
# segment, the continuation alone, or prompt + continuation.
if LAST_USER_INPUT_ONLY:
  full = train_data_for_selection.prompt + selector_input_continuation
  full = full.apply(chinese_to_neural_munger)

  selector_input = full.apply(
    lambda s: s[find_control_chars_forumlike(s)[-2:][0][1]:]
  )
elif CONTINUATION_ONLY:
  selector_input = selector_input_continuation.copy()
else:
  selector_input = selector_input_prompt + selector_input_continuation

# Either keep the text from the last forumlike control segment onward (continuation-only mode),
# or munge the whole prompt + continuation.
if CONTINUATION_ONLY and INCLUDE_PROMPT_FINALCHAR:
  if FORUMLIKE:
    full = train_data_for_selection.prompt + selector_input_continuation
    full = full.apply(chinese_to_neural_munger)

    selector_input = full.apply(
      lambda s: s[last_control_char_forumlike(s, incl_number=False,)[1]:]
    )
  else:
    selector_input = train_data_for_selection.prompt_finalchar + selector_input
elif FORUMLIKE and not TRAIN_SENTIMENT and not LAST_USER_INPUT_ONLY:
  selector_input_debug = selector_input
  selector_input = selector_input.apply(
    chinese_to_neural_munger
  )
# Munged prompt (with its timestamp side channel), stored as the "prompt_forumlike" column.
if FORUMLIKE and not TRAIN_SENTIMENT:
  if V10:
    timestamps_to_add = train_data_for_selection.fake_v10_timestamp if TIMESTAMP_FAKING else train_data_for_selection.v10_timestamp
    prompt_forumlike_in = selector_input_prompt + TIME_SIDECHANNEL_CHAR + timestamps_to_add
  elif V8:
    prompt_forumlike_in = selector_input_prompt + TIME_SIDECHANNEL_CHAR + train_data_for_selection.v8_timestamp
  else:
    prompt_forumlike_in = selector_input_prompt
  prompt_forumlike = prompt_forumlike_in.apply(
    chinese_to_neural_munger_prompt
  )
  if LAST_USER_INPUT_ONLY:
    prompt_forumlike = prompt_forumlike.apply(
      lambda s: s[find_control_chars_forumlike(s)[-2:][0][1]:]
    )
  train_data_for_selection["prompt_forumlike"] = prompt_forumlike  


# Guarantee a T_CHAR is present when requested.
if FORCE_T_CHAR:
  selector_input = selector_input.apply(lambda s: s if T_CHAR in s else s+T_CHAR)

# NO_TAGS keeps the text before T_CHAR (right-stripped) plus T_CHAR itself and discards the tags;
# otherwise only tags matching usernames in the prompt are removed, then the end is right-stripped.
if NO_TAGS:
  selector_input = selector_input.apply(lambda s: s.partition(T_CHAR)[0].rstrip("\n\ufffa\ufffb ") + s.partition(T_CHAR)[1])
else:
  selector_input.iloc[:] = [strip_uname_tags(s, prompt) for s, prompt in zip(selector_input, train_data_for_selection.prompt)]
  selector_input = selector_input.apply(lambda s: s.rstrip("\n\ufffa\ufffb "))

if TRAIN_SENTIMENT:
  selector_input = selector_input.apply(lambda s: re.sub(r"\<.*?\>", "", s))  # same norm for allen sentiment

if NORMALIZE and ((not FORUMLIKE) or TRAIN_SENTIMENT):
  selector_input = selector_input.apply(normalize_for_generator)
  
if EOT_PREPEND:
  selector_input = EOT_FULL + selector_input

# Sentiment inputs are cut to at most 255 tokens by an encode/decode round trip.
if TRAIN_SENTIMENT:
  selector_input = selector_input.apply(lambda s: enc.decode(enc.encode(s)[:256-1]))

train_data_for_selection["selector_input"] = selector_input

# "prefix" = first non-empty piece of the prompt when split on UNAME_CHAR.
# NOTE(review): presumably consumed as a grouping key by the estimator's calibration split -- confirm.
train_data_for_selection["prefix"] = train_data_for_selection.prompt.apply(
    lambda s: [item for item in s.split(UNAME_CHAR) if len(item)>0][0]
    )

# train/test with groups
# For original posts the prefix is instead the literal "域\n\n" followed by the post text.
train_data_for_selection.loc[train_data_for_selection.prompt_finalchar == ORIG_POST_CHAR_CHINESE, "prefix"] = \
"域\n\n" + train_data_for_selection.loc[train_data_for_selection.prompt_finalchar == ORIG_POST_CHAR_CHINESE, "text"]

# Snapshot of the complete frame before rows are dropped and the data is split.
full_train_data_for_selection = deepcopy(train_data_for_selection)

# NOTE: no random_state is passed to train_test_split, so the split differs between runs.
if TRAIN_SENTIMENT:
  train_data_for_selection, test_data_for_selection = train_test_split(
      train_data_for_selection, test_size=TEST_SIZE,
      stratify=train_data_for_selection.target)  # stratifies on sign of sentiment -- seems OK
else:
  # Rows labelled "域r" are excluded; the split is stratified on label + target jointly.
  train_data_for_selection = train_data_for_selection[
    train_data_for_selection.prompt_finalchar!="域r"]

  stratifier = train_data_for_selection.prompt_finalchar + train_data_for_selection.target.apply(str)
  train_data_for_selection, test_data_for_selection = train_test_split(train_data_for_selection, stratify=stratifier, test_size=TEST_SIZE)

In [None]:
if not TRAIN_SENTIMENT:
  display(train_data_for_selection.groupby(["prompt_finalchar", "target"]).selector_input.count())
  display(test_data_for_selection.groupby(["prompt_finalchar", "target"]).selector_input.count())

In [None]:
import selector_nn
import selector_estimator

In [None]:
train_data_for_selection["n_tokens"] = train_data_for_selection["selector_input"].apply(lambda s: len(enc.encode(s)))
if not TRAIN_SENTIMENT:
  train_data_for_selection["prompt_end_ntoks"] = train_data_for_selection["prompt_forumlike"].apply(lambda s: len(enc.encode(s)))

train_data_for_selection_final = selector_estimator.reshuffle_batches(train_data_for_selection, batch_size=8)

In [None]:
test_data_for_selection["n_tokens"] = test_data_for_selection["selector_input"].apply(lambda s: len(enc.encode(s)))
if not TRAIN_SENTIMENT:
  test_data_for_selection["prompt_end_ntoks"] = test_data_for_selection["prompt_forumlike"].apply(lambda s: len(enc.encode(s)))

test_data_for_selection_final = selector_estimator.reshuffle_batches(test_data_for_selection, batch_size=8)

In [None]:
print(train_data_for_selection_final.shape)
print(test_data_for_selection_final.shape)

Fit

In [None]:
lengths_to_try = [int(train_data_for_selection_final.n_tokens.quantile(q)) for q in [0.95, 0.97, 0.975, 0.98,]] + [825]

lengths_to_try

In [None]:
# Switches for this cell; EXECUTIVE_DECISIONS_ON and EXPERIMENTAL patch the selector config below.
EXPERIMENTAL = False
EXECUTIVE_DECISIONS_ON = True # !

CALIB_DEFER_TO_EST = True

# When enabled (and not training sentiment), est_config['length'] is later replaced by
# lengths_to_try[TRY_SHORT_LENGTH_IX].
TRY_SHORT_LENGTH = True
TRY_SHORT_LENGTH_IX = -4

# Keyword arguments for SelectorEstimatorFromCkpt: one configuration for the sentiment
# model and one for the selector model.
if TRAIN_SENTIMENT:
  SENTIMENT_LOGIT_SUPERVISION = True
  SENTIMENT_2D_LOGIT_SUPERVISION = False
  SENTIMENT_P75 = False

  est_config = {
          "layer_nums": (7, 23),
          "huber": True,
          "show_batch_stats": False,
          "orth_init": True,
          "use_mlp": True,
          "resid_mlp": True,
          "mlp_ratio": 1.,
          "epochs": 3,
          "acti_dropout": 0.15,
          "res_dropout": 0.1,
          "attn_dropout": 0.1,
          "weight_decay": 0.01,
          "base_lr": 5e-5,
          "min_lr_frac": 0.05,
          "warmup_ratio": 0.1,
          "warm_resets": False,
          "grad_clip": 1000.,
          "length": 204,
          "evaluate_during_training": False,
          "supervise_logits": SENTIMENT_LOGIT_SUPERVISION,
          "supervise_only_logit_diff": not SENTIMENT_2D_LOGIT_SUPERVISION,
          "use_only_logit_diff": False,
          "calibrate": False,
          "calibration_split_type": "tts",
          "calibration_val_size": 0.01,
          "init_default_gain": 1,
          "n_head": 32}

else:
  est_config = {"layer_nums": (7, 23 ),
  "use_mlp": True,
  "resid_mlp": True,
  "mlp_ratio": 2,
  "flooding": True,
  "flood_level": 0.0667555,
  "orth_init": True,
  "stop_early": False,
  "warmup_ratio": 0.0318089,
  "epochs": 4,
  "res_dropout": 0.0853267,
  "acti_dropout": 0.279725,
  "attn_dropout": 0.115453,
  "weight_decay": 0.05,
  "base_lr": 2.58815e-5,
  "min_lr_frac": 0.435668,
  "m_mul": 0.667,
  "grad_clip": 1000.,
  "length": 825,
  "supervise_logits": False,
  "supervise_only_logit_diff": False,
  "calibrate": True,
  "calibration_split_type": "ttsp",
  "calibrate_prefixes_separately": False,
  "calibration_val_size": 1/8,
  "use_only_logit_diff": False,
  "init_default_gain": 1.,
  "n_head": 40,
  }

  # Manual overrides applied on top of the dict above (dated by the author).
  if EXECUTIVE_DECISIONS_ON:
    est_config['epochs'] = 3 # ! (1/26/21)
    est_config['calibration_val_size'] = 0.1 # ! (1/27/21)

    est_config['mlp_ratio'] = 3
    est_config['acti_dropout'] = 0
    est_config['res_dropout'] = 0.1
    est_config['attn_dropout'] = 0.1
    est_config['weight_decay'] = 0.025

    est_config['min_lr_frac'] = 0.25
    est_config['warmup_ratio'] = 0.05

  # Experimental variant: one extra full block and a single layer number.
  if EXPERIMENTAL:
    est_config['additional_full_blocks'] = 1
    est_config['layer_nums'] = (23,)

# lengths_to_try holds four n_tokens quantiles (0.95, 0.97, 0.975, 0.98) followed by 825,
# so index -4 selects the second entry, i.e. the 0.97 quantile.
if TRY_SHORT_LENGTH and not TRAIN_SENTIMENT:
  est_config['length'] = lengths_to_try[TRY_SHORT_LENGTH_IX]
  print(f"trying length: {est_config['length']}\n")
  print("cases clipped w/ this length")
  # Number and fraction of training rows with more tokens than the chosen length.
  display((train_data_for_selection.n_tokens > est_config['length']).agg(['sum', 'mean']))
  print(f"\nn_head: {est_config['n_head']}")

In [None]:
import tflex

# Build the estimator from the latest checkpoint in ckpt_dir; est_config supplies the
# hyperparameters. The last token id of "<|endoftext|>" is passed as selection_tok.
est=selector_estimator.SelectorEstimatorFromCkpt(
    cleanup_on_exception=False,
    persist_variables=False,
    ckpt=tflex.latest_checkpoint(ckpt_dir),
    enc=enc,
    selection_tok=enc.encode("<|endoftext|>")[-1],
    **est_config
    )

# Choose features/targets for fit(): the two sentiment logits when supervising logits,
# otherwise the binary `target` column.
if TRAIN_SENTIMENT and SENTIMENT_LOGIT_SUPERVISION:
  avg_loss_beta = 0.995
  if True:#SENTIMENT_LOGIT_SUPERVISION:
    final_fit_X = train_data_for_selection_final.drop(["neg_logit", "pos_logit"], axis=1)
    final_fit_target = train_data_for_selection_final[["neg_logit", "pos_logit"]]
  else:
    # unreachable while the condition above is hard-coded to True
    final_fit_X = train_data_for_selection_final.drop("logit_diff", axis=1)
    final_fit_target = train_data_for_selection_final.logit_diff
else:
  avg_loss_beta = 0.99
  final_fit_X = train_data_for_selection_final.drop("target", axis=1)
  final_fit_target = train_data_for_selection_final.target

# Best-effort cleanup before fitting. Catch Exception rather than everything, so that
# KeyboardInterrupt/SystemExit still propagate, and report why cleanup was skipped.
try:
  est.cleanup()
except Exception as cleanup_error:
  print(f"est.cleanup() skipped: {cleanup_error!r}")

In [None]:
est.fit(final_fit_X, final_fit_target, avg_loss_beta=avg_loss_beta)

Evaluate

(This is very old code and could be cleaned up)

In [None]:
sess = est.session_
selection_step_train = est.selection_step_train_
selection_step_eval = est.selection_step_eval_

In [None]:
lr = est.lr_
opt = est.opt_

opt_apply = est.opt_apply_
select_loss = est.select_loss_

select_logits_train = est.select_logits_train_
select_logits_eval = est.select_logits_eval_

if TRAIN_SENTIMENT and SENTIMENT_LOGIT_SUPERVISION and not SENTIMENT_2D_LOGIT_SUPERVISION:
  select_target = est.select_target_logit_diff_
else:
  select_target = est.select_target_

context_for_h = est.context_for_h_
global_step = est.global_step_

In [None]:
import scipy.special

def predict_select(data_batch, threshold=0.5, truncate_at_right=TRUNCATE_AT_RIGHT,
                   ):
  """Run the eval-mode selector logits on one batch.

  Returns a dict with ``"logits"`` (raw session output), ``"probs"`` (softmax over
  axis 1) and ``"preds"`` (True where the probability in column 1 exceeds ``threshold``).
  Raises ValueError("badlength") when the batch is not exactly the expected size.
  ``truncate_at_right`` is accepted but not used in the body.
  """
  # NOTE(review): batch_size_for_h is not defined in any visible cell (the evaluation loop
  # uses est.batch_size and est._predict_select instead) -- confirm where it comes from,
  # otherwise this line raises NameError.
  if len(data_batch) != batch_size_for_h:
    raise ValueError("badlength")
  feed_dict = est._feed_from_batch(data_batch, scope=est.select_scope_eval_)

  with sess.as_default():
    logits = sess.run(select_logits_eval, feed_dict=feed_dict)

  probs = scipy.special.softmax(logits, axis=1)
  results = {"logits": logits, "probs": probs}
  results["preds"] = probs[:, 1]>threshold
  return results

In [None]:
import time 

def eval_selection(data, steps=None, start_ix=0, silent=False, truncate_at_right=TRUNCATE_AT_RIGHT):
  """Score `data` one batch at a time with the estimator and collect the outputs.

  Args:
    data: DataFrame of examples.  Needs `neg_logit`/`pos_logit` columns when
      TRAIN_SENTIMENT is set, otherwise a `target` column.
    steps: exclusive upper bound on the batch index.  Defaults to
      len(data) // est.batch_size, so a trailing partial batch is not scored.
    start_ix: index of the first batch to score.
    silent: if True, suppress the per-batch printout and iterate through a
      tqdm wrapper instead.
    truncate_at_right: accepted for backward compatibility; not used here.

  Returns:
    Tuple (all_preds, all_probs, all_targets, all_logits, all_row_ix,
    all_target_logits).  all_probs is a 1-D array of class-1 probabilities
    (empty if no batch was scored); the rest are lists with one entry per
    scored row.  all_target_logits stays empty unless TRAIN_SENTIMENT is set.
    all_row_ix holds the positional row indices into `data` that were scored,
    which lets callers re-align results when batches were skipped.
  """
  all_preds = []
  all_probs = []
  all_target_logits = []
  all_targets = []
  all_logits = []
  all_row_ix = []

  if steps is None:
    steps = len(data)//est.batch_size

  step_range = range(start_ix, steps)
  step_iter = tqdm(list(step_range)) if silent else step_range
  for step_ix in step_iter:
    # Derive the row offset from the step index.  The previous running counter
    # was only advanced at the bottom of the loop, so the `continue` on a
    # failed batch left it stuck and every later step re-read the same rows.
    row_ix = step_ix*est.batch_size
    data_batch = data.iloc[row_ix:row_ix + est.batch_size, :]

    t1 = time.time()
    try:
      results_batch = est._predict_select(data_batch, disable_calibration=False)
    except Exception as e:
      # Best effort: a batch the estimator cannot score is skipped, not fatal.
      if not silent:
        print(f"skipping batch ({e})")
      continue
    tdiff = time.time() - t1

    if TRAIN_SENTIMENT:
      all_target_logits.extend(data_batch[["neg_logit", "pos_logit"]].values)
      # Binary label: positive when the target pos logit exceeds the neg logit.
      all_targets.extend((data_batch.pos_logit > data_batch.neg_logit).astype(int))
    else:
      all_targets.extend(data_batch.target.values)

    all_preds.extend(results_batch["preds"])
    all_probs.extend(results_batch["probs"])
    all_logits.extend(results_batch["logits"])
    all_row_ix.extend(range(row_ix, row_ix + est.batch_size))

    assert len(all_targets) == len(all_preds)

    if not silent:
      # Running metrics over everything scored so far; they are only needed
      # for the printout, so skip the work entirely in silent mode.
      preds_so_far = np.array(all_preds)
      targets_so_far = np.array(all_targets)
      avg_acc = (preds_so_far == targets_so_far).mean()
      tp = (targets_so_far>0).sum()  # count of positive targets (printed as "true pos")
      pp = (preds_so_far>0).sum()    # count of positive predictions

      print(f"{step_ix}/{steps} | {tdiff:.2f}s | acc={avg_acc:.4f} | {tp}/{len(all_targets)} true pos | {pp}/{len(all_targets)} pred pos")
      print("\n--------------\n")

  # np.stack raises on an empty list, so return an empty array when nothing
  # was scored (no steps, or every batch was skipped).
  all_probs = np.stack(all_probs)[:, 1] if all_probs else np.array([])
  return all_preds, all_probs, all_targets, all_logits, all_row_ix, all_target_logits

In [None]:
# Score the held-out frame from the first batch onward; the unpacked names are
# the module-level inputs of the plotting and metric cells that follow.
all_preds, all_probs, all_targets, all_logits, all_row_ix, all_target_logits = eval_selection(
    test_data_for_selection_final, start_ix=0
    )

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def _scatter_against_target(ax, target_vals, pred_vals, **scatter_kwargs):
  """Scatter predictions against targets on `ax` and overlay a dashed red
  y = x line spanning the range of the target values."""
  ax.scatter(target_vals, pred_vals, **scatter_kwargs)
  lo, hi = target_vals.min(), target_vals.max()
  ax.plot([lo, hi], [lo, hi], c='r', ls='--')


# Turn the accumulated evaluation outputs into arrays for the vectorized math below.
all_logits = np.array(all_logits)
all_target_logits = np.array(all_target_logits)
all_targets = np.asarray(all_targets)
all_preds = np.asarray(all_preds)

if TRAIN_SENTIMENT:
  # Column 0 is the negative logit, column 1 the positive logit.
  all_logit_sums = (all_logits[:, 1] + all_logits[:, 0])
  all_target_logit_sums = (all_target_logits[:, 1] + all_target_logits[:, 0])

  all_logit_diffs = (all_logits[:, 1] - all_logits[:, 0])
  all_target_logit_diffs = (all_target_logits[:, 1] - all_target_logits[:, 0])

  # Fraction of rows where predicted and target logits agree on which class wins.
  print(((all_logits[:, 1] > all_logits[:, 0]) == (all_target_logits[:, 1] > all_target_logits[:, 0])).mean())

  # Mean squared error over both logits.
  print(((all_target_logits - all_logits)**2).mean())

  # Distribution of the squared and signed logit-diff errors.
  display(pd.Series((all_target_logit_diffs - all_logit_diffs)**2).describe())

  display(pd.Series(all_target_logit_diffs - all_logit_diffs).describe())

  FIGSIZE = (16, 6)
  MS = 3
  ALPHA = 0.5

  # Figure 1: predicted vs target logit difference (left) and logit sum (right).
  fig, axes = plt.subplots(1, 2, figsize=FIGSIZE)

  _scatter_against_target(axes[0], all_target_logit_diffs, all_logit_diffs, s=MS, alpha=ALPHA)
  axes[0].axhline(0, c='g', ls='--')
  axes[0].axvline(0, c='g', ls='--')
  axes[0].set_title("logit diff")

  _scatter_against_target(axes[1], all_target_logit_sums, all_logit_sums, s=MS, alpha=ALPHA)
  axes[1].set_title("logit sum")

  plt.tight_layout()

  # Figure 2: predicted vs target value of each individual logit.
  fig, axes = plt.subplots(1, 2, figsize=FIGSIZE)

  for col, title in enumerate(["neg logit", "pos logit"]):
    _scatter_against_target(axes[col], all_target_logits[:, col], all_logits[:, col], s=MS, alpha=ALPHA)
    axes[col].set_title(title)

  plt.tight_layout()

  # Attach targets, predictions and errors to the scored rows for manual review.
  review_data = test_data_for_selection_final.iloc[:len(all_probs)].copy()
  review_data["logit_diff"] = all_target_logit_diffs
  review_data["logit_diff_pred"] = all_logit_diffs

  review_data["err"] = review_data.logit_diff - review_data.logit_diff_pred
  review_data["sq_err"] = review_data.err.values**2

  # Show the 20 largest squared errors first, then the 20 smallest.
  review_cols = ["selector_input", "logit_diff", "logit_diff_pred", "err", "sq_err"]
  for ascending in (False, True):
    with pd.option_context("display.max_rows", 999, "display.max_colwidth", 200):
      display(review_data.sort_values("sq_err", ascending=ascending)[review_cols].head(20))

In [None]:
# new
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(2, 2, figsize=(20, 12))

bins=np.linspace(0., 1., 7)
hist_style = dict(alpha=0.5, density=False, edgecolor='k')

# Rows of the test frame that line up with all_probs.
evaluated_rows = test_data_for_selection_final.iloc[:len(all_probs)]

# One row of panels per group: top = prompt_finalchar differs from
# ORIG_POST_CHAR_CHINESE, bottom = prompt_finalchar equals it.
panel_rows = [
    ("resp, raw", evaluated_rows.prompt_finalchar != ORIG_POST_CHAR_CHINESE),
    ("orig, raw", evaluated_rows.prompt_finalchar == ORIG_POST_CHAR_CHINESE),
]

for row, (title, in_group) in enumerate(panel_rows):
  # Left panel: every predicted probability in the group.
  ax[row, 0].hist(all_probs[in_group], bins=bins, **hist_style)
  ax[row, 0].set_xticks(bins)
  ax[row, 0].set_title(title)

  # Right panel: the same group split by target (< 0.5 first, then > 0.5),
  # drawn as two overlaid histograms sharing the bin edges.
  _, bins, _ = ax[row, 1].hist(all_probs[(evaluated_rows.target < 0.5) & in_group], bins=bins, **hist_style)
  ax[row, 1].hist(all_probs[(evaluated_rows.target > 0.5) & in_group], bins=bins, **hist_style)
  ax[row, 1].set_xticks(bins)
  ax[row, 1].set_title(title)

In [None]:
# new
from sklearn.metrics import brier_score_loss, average_precision_score, accuracy_score, log_loss, confusion_matrix

# Attach the model outputs to the test rows that line up with all_probs.
review_data = test_data_for_selection_final.iloc[:len(all_probs)].copy()
review_data["prob"] = all_probs
review_data["neg_logits"] = [row_logits[0] for row_logits in all_logits]
review_data["pos_logits"] = [row_logits[1] for row_logits in all_logits]
review_data["logit_diffs"] = review_data.pos_logits - review_data.neg_logits

# Work on a separate copy so review_data itself keeps only the columns above.
err_size = (review_data.prob - review_data.target).abs()
review_data_ = review_data.copy()
review_data_["err_size"] = err_size

review_data_["is_orig"] = review_data_.prompt_finalchar==ORIG_POST_CHAR_CHINESE

# Per prompt_finalchar group: model metrics next to the baseline obtained by
# always predicting the group's mean target.
for pfc, g in review_data_.groupby("prompt_finalchar"):
  print(f"pfc={pfc} ({len(g)}/{len(review_data_)})")

  base_rate = g.target.mean()
  constant_guess = np.full(len(g.target), base_rate)

  baserate_acc = max(base_rate, 1.-base_rate)
  baserate_brier = brier_score_loss(g.target, constant_guess)
  baserate_AP = average_precision_score(g.target, constant_guess)

  print(f"acc:   {accuracy_score(g.target, g.prob>0.5):.3f} (vs {baserate_acc:.3f})")
  print(f"brier: {brier_score_loss(g.target, g.prob):.3f} (vs {baserate_brier:.3f})")
  print(f"AP:    {average_precision_score(g.target, g.prob):.3f} (vs {baserate_AP:.3f})")
  print()

In [None]:
# new 
from sklearn.metrics import brier_score_loss, average_precision_score, accuracy_score, log_loss, confusion_matrix

# Baselines come from always predicting the mean target.  They are computed on
# all_targets -- the rows the model was actually scored on -- so the model
# numbers and the "vs" numbers describe the same population.  (Using the whole
# test frame also counted rows that were never scored, e.g. a trailing partial
# batch.)
eval_targets = np.asarray(all_targets)
base_rate = eval_targets.mean()
constant_guess = np.full(len(eval_targets), base_rate)

baserate_acc = max(base_rate, 1.-base_rate)
baserate_brier = brier_score_loss(eval_targets, constant_guess)
baserate_AP = average_precision_score(eval_targets, constant_guess)

print(f"acc:   {accuracy_score(all_targets, all_preds):.3f} (vs {baserate_acc:.3f})")
print(f"brier: {brier_score_loss(all_targets, all_probs):.3f} (vs {baserate_brier:.3f})")
print(f"AP:    {average_precision_score(all_targets, all_probs):.3f} (vs {baserate_AP:.3f})")

Save

In [None]:
# Version labels that make up the local save directory.  The name
# `majorverson` (sic) is kept as-is because it is a module-level name.
majorverson = "v10"

# Local save directory and GCS destination for the model being trained.
if TRAIN_SENTIMENT:
  minorversion = "v2"  # manually updated each re-train
  save_path = f"sentiment/{majorverson}/{minorversion}/"
  gs_path = f"gs://{BUCKET_NAME}/ar_model_v10/v10_sentiment/"
else:
  minorversion = "v8"  # manually updated each re-train
  save_path = f"selector/{majorverson}/{minorversion}/"
  gs_path = f"gs://{BUCKET_NAME}/ar_model_v10/v10_selector/"

In [None]:
# Create the local save directory, or warn when it is already there.
if os.path.exists(save_path):
  print(f"warning: save_path already exists")
else:
  os.makedirs(save_path, exist_ok=True)

print(save_path)

In [None]:
# Write the estimator to the local save directory chosen above.
est.save(save_path)

In [None]:
# List the saved files (with sizes) as a sanity check before uploading.
%ls -lha {save_path}

In [None]:
# Recursively copy the local save directory to the GCS destination.
!gsutil -m cp -R {save_path} {gs_path}