In [None]:
from fastai.text import *
import html
import json

In [None]:
import os
# Restrict this process to physical GPU 1, numbered in PCI bus order.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"
# When the mask above is honoured, the only visible GPU is re-indexed as
# device 0, so asking for ordinal 1 would be an invalid device ordinal.
# If CUDA was already initialised before the mask was set (mask ignored and
# several GPUs still visible), keep the original choice of ordinal 1.
torch.cuda.set_device(0 if torch.cuda.device_count() <= 1 else 1)

In [None]:
# Root directory of the SemEval-19 hyperpartisan data. Set SEMEVAL_DATA_ROOT to
# run on another machine; the original location stays the default.
# The path is made absolute because the CSV writers join their output directory
# with targets that may already carry that directory as a prefix.
data_root_path = os.path.abspath(os.environ.get(
    'SEMEVAL_DATA_ROOT', '/home/users2/dayaniey/mardy/eday/semeval19-hyperpartisan/data'))
PATH=Path(data_root_path)

In [None]:
# Marker tokens that get_texts prepends to every document before tokenisation.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [None]:
# Output directories for the classifier and language-model CSVs; both live
# directly under the data root and are created on first use.
CLAS_PATH = Path(f'{data_root_path}/semeval_clf/')
LM_PATH = Path(f'{data_root_path}/semeval_lm/')

CLAS_PATH.mkdir(exist_ok=True)
LM_PATH.mkdir(exist_ok=True)

In [None]:
# Label vocabularies for the 'hyperpartisan' and 'bias' fields of the label file.
# NOTE(review): neither list is referenced by the other cells shown here —
# confirm whether they are still needed.
HP_CLASSES = ['True', 'False']
B_CLASSES  = ['right','left','least','right-center','left-center']

In [None]:
def filter_out(sentences):
    """Hook for dropping unwanted sentences from an article.

    Currently a pass-through: the very same object is handed back unchanged.
    """
    kept = sentences
    return kept

In [None]:
def _write_for_classification(texts,hps,bis,aids,col_names,out_csv_file):
    """Append one bucket of labelled articles to a classifier CSV.

    texts, hps, bis and aids are parallel sequences (article text, hyperpartisan
    label, bias label, article id). col_names fixes the column order of the
    written rows. The rows are appended without header or index to
    CLAS_PATH/out_csv_file.
    """
    column_data = {
        'text': np.array(texts),
        'hyperpartisan': np.array(hps),
        'bias': np.array(bis),
        'article_id': np.array(aids),
    }
    frame = pd.DataFrame(column_data, columns=col_names)
    target = CLAS_PATH/out_csv_file
    frame.to_csv(target, mode='a', header=False, index=False)

def _write_for_lm(p,texts,hps,bis,aids,col_names,out_csv_file):
    """Append one bucket of articles to the language-model train or validation CSV.

    The whole bucket goes to the training file with probability p and to the
    validation file otherwise, so the split is made per bucket, not per article.

    Parameters:
        p: probability that the bucket is written to the training file.
        texts, aids: parallel sequences of article text and article id.
        hps, bis, col_names: accepted for signature parity with
            _write_for_classification; not used here.
        out_csv_file: pair (train_name, val_name), each joined onto LM_PATH.

    Every row is written as text, dummy label "0", article id, without header
    or index.
    """
    trn_name,val_name = out_csv_file
    # Pin the column order explicitly: get_texts reads column 0 as the text and
    # column 1 as the label, and pandas/Python versions that sort dict keys
    # would otherwise put article_id first.
    df = pd.DataFrame({'text':np.array(texts),'labels':np.array(["0"]*len(texts)),'article_id':np.array(aids)},
                      columns=['text','labels','article_id'])
    if random.random() > p:
        df.to_csv(LM_PATH/val_name,mode='a',header=False, index=False)
    else:
        df.to_csv(LM_PATH/trn_name,mode='a',header=False, index=False)
        
def write_to_csv(path,path_label,out_csv_file,is_clf,p=0.85):
    """Convert an article file and its label file into CSV rows, bucket by bucket.

    Both files are read line by line, one JSON record per line, and paired
    positionally with zip. For each pair the article sentences (key 'ps') are
    passed through filter_out and joined with spaces. An article-id mismatch
    between the two records is reported with print but the row is still written.

    Parameters:
        path: article file, one JSON object per line.
        path_label: label file, one JSON object per line, in the same order.
        out_csv_file: file name for the classifier writer, or a
            (train_name, val_name) pair for the language-model writer.
        is_clf: True routes buckets to _write_for_classification, False to
            _write_for_lm.
        p: forwarded to _write_for_lm as the probability of the training file.

    Rows are buffered in buckets of 200 to limit the number of writes. For the
    language-model path the bucket is also the unit of the train/validation
    split, because _write_for_lm routes each bucket as a whole.
    """
    col_names_clf = ['text','hyperpartisan','bias','article_id']
    col_names_lm = ['text','fake_label','article_id']
    bucket_size = 200

    def _flush(texts,hps,bis,aids):
        # Single dispatch point so the in-loop flush and the trailing flush cannot diverge.
        if is_clf:
            _write_for_classification(texts,hps,bis,aids,col_names_clf,out_csv_file)
        else:
            _write_for_lm(p,texts,hps,bis,aids,col_names_lm,out_csv_file)

    texts,aids,hps,bis = [],[],[],[]
    with open(path) as fin,open(path_label) as fin2:
        for instance_json, label_json in zip(fin, fin2):
            instance         = json.loads(instance_json)
            instance_label   = json.loads(label_json)
            article_id       = instance['article_id']
            label_article_id = instance_label['article_id']
            if label_article_id != article_id:
                print("######## ERROR ########:",article_id," ",label_article_id)
            texts.append(" ".join(filter_out(instance['ps'])))
            aids.append(article_id)
            hps.append(instance_label['hyperpartisan'])
            bis.append(instance_label['bias'])
            if len(texts) == bucket_size:
                _flush(texts,hps,bis,aids)
                texts,aids,hps,bis = [],[],[],[]
        # Write whatever is left over after the last full bucket.
        if texts:
            _flush(texts,hps,bis,aids)

In [None]:
# Prepare csv files for Classification
# The writers append (mode='a'), so stale files are removed first; otherwise
# re-running this cell would silently duplicate every row.
for csv_name in ('train.csv', 'val.csv'):
    if (CLAS_PATH/csv_name).exists():
        (CLAS_PATH/csv_name).unlink()
# Pass bare file names: the writer joins them onto CLAS_PATH itself, so a
# CLAS_PATH prefix here only worked because the data root is absolute.
write_to_csv(PATH/'train.json',PATH/'train_label.json','train.csv',True)
write_to_csv(PATH/'val.json',PATH/'val_label.json','val.csv',True)

In [None]:
# Prepare csv files for LM model
# The writer appends (mode='a'), so stale files are removed first; otherwise
# re-running this cell would silently duplicate every row.
for csv_name in ('train.csv', 'val.csv'):
    if (LM_PATH/csv_name).exists():
        (LM_PATH/csv_name).unlink()
# Pass bare file names: the writer joins them onto LM_PATH itself, so an
# LM_PATH prefix here only worked because the data root is absolute.
write_to_csv(PATH/'train.json',PATH/'train_label.json',('train.csv','val.csv'),False)

In [None]:
## Language Model Tokens
chunksize=24000  # rows per chunk when the CSVs are streamed with pandas

re1 = re.compile(r'  +')  # runs of two or more spaces
def fixup(x):
    """Clean up markup residue in a raw document string.

    Applies a fixed sequence of literal substitutions (the order matters
    because later rules can act on the output of earlier ones), then
    HTML-unescapes the result and collapses runs of spaces to a single space.
    """
    substitutions = (
        ('#39;', "'"), ('amp;', '&'), ('#146;', "'"), ('nbsp;', ' '), ('#36;', '$'),
        ('\\n', "\n"), ('quot;', "'"), ('<br />', "\n"), ('\\"', '"'), ('<unk>', 'u_n'),
        (' @.@ ', '.'), (' @-@ ', '-'), ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

def get_texts(df, n_lbls):
    """Tokenise one chunk of a header-less CSV and return (tokens, labels).

    Column 0 is taken as the document text and column 1 as the integer label.
    Each document is prefixed with the BOS/FLD marker tokens and cleaned with
    fixup before being tokenised in parallel. n_lbls is accepted but not used.
    """
    label_values = df.iloc[:,1].values.astype(np.int64)
    prefix = f'\n{BOS} {FLD} 1 '
    cleaned = (prefix + df[0].astype(str)).apply(fixup)
    documents = list(cleaned.values)

    tokens = Tokenizer().proc_all_mp(partition_by_cores(documents))
    return tokens, list(label_values)

def get_all(df, n_lbls):
    """Tokenise every chunk yielded by df and concatenate the results.

    df is any iterable of DataFrame chunks. The chunk index is printed as a
    progress indicator. Returns (tokens, labels) as two flat lists.
    """
    all_tokens, all_labels = [], []
    for chunk_idx, chunk in enumerate(df):
        print(chunk_idx)
        chunk_tokens, chunk_labels = get_texts(chunk, n_lbls)
        all_tokens.extend(chunk_tokens)
        all_labels.extend(chunk_labels)
    return all_tokens, all_labels


In [None]:
# Stream both LM CSVs in chunks of `chunksize` rows. header=None because the
# writers emit no header row.
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None,chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'val.csv', header=None,chunksize=chunksize)

In [None]:
# Tokenise every chunk. The second argument (1) is not used by get_texts.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

In [None]:
# Directory for cached language-model artefacts.
(LM_PATH/'tmp').mkdir(exist_ok=True)

In [None]:
# Cache the tokenised documents so the tokenisation cell need not be re-run.
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [None]:
# The cached token lists are stored as pickled object arrays (documents have
# different lengths); NumPy >= 1.16.3 refuses to load those unless
# allow_pickle=True is given. Only load caches written by this notebook.
tok_trn = np.load(LM_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(LM_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [None]:
# Count token occurrences over all training documents and show the 25 most frequent.
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

In [None]:
# Vocabulary limits used when building itos in the next cell.
max_vocab = 60000
min_freq = 2

In [None]:
# Keep at most max_vocab tokens, and only those seen strictly more than min_freq times.
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
# Two inserts at position 0 leave '_unk_' at index 0 and '_pad_' at index 1.
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [None]:
# Token -> index mapping; tokens outside the vocabulary map to 0 ('_unk_').
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
# Sanity check: show the first tokenised training document.
print(tok_trn[0])

In [None]:
# Replace every token by its vocabulary index.
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [None]:
# Cache the numericalised documents and the vocabulary.
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
# Use a context manager so the pickle file is flushed and closed before the
# next cell reads it back.
with open(LM_PATH/'tmp'/'itos.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)

In [None]:
# Reload the cached ids and vocabulary. allow_pickle=True is needed on
# NumPy >= 1.16.3 because the id arrays are stored as object arrays.
# Only load caches written by this notebook: unpickling can run arbitrary code.
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy', allow_pickle=True)
with open(LM_PATH/'tmp'/'itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)

In [None]:
# Vocabulary size, including the '_unk_' and '_pad_' entries.
vs=len(itos)

In [None]:
# Reduce the number of instances: keep the first 90000 training and 10000 validation documents.
trn_lm = trn_lm[:90000] 
val_lm = val_lm[:10000]

In [None]:
# One-off download of the pretrained WikiText-103 model files (kept commented out).
# ! wget -nH -r -np -P {PATH} http://files.fast.ai/models/wt103/

In [None]:
# Embedding size, hidden size and number of layers; the classifier cell passes
# the same names as emb_sz, n_hid and n_layers.
em_sz,nh,nl = 400,1150,3

In [None]:
# Location of the pretrained forward language model.
PRE_PATH = PATH/'models'/'wt103'
PRE_LM_PATH = PRE_PATH/'fwd_wt103.h5'

In [None]:
# The identity map_location returns each storage as deserialised, which is the
# usual PyTorch idiom for keeping the loaded tensors on the CPU.
wgts = torch.load(PRE_LM_PATH, map_location=lambda storage, loc: storage)

In [None]:
enc_wgts = to_np(wgts['0.encoder.weight'])
row_m = enc_wgts.mean(0) # mean embedding vector, used for tokens missing from the Wikipedia vocabulary

In [None]:
# Load wikipedia dictionary
# NOTE: pickle.load can execute arbitrary code; only use the file obtained from
# the trusted fast.ai download. The context manager closes the file handle.
with (PRE_PATH/'itos_wt103.pkl').open('rb') as itos_file:
    itos2 = pickle.load(itos_file)
# Tokens absent from the Wikipedia vocabulary map to -1.
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

In [None]:
# Match the two dictionaries (hyperpartisan corpus and Wikipedia).
# Row i of new_w is the pretrained vector of itos[i] when that token exists in
# the Wikipedia vocabulary, otherwise the mean vector row_m.
new_w = np.zeros((vs, em_sz), dtype=np.float32)
for i,w in enumerate(itos):
    r = stoi2[w]
    new_w[i] = enc_wgts[r] if r>=0 else row_m

In [None]:
# Store the remapped matrix under the three state-dict keys below; the last two get independent copies.
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w)) # decoder weights: map the final prediction back to words (output embedding)

In [None]:
# LANGUAGE MODEL pretrained on wikipedia

In [None]:
# Language-model fine-tuning hyperparameters: weight decay, bptt, batch size and optimizer.
wd=1e-7
bptt=70
bs=52 # It was 52 
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))

In [None]:
# Concatenate all documents of a split into one id stream and wrap it in a loader.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)

In [None]:
# NOTE(review): the positional 1 is presumably the padding index ('_pad_' sits
# at index 1 of itos) — confirm against LanguageModelData.
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
# Five dropout rates, each scaled by 0.7; passed by keyword to get_model below.
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7 

In [None]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])

learner.metrics = [accuracy]
learner.freeze_to(-1) # The lecture video calls unfreeze() here while the reference notebook uses freeze_to(-1); TODO confirm whether the two are equivalent.

In [None]:
# Inspect the layer groups that freeze_to / unfreeze operate on.
learner.get_layer_groups()

In [None]:
# Load the pretrained weights whose embedding matrices were remapped to this vocabulary.
learner.model.load_state_dict(wgts)

In [None]:
# Base learning rate for language-model fine-tuning.
lr=1e-3
lrs = lr

In [None]:
# First pass at half the base learning rate, with freeze_to(-1) still in effect.
learner.fit(lrs/2, 1, wds=wd, use_clr=(32,2), cycle_len=1)

In [None]:
# Checkpoint after the first pass; the next cell reloads it so work can resume from here.
learner.save('lm_last_ft')

In [None]:
learner.load('lm_last_ft')

In [None]:
# Make all layer groups trainable for the main fine-tuning run.
learner.unfreeze()

In [None]:
# Learning-rate sweep between lrs/10 and lrs*10.
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [None]:
learner.sched.plot()

In [None]:
# Main fine-tuning run at the base learning rate with cycle_len=8.
learner.fit(lrs, 1, wds=wd, use_clr=(20,10), cycle_len=8)

In [None]:
# Save the full model, and separately the encoder under the name the classifier loads later ('lm1_enc').
learner.save('lm1')

In [None]:
learner.save_encoder('lm1_enc')

In [None]:
# Reload the checkpoints just written.
learner.load('lm1')

In [None]:
learner.load_encoder('lm1_enc')

In [None]:
learner.sched.plot_loss()

In [None]:
# CLASSIFIER TOKENS

In [None]:
# Stream the classifier CSVs in chunks. Columns are written in the order text,
# hyperpartisan, bias, article_id, so get_texts takes column 1 (hyperpartisan)
# as the label.
df_trn = pd.read_csv(CLAS_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(CLAS_PATH/'val.csv', header=None, chunksize=chunksize)

In [None]:
# Tokenise every chunk. The second argument (1) is not used by get_texts.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

In [None]:
# Cache the classifier tokens and labels.
(CLAS_PATH/'tmp').mkdir(exist_ok=True)

np.save(CLAS_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(CLAS_PATH/'tmp'/'tok_val.npy', tok_val)

np.save(CLAS_PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(CLAS_PATH/'tmp'/'val_labels.npy', val_labels)

In [None]:
# The cached token lists are stored as pickled object arrays (documents have
# different lengths); NumPy >= 1.16.3 refuses to load those unless
# allow_pickle=True is given. Only load caches written by this notebook.
tok_trn = np.load(CLAS_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(CLAS_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [None]:
# Load the dictionary built for the language model (the hyperpartisan
# vocabulary) so the classifier uses exactly the same token indices.
# The context manager closes the file handle once the pickle has been read.
with (LM_PATH/'tmp'/'itos.pkl').open('rb') as itos_file:
    itos = pickle.load(itos_file)
# Tokens outside the vocabulary map to 0 ('_unk_').
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
# String to integer indices, using the language-model vocabulary; unknown tokens become 0.
trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])
val_clas = np.array([[stoi[o] for o in p] for p in tok_val])

In [None]:
# Save the numericalised classification documents.
np.save(CLAS_PATH/'tmp'/'trn_ids.npy', trn_clas)
np.save(CLAS_PATH/'tmp'/'val_ids.npy', val_clas)

In [None]:
# CLASSIFIER MODEL

In [None]:
# Load data indices
# allow_pickle=True is needed on NumPy >= 1.16.3 when the id arrays were stored
# as object arrays (documents of different lengths). Only load caches written
# by this notebook.
trn_clas = np.load(CLAS_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_clas = np.load(CLAS_PATH/'tmp'/'val_ids.npy', allow_pickle=True)

In [None]:
# Load labels
trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'val_labels.npy'))

In [None]:
# Hyperparameters
bptt,em_sz,nh,nl = 70,400,1150,3
vs = len(itos)
# NOTE: opt_fn is reassigned with betas=(0.7, 0.99) before the classifier
# learner is created, so the value set here is not the one that gets used.
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
bs = 48 # It was 48

In [None]:
# Make sure labels start from 0
# Both splits are shifted in place by the training minimum; c is the number of classes.
min_lbl = trn_labels.min()
trn_labels -= min_lbl
val_labels -= min_lbl
c=int(trn_labels.max())+1

In [None]:
# Pad instances to make each instance equal length in the batch
trn_ds = TextDataset(trn_clas, trn_labels)
val_ds = TextDataset(val_clas, val_labels)
# Both samplers key on document length (number of token ids).
trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))
# pad_idx=1 matches '_pad_' at index 1 of itos; the training loader uses half the batch size (bs//2).
trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
md = ModelData(PATH, trn_dl, val_dl)

In [None]:
# Five classifier dropout rates, each scaled by 0.4.
dps = np.array([0.4,0.5,0.05,0.3,0.4])*0.4

In [None]:
# NOTE(review): 20*70 is passed as the second positional argument — presumably
# a maximum sequence length; confirm against get_rnn_classifier.
# The classifier head sizes are [em_sz*3, 50, c]; pad_token=1 matches '_pad_' in itos.
m = get_rnn_classifier(bptt, 20*70, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 50, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])

In [None]:
# Replaces the opt_fn from the hyperparameter cell (betas 0.8 -> 0.7); this is the one the learner receives.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

In [None]:
# Build the learner, then set its regularisation function, gradient clip value and metric.
learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learn.clip=.25
learn.metrics = [accuracy]

In [None]:
# Different learning rate values for different layers
# Five rates, each a factor lrm smaller than the next, ending at lr.
lr=3e-3
lrm = 2.6
lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])

In [None]:
lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2]) # NOTE: replaces the geometric schedule computed in the previous cell, which is therefore never used. TODO: keep only one of the two.

In [None]:
wd = 0 # Weight Decay
# Initialise the classifier's encoder from the fine-tuned language model saved as 'lm1_enc'.
learn.load_encoder('lm1_enc')

In [None]:
learn.freeze_to(-1) # Unfreeze last layer

In [None]:
# Learning-rate sweep starting at lrs/1000, followed by the schedule plot.
learn.lr_find(lrs/1000)
learn.sched.plot()

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(40,10)) # use_clr was 8,3

In [None]:
# Checkpoint 'clas_0' after the first stage; the next cell reloads it so training can resume from here.
learn.save('clas_0')

In [None]:
learn.load('clas_0')

In [None]:
learn.freeze_to(-2) # Unfreeze one more layer

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(40,10)) # use_clr was 8,3

In [None]:
# Checkpoint 'clas_1' after the second stage; the next cell reloads it so training can resume from here.
learn.save('clas_1')

In [None]:
learn.load('clas_1')

In [None]:
# Final stage: train with every layer group unfrozen.
learn.unfreeze()

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(40,10)) # TODO: cycle_len was 14 originally; it was lowered to 1 and should be raised again.

In [None]:
learn.sched.plot_loss()

In [None]:
# Final classifier checkpoint.
learn.save('clas_2')