In [86]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from fastai.text import * 
from fastai.callbacks import *
from pytorch_pretrained_bert.modeling import BertConfig, BertForSequenceClassification
from pytorch_pretrained_bert import BertTokenizer
from shutil import copyfile
from sklearn.model_selection import train_test_split


# nltk for preprocessing of text data
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# sklearn for preprocessing and machine learning models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier,VotingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier 

import warnings
warnings.filterwarnings("ignore")

In [82]:
# Read the raw train/test CSVs from ../input/.
filepath = Path('../input/')
daf = pd.read_csv(filepath/'train_2.csv')
# Replace every missing training cell with a placeholder string so the string
# concatenation below cannot produce NaN for training rows.
daf = daf.fillna('be well')
testse = pd.read_csv(filepath/'test_2.csv')

# Append the drug name to each text.
daf['text'] = daf['text']+ ' ' +daf['drug']
testse['text'] = testse['text']+ ' ' +testse['drug']
testse2 = testse.copy()

# Take explicit copies: later cells add and drop columns on df / tests, and
# doing that on a bare column selection is a chained-assignment hazard
# (SettingWithCopyWarning) that hides whether the parent frame is affected.
df = daf[['drug','text','sentiment']].copy()
tests = testse[['drug','text']].copy()

In [4]:
# Whitespace-separated token count for every training row.
df['word_count'] = df['text'].str.split().map(len)
# Rows with at least 200 tokens are set aside for trimming. Copy explicitly,
# because a later cell assigns into q['text'] and must not act on a slice of df.
q = df[df['word_count'] >= 200].copy()

In [5]:
# Whitespace-separated token count for every test row.
tests['word_count'] = tests['text'].str.split().map(len)
# Rows with at least 200 tokens are set aside for trimming. Copy explicitly,
# because a later cell assigns into p['text'] and must not act on a slice of tests.
p = tests[tests['word_count'] >= 200].copy()

In [6]:
# The helper columns are no longer needed on the full frames; q and p were
# taken before this point and still carry them.
tests = tests.drop(columns=['word_count', 'drug'])
df = df.drop(columns=['word_count', 'drug'])

In [7]:
def useful(n):
    """Trim each row's text so it starts at the first sentence naming the drug.

    The text is split on "."; everything before the first piece that contains
    the row's drug string is discarded and the remaining pieces are re-joined
    with ".".

    Parameters
    ----------
    n : DataFrame with string columns ``text`` and ``drug``.

    Returns
    -------
    list of str
        One trimmed text per row of ``n``, in row order. When no piece
        contains the drug (for example because the drug name itself contains
        a "." and is broken up by the split) the text is returned unchanged
        instead of raising IndexError.
    """
    trimmed = []
    # Iterate positionally so a non-unique index cannot turn a cell lookup
    # into a Series.
    for text, drug in zip(n['text'], n['drug']):
        sentences = text.split(".")
        # Index of the first sentence mentioning the drug; 0 keeps everything.
        start = next(
            (pos for pos, sentence in enumerate(sentences) if drug in sentence),
            0,
        )
        trimmed.append('.'.join(sentences[start:]))
    return trimmed

In [8]:
# Swap the text of the long rows for the version returned by useful().
q = q.assign(text=useful(q))
p = p.assign(text=useful(p))

In [9]:
# Remove the drug / sentiment / word_count columns from the long-row frames
# before they are merged back.
q = q.drop(columns=['drug', 'sentiment', 'word_count'])
p = p.drop(columns=['drug', 'word_count'])

In [10]:
# Index-aligned outer merge: the left frame's text becomes text_x and the
# trimmed text from q / p becomes text_y (NaN where a row was not trimmed).
df = df.merge(q, how='outer', left_index=True, right_index=True)
tests = tests.merge(p, how='outer', left_index=True, right_index=True)

In [11]:
# Collapse text_x / text_y back into a single text column.
# NOTE(review): for rows that came from q this appends the trimmed text to the
# untrimmed text (no separator) rather than replacing it - confirm that is the
# intended result.
df = df.fillna('')
df['text'] = df.pop('text_x') + df.pop('text_y')
df2 = df.copy()

# Same treatment for the test frame.
tests = tests.fillna('')
tests['text'] = tests.pop('text_x') + tests.pop('text_y')

In [None]:
# Run a full garbage-collection pass now that the intermediate frames above
# have been replaced.
import gc
gc.collect()

In [12]:
# Absolute path of the current working directory; train.csv / valid.csv are
# read back from here later.
path = Path(os.getcwd())

In [13]:
# One float indicator column per class: sentiment 2 -> neutral,
# 0 -> positive, 1 -> negative.
df['neutral'] = df['sentiment'].eq(2).astype(float)
df['positive'] = df['sentiment'].eq(0).astype(float)
df['negative'] = df['sentiment'].eq(1).astype(float)

In [14]:
# Whitespace-separated token count of the final text column.
df['word_count'] = df['text'].str.split().map(len)

# Rows to discard: neutral reviews (sentiment == 2) with at least 400 tokens.
long_neutral = (df['word_count'] >= 400) & (df['sentiment'] == 2)
g = df[long_neutral]

# Filter with the mask rather than concat + drop_duplicates(keep=False): that
# idiom also removes every copy of any row that is legitimately duplicated in
# df, not only the rows in g.
df = df[~long_neutral].drop(columns=['sentiment', 'word_count'])
len(df)

4142

In [15]:
# Positional split: the first 99.9% of rows train, the remainder validate.
split_at = int(len(df) * .999)
train, valid = df.iloc[:split_at], df.iloc[split_at:]

# Persist both splits in the working directory; a later cell reads them back.
train.to_csv('train.csv', index_label=False)
valid.to_csv('valid.csv', index_label=False)

In [16]:
class Config(dict):
    """Dictionary whose entries are also readable as instance attributes.

    Every keyword given at construction, and every pair stored through
    ``set``, is written both as a dict item and as an attribute of the same
    name.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each entry onto the instance so `cfg.name` works.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as both a dict item and an attribute."""
        self.update({key: val})
        setattr(self, key, val)

# Run-wide settings; other cells read them as attributes (e.g. config.bs).
config = Config(
    testing=False,  # when True, a later cell cuts train/val/test to their first 64 rows
    bert_model_name="bert-large-uncased",  # passed to both from_pretrained(...) calls
    max_lr=3e-5,
    epochs=4,
    use_fp16=True,  # when True, the learner is converted with to_fp16()
    bs=16,  # batch size handed to the databunch
    discriminative=False,
    max_seq_len=256,  # per-example token cap, counting [CLS] and [SEP]
)

In [17]:
# Load the pretrained tokenizer that belongs to config.bert_model_name.
bert_tok = BertTokenizer.from_pretrained(config.bert_model_name)

100%|██████████| 231508/231508 [00:00<00:00, 921841.16B/s]


In [18]:
class FastAiBertTokenizer(BaseTokenizer):
    """Adapter that exposes a BertTokenizer through fast.ai's tokenizer interface."""

    def __init__(self, tokenizer: BertTokenizer, max_seq_len: int=128, **kwargs):
        self.max_seq_len = max_seq_len
        self._pretrained_tokenizer = tokenizer

    def __call__(self, *args, **kwargs):
        # Calling the instance returns the instance itself, whatever the
        # arguments - presumably so it can stand in where fast.ai expects a
        # tokenizer factory (confirm against fastai's Tokenizer).
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Tokenize ``t`` and wrap it in [CLS] ... [SEP], capped at max_seq_len tokens."""
        # Two slots are reserved for the [CLS] and [SEP] markers.
        budget = self.max_seq_len - 2
        word_pieces = self._pretrained_tokenizer.tokenize(t)[:budget]
        return ["[CLS]"] + word_pieces + ["[SEP]"]

In [19]:
# fast.ai Tokenizer driven by the BERT wrapper; pre_rules and post_rules are
# passed as empty lists.
fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), pre_rules=[], post_rules=[])

In [20]:
def _join_texts(texts:Collection[str], mark_fields:bool=False, sos_token:Optional[str]=BOS):
    """Concatenate the text fields of each row into one string (adapted from fast.ai).

    ``texts`` is coerced to an ndarray and, when 1-D, treated as a single
    column. Fields are joined with a space; with ``mark_fields`` each field is
    preceded by ``FLD`` and its 1-based position. ``sos_token``, unless None,
    is prefixed to every row. Returns a NumPy array of the joined strings.
    """
    if not isinstance(texts, np.ndarray):
        texts = np.array(texts)
    if is1d(texts):
        texts = texts[:,None]

    n_fields = texts.shape[1]
    fields = pd.DataFrame({j: texts[:,j] for j in range(n_fields)})

    joined = fields[0].astype(str)
    if mark_fields:
        joined = f'{FLD} {1} ' + joined
    if sos_token is not None:
        joined = f"{sos_token} " + joined

    for j in range(1, n_fields):
        separator = f' {FLD} {j+1} ' if mark_fields else ' '
        # astype(str) keeps non-string fields concatenable.
        joined = joined + (separator + fields[j].astype(str))
    return joined.values

In [21]:
# Read back the splits written earlier; the test frame is used as-is.
train = pd.read_csv(path / "train.csv")
val = pd.read_csv(path / "valid.csv")
test = tests

In [22]:
# Smoke-test mode: keep only the first 64 rows of each split.
if config.testing:
    train, val, test = train.head(64), val.head(64), test.head(64)

In [23]:
# fast.ai Vocab built from the BERT tokenizer's vocabulary keys, in their stored order.
fastai_bert_vocab = Vocab(list(bert_tok.vocab))


In [24]:
# NOTE(review): this rebuilds the tokenizer with exactly the same arguments as
# the earlier `fastai_tokenizer = Tokenizer(...)` cell.
fastai_tokenizer = Tokenizer(tok_func=FastAiBertTokenizer(bert_tok, max_seq_len=config.max_seq_len), 
                             pre_rules=[], post_rules=[])

In [25]:
# Indicator columns used as targets. The later mapping of prediction columns
# 0/1/2 to sentiment codes 1/2/0 presumably relies on this order - keep them in sync.
label_cols = ["negative", "neutral", "positive"]


In [26]:
# Assemble the fast.ai data bunch from the three frames, using the BERT
# tokenizer and vocabulary and disabling the BOS/EOS options.
databunch = TextDataBunch.from_df(
    ".", train, val, test,
    text_cols="text",
    label_cols=label_cols,
    tokenizer=fastai_tokenizer,
    vocab=fastai_bert_vocab,
    include_bos=False,
    include_eos=False,
    bs=config.bs,
    # Pad at the end of each sequence, using index 0 as the padding id.
    collate_fn=partial(pad_collate, pad_first=False, pad_idx=0),
)

In [27]:
class BertTokenizeProcessor(TokenizeProcessor):
    """TokenizeProcessor preset with include_bos and include_eos switched off."""

    def __init__(self, tokenizer):
        marker_flags = dict(include_bos=False, include_eos=False)
        super().__init__(tokenizer=tokenizer, **marker_flags)

class BertNumericalizeProcessor(NumericalizeProcessor):
    """NumericalizeProcessor that always uses the vocabulary of the global ``bert_tok``."""

    def __init__(self, *args, **kwargs):
        # The vocabulary is rebuilt from bert_tok on every construction.
        bert_vocab = Vocab(list(bert_tok.vocab.keys()))
        super().__init__(*args, vocab=bert_vocab, **kwargs)

def get_bert_processor(tokenizer:Tokenizer=None, vocab:Vocab=None):
    """
    Build the two-step preprocessing pipeline for BERT.

    Step one tokenizes with ``BertTokenizeProcessor`` (BOS/EOS options off,
    because the tokenizer wrapper adds its own markers); step two numericalizes
    with the supplied ``vocab``, so ids line up with the pretrained BERT model.
    Returns the two processors as a list, in that order.
    """
    tokenize_step = BertTokenizeProcessor(tokenizer=tokenizer)
    numericalize_step = NumericalizeProcessor(vocab=vocab)
    return [tokenize_step, numericalize_step]

class BertDataBunch(TextDataBunch):
    """TextDataBunch variant whose ``from_df`` uses the BERT processors."""

    @classmethod
    def from_df(cls, path:PathOrStr, train_df:DataFrame, valid_df:DataFrame, test_df:Optional[DataFrame]=None,
                tokenizer:Tokenizer=None, vocab:Vocab=None, classes:Collection[str]=None, text_cols:IntsOrStrs=1,
                label_cols:IntsOrStrs=0, label_delim:str=None, **kwargs) -> DataBunch:
        "Build a data bunch from train/valid (and optional test) DataFrames."
        # Route any kwargs that get_bert_processor accepts to it; the rest go
        # to databunch().
        p_kwargs, kwargs = split_kwargs_by_func(kwargs, get_bert_processor)
        processor = get_bert_processor(tokenizer=tokenizer, vocab=vocab, **p_kwargs)

        # Several label columns and no explicit classes: the column names
        # serve as the classes.
        if classes is None and is_listy(label_cols) and len(label_cols) > 1:
            classes = label_cols

        train_list = TextList.from_df(train_df, path, cols=text_cols, processor=processor)
        valid_list = TextList.from_df(valid_df, path, cols=text_cols, processor=processor)
        src = ItemLists(path, train_list, valid_list)

        if cls==TextLMDataBunch:
            src = src.label_for_lm()
        else:
            src = src.label_from_df(cols=label_cols, classes=classes)

        if test_df is not None:
            src.add_test(TextList.from_df(test_df, path, cols=text_cols))
        return src.databunch(**kwargs)

In [28]:
# Pretrained BERT sequence classifier with three output labels, matching the
# three entries of label_cols.
bert_model = BertForSequenceClassification.from_pretrained(config.bert_model_name, num_labels=3)

100%|██████████| 1248501532/1248501532 [00:48<00:00, 25616258.17B/s]


In [29]:
# Binary cross-entropy on raw logits, applied to the three indicator targets.
loss_func = nn.BCEWithLogitsLoss()
# CSVLogger is attached to the learner as a callback in the next cell.
from fastai.callbacks import CSVLogger

In [30]:
# Learner over the BERT model; CSVLogger is attached with append=True.
learner = Learner(
    databunch,
    bert_model,
    loss_func=loss_func,
    callback_fns=[partial(CSVLogger, append=True)],
)
# Optional conversion to fp16, controlled by the config.
if config.use_fp16:
    learner = learner.to_fp16()

In [31]:
# Train with the one-cycle schedule. Epoch count and peak learning rate come
# from config (epochs=4, max_lr=3e-5) instead of repeating the literals here,
# so editing the config actually changes the run.
learner.fit_one_cycle(config.epochs, config.max_lr)

epoch,train_loss,valid_loss,time
0,0.507025,0.609925,03:53
1,0.491021,0.517374,04:24
2,0.42864,0.658327,04:06
3,0.323034,0.436515,04:30


In [32]:
def get_preds_as_nparray(ds_type) -> np.ndarray:
    """
    Return the learner's predictions for ``ds_type`` as a NumPy array in dataset order.

    get_preds does not yield the items in their original order by default, so
    the data loader's sampler order is inverted (the approach used by fast.ai's
    RNNLearner) to put each prediction row back at its source position.
    """
    outputs = learner.get_preds(ds_type)
    raw_preds = outputs[0].detach().cpu().numpy()

    # The order in which the sampler visits items; argsort of it is the inverse permutation.
    visit_order = list(databunch.dl(ds_type).sampler)
    restore_order = np.argsort(visit_order)

    return raw_preds[restore_order, :]

In [33]:
# Test-set predictions, re-sorted by get_preds_as_nparray into dataset order.
preds = get_preds_as_nparray(DatasetType.Test)

In [35]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [36]:
# Index of the highest-scoring column in each row ...
idx = preds.argmax(axis=-1)
# ... and the matching one-hot matrix, built by picking rows of an identity matrix.
y_preds = np.eye(preds.shape[-1])[idx]

In [37]:
# Label prediction columns 0/1/2 with the sentiment codes 1/2/0, i.e. the
# codes behind the negative / neutral / positive indicator columns.
dataset = pd.DataFrame(preds[:, :3], columns=[1, 2, 0])
# For each row, the code whose column holds the largest score.
s = dataset.idxmax(axis=1)
dataseti = pd.DataFrame({'sentiment': s})

In [38]:
# Attach the predicted sentiment to the test rows by index, then drop the drug
# and text columns.
final = (
    testse
    .merge(dataseti, how='inner', left_index=True, right_index=True)
    .drop(columns=['drug', 'text'])
)
# Distribution of predicted classes.
final['sentiment'].value_counts()

2    1868
0     552
1     504
Name: sentiment, dtype: int64