# A simple pipeline for training QA models with simpletransformers, a wrapper around the HuggingFace transformers library

In [1]:
import torch
import typing
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
import simpletransformers
from simpletransformers.question_answering import QuestionAnsweringModel

In [4]:
# Load the training set from the working directory; the preview below shows the
# columns textID, text, selected_text and sentiment.
data = pd.read_csv("train.csv")

In [5]:
# Preview the first rows to check the column layout before building QA examples.
data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [40]:
# IPython help lookup: shows the documentation of the base class that QAModel extends.
?simpletransformers.question_answering.QuestionAnsweringModel

In [80]:
class QAModel(simpletransformers.question_answering.QuestionAnsweringModel):
    """
    A simple wrapper around the QuestionAnsweringModel class from simpletransformers
    that builds SQuAD-style examples directly from pandas data.

    GPU usage is ON by default (use_cuda=True); pass use_cuda=False in class
    initialization to run on CPU.
    """

    def __init__(self, model_name="distilbert", model_path="distilbert-base-uncased-distilled-squad", use_cuda=True, **args):
        """
        ::param model_name - type of model to use
        ::param model_path - path to pretrained model directory or standard
                        model specification from here https://huggingface.co/transformers/pretrained_models.html
                        or here https://huggingface.co/models
        ::param use_cuda - whether to use GPUs or not
        ::param args - options passed to the simpletransformers model, given as keyword
                        arguments (e.g. num_train_epochs=3). A single ``args={...}`` dict keyword
                        is accepted as well and is unpacked. Take a look here
                        https://simpletransformers.ai/docs/usage/
        """
        # ``QAModel(args={...})`` lands in **args as {'args': {...}}. Forwarding that as-is
        # would hand simpletransformers a dict whose only key is 'args', so the real
        # options would be nested one level too deep. Unwrap it; explicit keywords win.
        nested = args.pop("args", None)
        if isinstance(nested, dict):
            args = {**nested, **args}
        elif nested is not None:
            # Not a dict: keep the original pass-through behaviour.
            args["args"] = nested

        super().__init__(model_name, model_path, use_cuda=use_cuda, args=args)
        # Row positions of examples rejected during data preparation in the last .fit() call.
        self.skipped = []

    def fit(self, data: typing.Union[pd.core.frame.DataFrame, pd.core.series.Series], fields: typing.Union[pd.core.indexes.base.Index, np.ndarray, list], context_for_all: str = "", question_for_all: str = "") -> None:
        """
        Build training examples from ``data`` and train the model on them.

        ::param data - table holding the questions, answers and contexts
        ::param fields - names of the columns to read, in this order:
                            [question, answer, context] by default;
                            [question, answer] when context_for_all is given;
                            [answer, context] when question_for_all is given.
        ::param context_for_all - should be specified if all questions are asked to the same context
        ::param question_for_all - should be specified if one question is used for all training examples

        Rows are skipped (their 0-based row position is stored in ``self.skipped``) when the
        question, answer or context is not a string, when the answer is empty, or when the
        answer does not occur in the context (compared case-insensitively).

        Raises NotImplementedError if both context_for_all and question_for_all are given,
        and ValueError if no usable training example remains.
        """
        if context_for_all != "" and question_for_all != "":
            raise NotImplementedError(
                "context_for_all and question_for_all cannot be used together"
            )
        self.skipped = []  # start from a clean slate on every call

        # Pull the columns out once, by position in the table, so a non-default index
        # (filtered / shuffled frames) works; label lookups with a 0..n-1 counter do not.
        n_rows = len(data)
        if context_for_all != "":
            questions = data[fields[0]].tolist()
            answer_texts = data[fields[1]].tolist()
            contexts = [context_for_all] * n_rows
        elif question_for_all != "":
            questions = [question_for_all] * n_rows
            answer_texts = data[fields[0]].tolist()
            contexts = data[fields[1]].tolist()
        else:
            questions = data[fields[0]].tolist()
            answer_texts = data[fields[1]].tolist()
            contexts = data[fields[2]].tolist()

        training_data = []
        for position, (question, answer, context) in enumerate(zip(questions, answer_texts, contexts)):
            if not (isinstance(question, str) and isinstance(answer, str) and isinstance(context, str)):
                # Typically NaN cells, which pandas represents as floats.
                self.skipped.append(position)
                continue

            # Lower-case first and search afterwards, so the stored offset always refers
            # to the exact text that is stored.
            context_lc = context.lower()
            answer_lc = answer.lower()
            answer_start = context_lc.find(answer_lc) if answer_lc else -1
            if answer_start == -1:
                # An answerable example without an answer span cannot be trained on.
                self.skipped.append(position)
                continue

            qas = [{
                'question': question,
                # The row position is unique and reproducible, and does not touch the global RNG.
                "id": position,
                'is_impossible': False,
                # Only the first occurrence of the answer is used as the gold span.
                'answers': [{'answer_start': answer_start, 'text': answer_lc}],
            }]
            training_data.append({'context': context_lc, 'qas': qas})

        if not training_data:
            raise ValueError(
                "no usable training examples: every row was skipped (see self.skipped)"
            )

        self.train_model(training_data)

    def make_prediction(self, test: pd.core.frame.DataFrame) -> list:
        """
        Predict an answer for every usable row of ``test``.

        ::param test - table whose first column holds the questions and whose second
                        column holds the contexts (columns are read by position, not by name)

        Returns a list with one predicted answer per usable row, in input order. Rows whose
        question or context is not a string are printed and left out of the result, so the
        result can be shorter than ``test``.
        """
        to_predict = []

        for position in range(len(test)):
            question = test.iloc[position, 0]
            context = test.iloc[position, 1]
            if not (isinstance(context, str) and isinstance(question, str)):
                print(context, type(context))
                print(question, type(question))
                continue
            qas = [{
                'question': question,
                "id": position,  # unique per row and reproducible
                'is_impossible': False,
                # Placeholder answer so the example has the same shape as a training
                # example; the real answer is what is being predicted.
                'answers': [{'answer_start': 1000000, 'text': '__None__'}],
            }]
            to_predict.append({'context': context.lower(), 'qas': qas})

        if not to_predict:
            return []

        predictions = self.predict(to_predict)
        return [prediction['answer'] for prediction in predictions]

    def _find_all(self, input_str, search_str):
        """
        Return the start indices of every (possibly overlapping) occurrence of
        ``search_str`` in ``input_str``, in ascending order.

        An empty ``search_str`` matches at every index of ``input_str``.
        """
        starts = []
        length = len(input_str)
        index = 0
        while index < length:
            found = input_str.find(search_str, index)
            if found == -1:
                break
            starts.append(found)
            index = found + 1  # step by one so overlapping matches are kept
        return starts


        
       
            

In [81]:
# QAModel gathers extra keyword arguments (**args) into the options dict it hands to
# simpletransformers, so the options are passed as plain keywords here rather than
# wrapped in a single ``args={...}`` keyword.
model = QAModel(
    use_cuda=False,
    reprocess_input_data=True,
    overwrite_output_dir=True,
    learning_rate=5e-5,
    num_train_epochs=3,
    max_seq_length=192,
    doc_stride=64,
    fp16=False,
)

In [70]:
# Column roles, in the order fit() expects them: the sentiment label is the question,
# selected_text is the answer span, and the full tweet text is the context.
model.fit(data, ['sentiment', 'selected_text', 'text'])

KeyboardInterrupt: 

In [135]:
# Shell command: permanently deletes the outputs/ directory and everything in it
# (presumably the artifacts written by earlier training runs; verify before running).
!rm -rf outputs/

In [82]:
# make_prediction reads the question from the first column and the context from the
# second, hence the ['sentiment', 'text'] order. `.loc[:100]` is end-inclusive, so this
# selects 101 rows (labels 0..100).
model.make_prediction(data.loc[:100, [ 'sentiment','text']])




  0%|          | 0/101 [00:00<?, ?it/s][A[A[A


  1%|          | 1/101 [00:00<00:14,  7.07it/s][A[A[A


100%|██████████| 101/101 [00:00<00:00, 696.72it/s][A[A[A

HBox(children=(IntProgress(value=0, max=13), HTML(value='')))

['if i were going',
 '',
 'my boss is bullying me',
 'leave me alone',
 'sons of ****',
 'smf - some shameless plugging for the best rangers forum on earth',
 '2am feedings for the baby are fun when he is all smiles and coos',
 'soooo high',
 'both of you',
 'cooler',
 'i`m never gonna get my cake',
 '',
 'my sharpie is running dangerously low on ink',
 'i lost my voice.',
 'lg env2',
 'i am sunburned',
 'sigh',
 'sick',
 'gonna miss every one',
 'hes just not that into you',
 'oh marly',
 'can`t wait to have a dragon pet',
 'her family',
 'i thought win7',
 'smh',
 'free fillin` app on my ipod is fun, im addicted',
 'i`m sorry',
 'no internet access to twit',
 'juss came backk from berkeleyy',
 'went to sleep',
 'heavenly',
 'i hope unni will make the audition',
 'it says i am obesed well so much for being unhappy for about 10 minutes.',
 'cute kids',
 'ah',
 'tears for fears',
 'texas',
 'we are really busy today and this coming with with adding tons of new blogs and updates stay tun