## Notebook to perform intent classification on the data from https://github.com/clinc/oos-eval
### Uses the `data_full.json` file

### Imports

In [1]:
import json
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# Fetch the NLTK stop-word corpus that stopwords.words( 'english' ) reads later on.
# NOTE(review): word_tokenize and WordNetLemmatizer are used below as well and
# no resource is downloaded for them here; presumably their NLTK data is
# already installed locally -- confirm before running in a fresh environment.
nltk.download( 'stopwords' )

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nikunjkotecha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load the data

In [3]:
class TextData:
    '''
    Reads one split of a JSON dataset into a dataframe and selects the rows
    that belong to a chosen subset of target classes.
    '''

    def __init__(self, file):
        '''
        file: path of the JSON file; it is re-opened on every read
        '''
        self.file = file

    def read_json(self, set='train', cols=None, target='intent'):
        '''
        read the json file and return one of its splits as a dataframe

        set: top-level key of the split to load (the name shadows the
            builtin but is kept because callers pass it by keyword)
        cols: labels of the feature columns, defaults to ['sent']
        target: label of the target column, placed after cols
        '''
        # None replaces the former mutable list default; any sequence works
        cols = ['sent'] if cols is None else list(cols)
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(self.file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return pd.DataFrame(data[set], columns=cols + [target])

    def random_labels(self, target='intent', n=20, seed=0):
        '''
        choose 'n' random classes among those present in the 'train' split

        returns a dict that maps every chosen class to its position
        (0 .. n-1) in the random draw

        numpy raises ValueError when n exceeds the number of classes,
        because the draw is made without replacement
        '''
        df = self.read_json(target=target)
        # NOTE: this reseeds numpy's global generator on purpose; code that
        # runs afterwards may rely on that state for reproducibility
        np.random.seed(seed)
        chosen = np.random.choice(df[target].unique(), size=n, replace=False)
        return {val: idx for idx, val in enumerate(chosen)}

    def get_set(self, set, labels, target='intent'):
        '''
        return the rows of the given split whose target is one of 'labels'

        set: top-level key of the split to load
        labels: classes to keep; for a dict (as returned by random_labels)
            its keys are used
        target: label of the target column
        '''
        # forward target so the column used for filtering really exists
        # when a non-default name is requested
        df = self.read_json(set=set, target=target)
        keep = df[target].isin(list(labels))
        return df[keep].reset_index(drop=True)
    
# dataset file, given as a relative path
file = 'data_full.json'
data = TextData(file)

# draw the random subset of intent classes (default size and seed)
labels = data.random_labels()

# build the training and validation frames restricted to those classes
train, val = (data.get_set(split, labels) for split in ('train', 'val'))

# preview the first training rows
train.head()

Unnamed: 0,sent,intent
0,does ireland have any travel alerts i should b...,travel_alert
1,does north korea have any travel alerts i shou...,travel_alert
2,are there any travel alerts for russia,travel_alert
3,does spain have any travel alerts i should be ...,travel_alert
4,are there any travel alerts for north korea,travel_alert


### Data Cleaning

In [5]:
class TextCleaning:
    '''
    Cleans sentences: drops punctuation tokens and English stop words,
    lemmatizes what is left and keeps only the first occurrence of each
    resulting word.
    '''

    def __init__(self):
        # punctuations, stop word, lemmatizer
        self.punctuations = string.punctuation
        self.stop_words = stopwords.words('english')
        self.lemmatizer = WordNetLemmatizer()

    def process(self, sentence):
        '''
        remove punctuation and stop words from a given sentence,
        then lemmatize each remaining word

        returns the distinct lemmas, in order of first appearance,
        joined by single spaces
        '''
        # built per call so later changes to self.stop_words are honoured,
        # while each token lookup is a hash lookup instead of a list scan
        stop_words = set(self.stop_words)
        kept = []
        seen = set()
        for token in word_tokenize(sentence):
            # self.punctuations is a plain string, so this is a substring
            # test: a token is dropped when it occurs contiguously in it
            if token in self.punctuations or token in stop_words:
                continue
            lemma = self.lemmatizer.lemmatize(token)
            if lemma not in seen:
                seen.add(lemma)
                kept.append(lemma)
        return ' '.join(kept)

    def preprocess(self, df, col='sent'):
        '''
        accept a dataframe and preprocess its sentences

        df: dataframe holding the raw sentences; it is modified in place
        col: name of the column with the raw sentences

        returns the same dataframe with the cleaned text in a 'clean' column
        '''
        # column-wise apply also works on an empty frame, where a row-wise
        # DataFrame.apply yields a frame instead of a Series and the
        # assignment to a single column breaks
        df['clean'] = df[col].apply(self.process)
        return df

# one cleaner instance is shared by both splits
data_cleaning = TextCleaning()

# add the 'clean' column to the training frame, then to the validation frame
train, val = map(data_cleaning.preprocess, (train, val))

# preview the cleaned training rows
train.head()

Unnamed: 0,sent,intent,clean
0,does ireland have any travel alerts i should b...,travel_alert,ireland travel alert aware
1,does north korea have any travel alerts i shou...,travel_alert,north korea travel alert aware
2,are there any travel alerts for russia,travel_alert,travel alert russia
3,does spain have any travel alerts i should be ...,travel_alert,spain travel alert aware
4,are there any travel alerts for north korea,travel_alert,travel alert north korea
