In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import word2vec
import gensim
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import sklearn.metrics as m
from sklearn.externals import joblib
import logging

  from numpy.core.umath_tests import inner1d


In [2]:
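# One-time setup: uncomment and run these if the NLTK resources used below
# (stop-word list, tokenizer models, WordNet) are not installed yet.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')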

In [3]:
def create_dataframe(filepath):
    """Parse an intent-annotated text file into a DataFrame.

    Every line is searched for a sentence enclosed in ``BOS ... EOS`` and for
    all matches of ``I-\\S+``, which are treated as that sentence's intent
    labels. Lines without a ``BOS ... EOS`` pair (for example blank lines)
    are skipped so the sentence and label columns always stay aligned.

    Args:
        filepath: Path of the text file to parse.

    Returns:
        A ``(dataframe, unique_intents)`` tuple. ``dataframe`` has a
        ``sentences`` column, an ``intent_list`` column (the labels found on
        the line, repeats included) and one 0/1 indicator column per distinct
        label. ``unique_intents`` lists the distinct labels in first-seen
        order, without repeats.
    """
    sentences, sentence_intents = [], []
    # A dict is used as an ordered set: keys keep first-seen order.
    seen_intents = {}
    with open(filepath, 'r') as f:
        for line in f:
            match = re.search(r"(?<=BOS)(.*)(?=EOS)", line)
            if match is None:
                # Nothing to learn from a line with no sentence; keeping its
                # labels would make the columns differ in length.
                continue
            intents = re.findall(r"(I-\S+)", line)
            sentences.append(match.group(1))
            sentence_intents.append(intents)
            seen_intents.update(dict.fromkeys(intents))
    unique_intents = list(seen_intents)

    data_dict = {'sentences': sentences, 'intent_list': sentence_intents}
    for intent in unique_intents:
        # 1 when the sentence carries this label at least once, else 0.
        data_dict[intent] = [int(intent in intent_list)
                             for intent_list in sentence_intents]
    return pd.DataFrame(data_dict), unique_intents

In [16]:
# Parse the training file into a DataFrame plus the intent labels found in it.
train_data, unique_intents = create_dataframe("./data/atis-2.train.w-intent.iob.txt")

In [17]:
# Preview the first rows of the parsed training data.
train_data.head()

Unnamed: 0,sentences,intent_list,I-round_trip,I-fare_amount,I-fromloc.city_name,I-arrive_time.time,I-toloc.city_name,I-stoploc.city_name,I-airline_name,I-toloc.airport_name,...,I-depart_date.today_relative,I-fare_basis_code,I-arrive_time.start_time,I-today_relative,I-depart_time.period_of_day,I-arrive_time.time_relative,I-time,I-depart_time.time_relative,I-return_date.today_relative,I-meal_description
0,i want to fly from baltimore to dallas round ...,[I-round_trip],1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,round trip fares from baltimore to philadelph...,"[I-round_trip, I-fare_amount, I-round_trip, I-...",1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,show me the flights arriving on baltimore on ...,[],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,what are the flights which depart from san fr...,"[I-fromloc.city_name, I-arrive_time.time]",0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,which airlines fly from boston to washington ...,[],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
class TextPreProcessor:
    """Tokenise, normalise and lemmatise sentences with NLTK.

    The individual steps each take and return a list of tokens (except
    ``tokenize``, which takes the raw sentence string); ``clean_text`` chains
    them over the ``sentences`` column of a DataFrame.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        # Stop-word set, loaded on first use by remove_stop_words.
        self._stop_words = None

    def tokenize(self, line):
        """Split a sentence string into a list of tokens."""
        return nltk.word_tokenize(line)

    def to_lower(self, line):
        """Return the tokens of ``line`` converted to lower case."""
        return [word.lower() for word in line]

    def remove_punctuation(self, line):
        """Strip punctuation characters from each token.

        Every character that is neither a word character nor whitespace is
        removed; tokens that end up empty are dropped.
        """
        stripped = (re.sub(r'[^\w\s]', '', word) for word in line)
        return [word for word in stripped if word != '']

    def remove_stop_words(self, line):
        """Return the tokens of ``line`` that are not English stop words."""
        stop_words = getattr(self, '_stop_words', None)
        if stop_words is None:
            # Read the corpus once and keep it as a set: the original looked
            # the list up again for every single token.
            stop_words = self._stop_words = frozenset(stopwords.words('english'))
        return [word for word in line if word not in stop_words]

    def lemmatize_words(self, line):
        """Lemmatize each token with the instance's WordNetLemmatizer."""
        return [self.lemmatizer.lemmatize(word) for word in line]

    def clean_text(self, data):
        """Run the full cleaning pipeline over ``data['sentences']``.

        Args:
            data: Object whose ``'sentences'`` entry supports ``.apply``
                (a pandas DataFrame in this notebook).

        Returns:
            A list with one list of cleaned tokens per sentence.
        """
        cleaned = data['sentences'].apply(self.tokenize)
        # Order matters: lower-casing must precede the stop-word filter.
        for step in (self.to_lower,
                     self.remove_punctuation,
                     self.remove_stop_words,
                     self.lemmatize_words):
            cleaned = cleaned.apply(step)
        return list(cleaned)

In [19]:
# Build one preprocessor and run the cleaning pipeline over the training sentences.
preprocess = TextPreProcessor()
clean_sent = preprocess.clean_text(train_data)

In [20]:
# Number of cleaned training sentences.
print(len(clean_sent))

4478


In [21]:
# Load Google's pre-trained word2vec vectors (binary word2vec format) from
# the local model directory.
wv = gensim.models.KeyedVectors.load_word2vec_format("./model/GoogleNews-vectors-negative300.bin.gz", binary=True)

In [22]:
# Prepare the normalised vectors that word_averaging reads via wv.syn0norm.
# NOTE(review): replace=True presumably discards the raw vectors — confirm
# against the installed gensim version.
wv.init_sims(replace=True)

In [23]:
def word_averaging(wv, words):
    """Average the word vectors of ``words`` into one unit-length vector.

    Args:
        wv: Word-vector model exposing ``vocab``, ``syn0norm`` and
            ``vector_size`` (the gensim KeyedVectors loaded in this notebook).
        words: Iterable of tokens. A token that is already a NumPy array is
            used as a vector directly; string tokens missing from
            ``wv.vocab`` are ignored.

    Returns:
        A float32 vector: the mean of the collected vectors, rescaled with
        ``gensim.matutils.unitvec``. When no token contributes a vector, a
        warning is logged and a zero vector of length ``wv.vector_size`` is
        returned instead.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.syn0norm[wv.vocab[word].index])

    if not vectors:
        # logging fills in the %s placeholder; the earlier print() call
        # emitted the literal "%s" followed by the token list.
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the averaged vector of every token list in ``text_list``.

    Each entry of ``text_list`` is passed to ``word_averaging`` and the
    resulting vectors become the rows of the returned 2-D array.
    """
    rows = []
    for tokens in text_list:
        rows.append(word_averaging(wv, tokens))
    return np.vstack(rows)

In [24]:
# Training feature matrix: one averaged word vector per cleaned sentence.
X = word_averaging_list(wv, clean_sent)

  


cannot compute similarity with no input %s ['ewr']
cannot compute similarity with no input %s ['mco']
cannot compute similarity with no input %s []
cannot compute similarity with no input %s ['ewr']
cannot compute similarity with no input %s ['mco']
cannot compute similarity with no input %s ['mco']
cannot compute similarity with no input %s ['ewr']
cannot compute similarity with no input %s ['yyz']
cannot compute similarity with no input %s ['ewr']
cannot compute similarity with no input %s ['mco']
cannot compute similarity with no input %s ['ewr']
cannot compute similarity with no input %s ['bna']


In [13]:
# Sanity check on the feature matrix dimensions (rows = sentences).
print(X.shape)

(4478, 300)


In [14]:
def model_train(data, X, unique_intents, modelname, random_state=None):
    """Train and save one baseline random forest per intent for comparison.

    Args:
        data: Object indexable by intent name that yields that intent's
            target column (the training DataFrame in this notebook).
        X: Feature matrix passed to ``fit``.
        unique_intents: Iterable of intent names. Repeated names are trained
            only once.
        modelname: Suffix used in the saved file name
            ``./model/<intent>_<modelname>.pkl``.
        random_state: Optional seed forwarded to ``RandomForestClassifier``
            so a run can be reproduced; ``None`` keeps the unseeded default.
    """
    # dict.fromkeys drops repeated labels while keeping first-seen order, so
    # the same model is not fitted and overwritten several times.
    for intent in dict.fromkeys(unique_intents):
        model = RandomForestClassifier(random_state=random_state)
        model.fit(X, np.array(data[intent]))
        joblib.dump(model, "./model/{0}_{1}.pkl".format(intent, modelname))

In [9]:
def evaluate_classification(y_true, y_pred):
    """Score predictions against the true labels.

    Returns:
        A ``(evaluation, cm)`` tuple. ``evaluation`` maps ``accuracy``,
        ``precision``, ``recall`` and ``f1_score`` to percentages rounded to
        two decimals; ``cm`` is the confusion matrix.
    """
    def as_percent(score):
        return round(float(score) * 100, 2)

    cm = m.confusion_matrix(y_true, y_pred)
    scorers = (
        ("accuracy", m.accuracy_score),
        ("precision", m.precision_score),
        ("recall", m.recall_score),
        ("f1_score", m.f1_score),
    )
    evaluation = {}
    for label, scorer in scorers:
        evaluation[label] = as_percent(scorer(y_true, y_pred))
    return evaluation, cm

In [10]:
def test_model(modelname, X_test, data,train_intents, test_intents):
    """ Test the model performance on test data"""
    y_pred, confusion_mat, evaluation = {},{},{}
    for intent in train_intents:
        if intent in test_intents:
            model = joblib.load("./model/{0}_{1}.pkl".format(intent, modelname))
            print("./model/{0}_{1}.pkl".format(intent, modelname))
            test_eval, cm = evaluate_classification(np.array(data[intent]), model.predict(X_test))
            confusion_mat[intent] = cm
            evaluation[intent] = test_eval
    return confusion_mat, evaluation
    

In [18]:
# Fit and save the baseline models under the "base_rf" suffix.
model_train(train_data, X, unique_intents, "base_rf")

In [11]:
# Parse the test file with the same parser used for the training file.
test_data, unique_intents_test = create_dataframe("./data/atis.test.w-intent.iob.txt")
# Optional check (disabled): labels seen in training but absent from the test file.
# comparision = set(unique_intents) -set(unique_intents_test)
# print(comparision)

In [12]:
# Clean the test sentences and build their feature matrix, reusing the
# preprocessor and word vectors from the training steps.
clean_test_sent = preprocess.clean_text(test_data)
X_test = word_averaging_list(wv, clean_test_sent)

  


In [23]:
# Score the saved "base_rf" models on the test set and show the confusion matrices.
con_mat, evaluation = test_model("base_rf", X_test, test_data,unique_intents, unique_intents_test)
print(con_mat)

./model/I-round_trip_base_rf.pkl
{'I-round_trip': array([[822,   0],
       [ 36,  35]], dtype=int64)}


In [24]:
# Tabulate the baseline metrics: one column per intent, one row per metric.
print(pd.DataFrame(evaluation))

           I-round_trip
accuracy          95.97
f1_score          66.04
precision        100.00
recall            49.30


In [23]:
def hyper_parameter():
    """Build the random-forest search space used by the randomized search.

    Returns:
        A dict mapping ``RandomForestClassifier`` parameter names to the list
        of candidate values for each.
    """
    # 10, 19, ..., 100 plus None (no depth limit given to the estimator).
    max_depth = list(range(10, 101, 9))
    max_depth.append(None)
    return {
        # 100, 200, ..., 1000
        'n_estimators': list(range(100, 1001, 100)),
        # NOTE(review): 'auto' is only accepted by older scikit-learn
        # releases — confirm against the installed version.
        'max_features': ['auto', 'sqrt'],
        'max_depth': max_depth,
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    }


In [24]:
def model_train_hyper(data, X, unique_intents, modelname, random_state=None):
    """Tune and save one random forest per intent using randomized search.

    Args:
        data: Object indexable by intent name that yields that intent's
            target column (the training DataFrame in this notebook).
        X: Feature matrix passed to ``fit``.
        unique_intents: Iterable of intent names. Repeated names are tuned
            only once.
        modelname: Suffix used in the saved file name
            ``./model/<intent>_<modelname>.pkl``.
        random_state: Optional seed forwarded to ``RandomForestClassifier``;
            ``None`` keeps the unseeded default. The search itself keeps its
            fixed seed of 42.
    """
    # The search space does not depend on the intent, so build it once.
    param_distributions = hyper_parameter()
    # dict.fromkeys drops repeated labels while keeping first-seen order;
    # each repeat would otherwise rerun the whole search for the same model.
    for intent in dict.fromkeys(unique_intents):
        rf = RandomForestClassifier(random_state=random_state)
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=param_distributions,
                                       cv=5, verbose=2, random_state=42,
                                       n_jobs=-1)
        rf_random.fit(X, np.array(data[intent]))
        best_model = rf_random.best_estimator_
        joblib.dump(best_model, "./model/{0}_{1}.pkl".format(intent, modelname))


In [None]:
# Run the randomized search for every intent and save the best models under
# the "rf_random" suffix. The log below shows each search taking minutes.
model_train_hyper(train_data, X, unique_intents, "rf_random")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.9min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.9min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.9min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.8min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.2min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.0min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  7.9min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.7min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.5min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.3min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  9.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.4min finished


Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [13]:
# Hand-written list of intents to evaluate with the "rf_random" models.
# NOTE(review): presumably the intents whose tuned model had been saved when
# this cell was run — confirm against the files in ./model.
trained_model = ["I-airline_name","I-arrive_time.time","I-class_type","I-cost_relative","I-depart_time.time","I-fare_amount",
                "I-fromloc.airport_name","I-fromloc.city_name","I-round_trip","I-stoploc.city_name","I-toloc.airport_name","I-toloc.city_name"]

In [14]:
# Score the saved "rf_random" models on the test set and show the confusion matrices.
con_mat_best, evaluation_best = test_model("rf_random", X_test, test_data, trained_model,unique_intents_test)
print(con_mat_best)

./model/I-airline_name_rf_random.pkl
./model/I-arrive_time.time_rf_random.pkl
./model/I-class_type_rf_random.pkl
./model/I-cost_relative_rf_random.pkl


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


./model/I-depart_time.time_rf_random.pkl
./model/I-fare_amount_rf_random.pkl
./model/I-fromloc.airport_name_rf_random.pkl
./model/I-fromloc.city_name_rf_random.pkl
./model/I-round_trip_rf_random.pkl
./model/I-stoploc.city_name_rf_random.pkl
./model/I-toloc.airport_name_rf_random.pkl
./model/I-toloc.city_name_rf_random.pkl
{'I-airline_name': array([[833,   0],
       [ 44,  16]], dtype=int64), 'I-arrive_time.time': array([[858,   1],
       [ 26,   8]], dtype=int64), 'I-class_type': array([[876,   0],
       [  2,  15]], dtype=int64), 'I-cost_relative': array([[890,   0],
       [  3,   0]], dtype=int64), 'I-depart_time.time': array([[839,   2],
       [ 48,   4]], dtype=int64), 'I-fare_amount': array([[891,   0],
       [  2,   0]], dtype=int64), 'I-fromloc.airport_name': array([[881,   1],
       [ 10,   1]], dtype=int64), 'I-fromloc.city_name': array([[725,   4],
       [133,  31]], dtype=int64), 'I-round_trip': array([[822,   0],
       [ 26,  45]], dtype=int64), 'I-stoploc.city_nam

In [15]:
# Tabulate the tuned-model metrics: one column per intent, one row per metric.
print(pd.DataFrame(evaluation_best))

           I-airline_name  I-arrive_time.time  I-class_type  I-cost_relative  \
accuracy            95.07               96.98         99.78            99.66   
f1_score            42.11               37.21         93.75             0.00   
precision          100.00               88.89        100.00             0.00   
recall              26.67               23.53         88.24             0.00   

           I-depart_time.time  I-fare_amount  I-fromloc.airport_name  \
accuracy                94.40          99.78                   98.77   
f1_score                13.79           0.00                   15.38   
precision               66.67           0.00                   50.00   
recall                   7.69           0.00                    9.09   

           I-fromloc.city_name  I-round_trip  I-stoploc.city_name  \
accuracy                 84.66         97.09                98.88   
f1_score                 31.16         77.59                 0.00   
precision                88.57 