In [1]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [2]:
def readData(trainPath,testPath):
    train=[]
    test=[]
    train_label=[]
    test_label=[]
    complete=[]
    for line in open(trainPath):
        temp=line.split("\t")
        train_label.append(temp[0])
        train.append(temp[1].strip())

    for line in open(testPath):
        temp=line.split("\t")
        test_label.append(temp[0])
        test.append(temp[1].strip())
    return train,test,train_label,test_label
    
#stop words removal
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


In [3]:

def featuresPipeline(data,vector):
    #NER
    #####END OF NER ######
    
    #VECTORIZER
    features=vector.transform(data)
    return features

def parseData(trainData=None,trainLabel=None,testData=None,testLabel=None,vector=None,labelEncoder=None):
    
    training=False
    #check if training data available if yes setup for preprocessing
    if trainData!=None:
        stopwords_nltk = set(stopwords.words("english"))
        relevant_words = set(['not', 'nor', 'no', 'wasn', 'ain', 'aren', 'very', 'only', 'but', 'don', 'isn', 'weren'])
        stopwords_filtered = list(stopwords_nltk.difference(relevant_words))
        vectorizer = CountVectorizer(stop_words =  stopwords_filtered, max_features = 5000, ngram_range = (1,3))
        vector = vectorizer.fit(trainData+testData)
        labelEncoder=preprocessing.LabelEncoder()
        labelEncoder=labelEncoder.fit(trainLabel+testLabel)
    
    #transform training Data 
    if trainData !=None:
        training=True
        train_features=featuresPipeline(trainData,vector)
        train_encoded_labels=labelEncoder.transform(train_label)
    
    #transform test Data
    test_features=featuresPipeline(testData,vector)
    test_encoded_labels=[]
    if testLabel!=None:
        test_encoded_labels=labelEncoder.transform(test_label)
    
    if training:
        return (train_features,train_encoded_labels),(test_features,test_encoded_labels),vector,labelEncoder
    
    return (test_features,test_encoded_labels)
        

def trainModel(data):
    clf = svm.LinearSVC()
    clf.fit(data[0], data[1])
    return clf

def prediction(data,model):
    predict = model.predict(data)
    return predict

def performanceMetric(data,model):
    
    predict=prediction(data[0],model)
    test_encoded_labels=data[1]
    print "accuracy: ",accuracy_score(test_encoded_labels,predict)
    print "F1-Score ",f1_score(test_encoded_labels,predict,average='macro')
#     print "Precision ", precision_score(test_encoded_labels,predict,average='macro')
#     print "Recall ", recall_score(test_encoded_labels,predict,average='macro')    
#     print confusion_matrix(test_encoded_labels,predict)

def getPredictionPipepline(testDataList,model,vector,labelEncoder):
    
    testFeatures=parseData(testData=testDataList,vector=vector)
    predict=prediction(testFeatures[0],model)
    label=labelEncoder.inverse_transform(predict)
    return label

In [4]:
train,test,train_label,test_label=readData("Data/intentClassificationData_Train","Data/intentClassificationData_Test")
trainData,testData,vector,labelEncoder=parseData(trainData=train,trainLabel=train_label,testData=test,testLabel=test_label)
model=trainModel(trainData)

In [5]:
performanceMetric(testData,model)

accuracy:  0.941769316909
F1-Score  0.65600516971


  'precision', 'predicted', average, warn_for)


In [7]:
testDataList=["how far is it from orlando airport to orlando"]
getPredictionPipepline(testDataList,model,vector,labelEncoder)[0]

'atis_distance'

In [8]:
testDataList=["which flights leave chicago next tuesday and arrive in detroit around 6 pm"]
getPredictionPipepline(testDataList,model,vector,labelEncoder)[0]

'atis_flight'

In [9]:
#TRUE LABEL atis_flight
testDataList=["show me the connecting flights between boston and denver and the types of aircraft used"]
getPredictionPipepline(testDataList,model,vector,labelEncoder)[0]

'atis_aircraft'

In [10]:
#TRUE LABEL atis_airline
testDataList=["show me airlines that have flights between toronto and detroit between detroit and st. louis and between st. louis and toronto"]
getPredictionPipepline(testDataList,model,vector,labelEncoder)[0]

'atis_flight'