In [7]:
import pandas as pd
from random import seed
from random import randrange
import numpy as np
import math
from collections import Counter

#define constant for representation dataframe. We add prefix to the class to avoid 
#the features having the item have the same value with class
labelAConstant = "zclass_A"
labelBConstant = "zclass_B"
labelEConstant = "zclass_E"
labelVConstant = "zclass_V"

#constant used in model dataframe
totalLabel = "total"

CONSTANT_LABEL_ABSTRACT = "abstract"
CONSTANT_LABEL_CLASS = "class"

classNameList = [labelAConstant, labelBConstant, labelEConstant, labelVConstant]

In [8]:
"""
Retrieve worlds from abstract. Manipulate words if required
"""
def retrieveWordListFromAbstract(abstract):
    stopwords = ["ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while", "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"]
    numberWords = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]
    articles = ["a", "the", "is", "many", "also", "many", "among", "present", "previously", "similar", "similarity", "highly", "several", "different", "large", "within", "high", "may", "single", "pairs", "however", "found", "contains", "first", "new", ]
    
    result = []
    words = abstract.split(" ")
    for word in words:
        word = word.replace("'", "").strip().lower()
        isInStopWords = word in stopwords or word in numberWords or word in articles
        isDigitOrEmpty = word.isdigit() or word == "" or len(word) == 1
        if not isDigitOrEmpty and  not isInStopWords:
            result.append(word)

    return result

"""
Create default dictionary data for featureList
return a dictionary with key is feature and value is 0
""" 
def createDefaultDictionaryFromFeatureList(featureList):
    result = {}
    for feature in featureList:
        result[feature] = 0
    return result    

"""
Map abstractData to an dictionary which has key is feature 
and value = 1 if that feature appear in abstractData, otherwise value = 0
"""
#transform each abstract data to attribute list
def transformData(abstractData, featureList):
    words_abstract = retrieveWordListFromAbstract(abstractData)
    unique_words = np.unique(words_abstract)
    
    #create default value for dictionary list
    dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)
    
    # assign the feature which appear in abstract data
    for index in range(0, len(unique_words)):
        word = unique_words[index]
        foundInList = word in featureList
        if foundInList:
            dictionaryResult[word] = 1            
    
    return dictionaryResult;

def transformDataForMNBC(abstractData, featureList):
    words_abstract = retrieveWordListFromAbstract(abstractData)
    #unique_words = np.unique(words_abstract)
    #get occurence of words in the list 
    common_words = Counter(words_abstract).most_common()
    
    #create default value for dictionary list
    dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)
    
    # assign the feature which appear in abstract data
    #for index in range(0, len(unique_words)):
    for index in range(0, len(common_words)):
        #word = unique_words[index]
        word = common_words[index][0]
        foundInList = word in featureList
        if foundInList:
            dictionaryResult[word] = common_words[index][1]            
    
    return dictionaryResult;

"""
Create featureList by composing the 1000 most frequency keywords in abstractDataList
"""
def get1000MostFrequencyFeatureFrom(abstractDataList):
    words = []
    for index in range(0, abstractDataList.size):
        abstract = abstractDataList[index]
        words_abstract = retrieveWordListFromAbstract(abstract)
        words.extend(words_abstract)
    occurenceList = Counter(words)
    
    #get 1000 most common objects with their occurences
    most_1000_frequency = occurenceList.most_common(100)
    
    features = np.empty(len(most_1000_frequency), dtype=object)
    # retrieve all the keywords
    for index in range(0, len(most_1000_frequency)):
        features[index] = most_1000_frequency[index][0]
    
    return features.tolist()

"""
Create featureList by composing all words which appear in abstractList
"""
def retrieveFeatureFrom(abstractDataList):
    words = []
    for index in range(0, abstractDataList.size):
        abstract = abstractDataList[index]
        words_abstract = retrieveWordListFromAbstract(abstract)
        words.extend(words_abstract)

    words = list(np.sort(np.unique(words)))
    
    return words


# Create dataset for training from featureList and abstractList
def buildTrainingDataset(classifier, featureList, abstractList):
    dataFrame = {}
    aLength = len(abstractList)
    fLength = len(featureList)
    
    #create default list
    for featureIndex in range(0, fLength):
        featureName = featureList[featureIndex]
        dataFrame[featureName] = []
        #dataFrame[featureName]= np.empty(aLength, dtype=object)
        
    for index in range(0, aLength):
        abstract = abstractList[index]
        dictionaryForAbstract = classifier.transformData(abstract, featureList)
        
        for typleFeatureItem in dictionaryForAbstract.items():
            featureName = typleFeatureItem[0]
            featureValue = typleFeatureItem[1]
            
            dataFrame[featureName].append(featureValue)
            
    return dataFrame


def buildTestingDataset(classifier, featureList, abstractList):
    dataFrame = {}
    aLength = len(abstractList)
    fLength = len(featureList)
    
    #create default list
    for featureIndex in range(0, fLength):
        featureName = featureList[featureIndex]
        dataFrame[featureName] = []
        #dataFrame[featureName]= np.empty(aLength, dtype=object)
        
    for index in range(0, aLength):
        abstract = abstractList[index]
        dictionaryForAbstract = classifier.transformData(abstract, featureList)
        
        for typleFeatureItem in dictionaryForAbstract.items():
            featureName = typleFeatureItem[0]
            featureValue = typleFeatureItem[1]
            
            dataFrame[featureName].append(featureValue)
            
    return dataFrame

def buildClass(classList):
    uniqueClassList = np.unique(classList)
    noClass = len(classList)
    resultValue = {}
    
    #assign default value
    for classValue in uniqueClassList:
        tempList = np.zeros(noClass, dtype = int)
        resultValue["zclass_"+classValue] = tempList.tolist()
        
    #assign the real value for resultValue
    for index in range(0, noClass):
        classValue = classList[index]
        resultValue["zclass_" + classValue][index] = 1
    return resultValue

In [9]:
def countFeatureAndClassByValue(featureName, featureValue, className, classValue, featureDF, classDF):
    nRows = featureDF.shape[1]
    count = 0
    for index in range(0, nRows):
        fValue = featureDF[featureName][index]
        cValue = classDF[className][index]
        if fValue == featureValue and cValue == classValue:
            count = count + 1
    return count


# Find the probability of featureName = featureValue given className = 1 with featuresDF and classDF
# featureValue = 1 or 0
def calculateProbabilityOf(featureName, featureValue, className, classValue, trainDF):
    noClass = np.count_nonzero(trainDF[className])
    condition = (trainDF[featureName] == featureValue) & (trainDF[className] == classValue)
    count = trainDF[condition].shape[0]
    
    return count/noClass;
    

def retrieveClassHasValueAtIndex(index, classDF):
    rows = classDF.iloc[index]
    columnNames = classDF.columns.values
    for className in columnNames:
        if rows[className] == 1:
            return className
    return "aa"

# retrieve total number of all the class
def getTotalInModel(model):
    totalSeries = model.loc[totalLabel]
    return totalSeries.sum()

def findMaxIndexInList(list):
    return list.index(max(list))
    

In [10]:
#define interface for classifer algorithm
class Classifer:
    def train(self, featuresDF, classDF):
        return None
    def predict(self, testFeatureDF, classList, model):
        return None
    def predictInstance(instance, classList, model):
        return None
    
#implementation of MNBC classifer    
class MNBC(Classifer):
    def transformData(self, abstractData, featureList):
        #print("MNBC transformData")
        words_abstract = retrieveWordListFromAbstract(abstractData)
        #unique_words = np.unique(words_abstract)
        #get occurence of words in the list 
        common_words = Counter(words_abstract).most_common()

        #create default value for dictionary list
        dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)

        # assign the feature which appear in abstract data
        #for index in range(0, len(unique_words)):
        for index in range(0, len(common_words)):
            #word = unique_words[index]
            word = common_words[index][0]
            foundInList = word in featureList
            if foundInList:
                dictionaryResult[word] = common_words[index][1]            

        return dictionaryResult;
    def train(self, featuresDF, classDF):
        #calculate probability of each class
        nTotal = featuresDF.shape[0]

        # calculate
        classList = classDF.columns.values.tolist()

        #calculate indexList
        featureList = featuresDF.columns.values.tolist()
        indexList = featureList.copy()
        indexList.append(totalLabel)

        #compose the target model
        result = np.zeros((len(indexList), len(classList)), dtype = int)
        resultDF = pd.DataFrame(result , columns = classList, index=indexList)

        for rowIndex in range(0, nTotal):
            #get active class for this row
            activeClassName = retrieveClassHasValueAtIndex(rowIndex, classDF)

            #calculate value for feature 
            for featureName in featureList:
                featureValue = featuresDF[featureName][rowIndex]
                resultDF[activeClassName][featureName] += featureValue

        resultDF = resultDF + 1 # increase all occurence by 1 to avoid multiple zero

        #calculate total for class
        for className in classList:
            count = classDF[(classDF[className] == 1)].shape[0]
            #each class have category for each feature. so when increasing occurence of feature by 1, we should increase total by 2
            resultDF[className][totalLabel] = count

        return resultDF

    # predict MNBC
    def predict(self, testFeatureDF, classList, model):
        testLen = testFeatureDF.shape[0]

        result = np.zeros((testLen, len(classList)), dtype = int)
        resultDF = pd.DataFrame(result , columns = classList)

        for index in range(0, testLen):
            instance = testFeatureDF.loc[index]
            classPrediction = self.predictInstance(instance, classList, model)

            #assign to resultDF
            for className in classList:
                resultDF[className][index] = classPrediction[className]

        return resultDF

    # make prediction for instance base on model
    # instance is Series object of pandas, and retrieved from the test dataframe
    # For MNBC
    def predictInstance(self, instance, classList, model):
        classLen = len(classList)
        featureLen = len(instance)
        noAllClass = model.loc["total"].sum()
        featureList = instance.index.tolist()

        probabilitiesOfFeatureForClass = np.zeros(classLen, dtype = float).tolist()
        for indexClass in range(0, classLen):
            className = classList[indexClass]
            #retrieve total number of class
            noOfClass = model[className]["total"]
            probabilityOfClass = noOfClass / noAllClass

            totalClassByFeature = model[className].sum() - noOfClass

            #retrieve probability of instance given by className
            productOfCountFeature = probabilityOfClass
            #print("productOfCountFeature ", productOfCountFeature)
            #print("totalClassByFeature ", totalClassByFeature)

            for featureName in featureList:
                featureOccurence = instance[featureName]
                #print("featureValue", model[className][featureName])
                #print("featureName", featureName)
                if featureOccurence > 0:
                    #print("featureName", featureName)
                    occurence = model[className][featureName]
                    #print("occurence", occurence)
                    productOfCountFeature = productOfCountFeature + featureOccurence*math.log(occurence/totalClassByFeature)

            #update probability of instance given class
            probabilitiesOfFeatureForClass[indexClass] = productOfCountFeature


        #print("probabilitiesOfFeatureForClass", probabilitiesOfFeatureForClass)
        maxIndex = findMaxIndexInList(probabilitiesOfFeatureForClass)
        #print("probabilitiesOfFeatureForClass ", probabilitiesOfFeatureForClass)

        #compose the result
        result = {}
        for indexClass in range(0, classLen):
            className = classList[indexClass]
            result[className] = 0
            if maxIndex == indexClass:
                result[className] = 1

        return result
        
class NBC(Classifer):
    def transformData(self, abstractData, featureList):
        words_abstract = retrieveWordListFromAbstract(abstractData)
        unique_words = np.unique(words_abstract)

        #create default value for dictionary list
        dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)

        # assign the feature which appear in abstract data
        for index in range(0, len(unique_words)):
            word = unique_words[index]
            foundInList = word in featureList
            if foundInList:
                dictionaryResult[word] = 1            

        return dictionaryResult;
    def train(self, featuresDF, classDF):
        #calculate probability of each class
        nTotal = featuresDF.shape[0]

        # calculate
        classList = classDF.columns.values
        tempList = list(map(lambda x: [x + "=0", x + "=1"], classList))
        columnsNameList = [item for sublist in tempList for item in sublist]

        #calculate indexList
        featureList = featuresDF.columns.values
        tempList = list(map(lambda x: [x + "=0", x + "=1"], featureList))
        indexList = [item for sublist in tempList for item in sublist]
        #indexList.append(totalLabel)


        #compose the target model
        result = np.zeros((len(indexList), len(columnsNameList)), dtype = int)
        resultDF = pd.DataFrame(result , columns = columnsNameList, index=indexList)

        for rowIndex in range(0, nTotal):
            #get active class for this row
            #activeClassAtIndex = retrieveClassHasValueAtIndex(rowIndex, classDF)

            #calculate value for feature 
            for featureName in featureList:
                featureValue = featuresDF[featureName][rowIndex]
                rowName = featureName + "=" + str(featureValue)
                for className in classList:
                    classValue = classDF[className][rowIndex]
                    columnName = className + "=" + str(classValue)
                    resultDF[columnName][rowName] += 1 

        resultDF = resultDF + 1 # increase all occurence by 1 to avoid multiple zero

        #calculate total for class
        #for className in columnsNameList:
        #    count = classDF[(classDF[className] == 1)].shape[0]
        #    #each class have category for each feature. so when increasing occurence of feature by 1, we should increase total by 2
        #    resultDF[className][totalLabel] = count + 2 

        #return a model
        return resultDF

    def predict(self, testFeatureDF, classList, model):
        testLen = testFeatureDF.shape[0]
        result = np.zeros((testLen, len(classList)), dtype = int)
        resultDF = pd.DataFrame(result , columns = classList)

        for index in range(0, testLen):
            instance = testFeatureDF.loc[index]
            classPrediction = self.predictInstance(instance, classList, model)

            #assign to resultDF
            for className in classList:
                resultDF[className][index] = classPrediction[className]

        return resultDF

    # make prediction for instance base on model
    # instance is Series object of pandas, and retrieved from the test dataframe
    def predictInstance(self, instance, classList, model):
        classLen = len(classList)
        featureLen = len(instance)
        #totalNo = getTotalInModel(model)
        totalNo = model.sum().sum()
        featureList = instance.index.tolist()

        #print("instance", instance)

        probabilitiesOfFeatureForClass = np.zeros(classLen, dtype = float).tolist()
        for indexClass in range(0, classLen):
            className = classList[indexClass]
            classValue = className + "=1" #calculate probability for class=1
            #retrieve total number of class
            noClass = model[classValue].sum()
            probabilityOfClass = math.log(noClass) - math.log(totalNo)

            #retrieve probability of instance given by className
            productOfCountFeature = 0

            for featureName in featureList:
                featureVal = instance[featureName]
                modelRows = featureName + "=" + str(featureVal)
                productOfCountFeature = productOfCountFeature + math.log(model[classValue][modelRows]) - math.log(totalNo)

            #update probability of instance given class
            probabilitiesOfFeatureForClass[indexClass] = productOfCountFeature - (featureLen-1)*probabilityOfClass


        #print("probabilitiesOfFeatureForClass", probabilitiesOfFeatureForClass)
        maxIndex = findMaxIndexInList(probabilitiesOfFeatureForClass)

        #compose the result
        result = {}
        for indexClass in range(0, classLen):
            className = classList[indexClass]
            result[className] = 0
            if maxIndex == indexClass:
                result[className] = 1

        return result

In [14]:
class KCrossValidation:
    
    #calculate accuracy of predicted data
    def calculateAccuracy(self, predictedDF, originalDF):
        totalNo = originalDF.shape[0]
        correctNo = 0
        for rowIndex in range(0, totalNo):
            originalRowItem = originalDF.loc[rowIndex]
            predictedRowItem = predictedDF.loc[rowIndex]
            if originalRowItem.equals(predictedRowItem):
                correctNo += 1

        return correctNo / totalNo

    def cross_validation_split(self, dataset, foldValue):
        dataset_split = list()
        dataset_copy = list(dataset)
        fold_size = int(len(dataset) / foldValue)
        for i in range(foldValue):
            fold = list()
            while len(fold) < fold_size:
                index = randrange(len(dataset_copy))
                fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
        return dataset_split

    # doing cross validating for trainingDF
    def kCrossValidate(self, classifier, trainingDF, classList, foldValue):
        featureDF = trainingDF.drop(classList, axis=1)
        classDF = trainingDF[classList]

        #create index list to make validation split
        data = list(range(trainingDF.shape[0]))
        foldIndexList = self.cross_validation_split(data, foldValue)

        result = []

        for index in range(0, len(foldIndexList)):
            foldIndexItemList = foldIndexList[index]
            trainingFeatureDF = featureDF.drop(foldIndexItemList)
            trainingFeatureDF.index = range(trainingFeatureDF.shape[0])
            trainingClassDF = classDF.drop(foldIndexItemList)
            trainingClassDF.index = range(trainingClassDF.shape[0])

            #get testing data
            testFeatureDF = featureDF.loc[foldIndexItemList]
            testFeatureDF.index = range(testFeatureDF.shape[0])
            testClassDF = classDF.loc[foldIndexItemList]
            testClassDF.index = range(testClassDF.shape[0])

            #create model
            print("calculate model at index >> ", index)
            model = classifier.train(trainingFeatureDF, trainingClassDF)

            print("make prediction ", index)
            prediction = classifier.predict(testFeatureDF, classList, model)
            accuracy = self.calculateAccuracy(prediction, testClassDF)

            print("accuracy ", accuracy)

            outputItem = {
                "model" : model,
                "accuracy": accuracy
            }

            result.append(outputItem)

        return result

    # utilities to retrieve information from model training list
    def retrieve_mean_accuracy(self, outputs):
        totalAccuracy = 0
        for index in range(0, len(outputs)):
            resultItem = outputs[index]
            totalAccuracy += resultItem["accuracy"]
        averageAccuracy = totalAccuracy / len(outputs)
        return averageAccuracy

    def retrieve_model_have_max_accuracy(self, outputs):
        accuracyList = list(map(lambda x: x["accuracy"], outputs))
        indexMax = accuracyList.index(max(accuracyList))
        return outputs[indexMax]["model"]

    def generate_prediction_output(self, predictionDF):
        result = {
            "id": list(range(predictionDF.shape[0])),
            "class": []
        }
        for index in range(0, predictionDF.shape[0]):
            rowItem = predictionDF.loc[index]
            classList = rowItem.index.tolist()
            valueList = rowItem.values.tolist()
            maxIndex = valueList.index(max(valueList))

            if classList[maxIndex] == labelAConstant:
                predictedValue = "A"
            elif classList[maxIndex] == labelBConstant:
                predictedValue = "B"
            elif classList[maxIndex] == labelEConstant:
                predictedValue = "E"
            else:
                predictedValue = "V"
            result["class"].append(predictedValue)

        return pd.DataFrame(result)

In [15]:
#pipeline for run the algorithm
def mnbc_train_model(filePath, crossValidation, classifier):
    seed()
    # build training dataset base on the csv training file
    text_df = pd.read_csv(filePath)

    print("step1 : build feature")
    featureList = get1000MostFrequencyFeatureFrom(text_df[CONSTANT_LABEL_ABSTRACT])
    #print("featureList ", featureList)

    print("step2: build training dataset from feature")
    featureDataSet = buildTrainingDataset(classifier, featureList, text_df[CONSTANT_LABEL_ABSTRACT])
    classDataSet = buildClass(text_df[CONSTANT_LABEL_CLASS])
    trainingDataSet = {**featureDataSet, **classDataSet}
    trainingDF = pd.DataFrame(trainingDataSet)

    print("step3: make crossValidate on training")
    results = crossValidation.kCrossValidate(classifier, trainingDF, classNameList, 10)

    print("step4: retrieve mean accuracy and best model")
    meanAccuracy = crossValidation.retrieve_mean_accuracy(results)

    print("meanAccuracy ", meanAccuracy)
    return {
        "results": results,
        "featureList": featureList
    }

def mnbc_predict(testFile, outputFile, model, featureList, crossValidation, classifier):
    print("step5: make prediction for test data")
    #make prediction for testing data
    testData = pd.read_csv(testFile)
    testDataSet = buildTestingDataset(classifier, featureList, testData[CONSTANT_LABEL_ABSTRACT])
    testFeatureDF = pd.DataFrame(testDataSet)
    predictionDF = classifier.predict(testFeatureDF, classNameList, model)

    print("step6: output the prediction and write to file")
    outputDF = crossValidation.generate_prediction_output(predictionDF)
    outputDF.to_csv(outputFile, index=False)

    return 0


In [17]:
# Run NBC algorithm
nbc = NBC()
kCrossValidation = KCrossValidation()

train_models = mnbc_train_model("./trg.csv", kCrossValidation, nbc)
bestModel = kCrossValidation.retrieve_model_have_max_accuracy(train_models["results"])
mnbc_predict("./tst.csv", "./output/output.csv", bestModel, train_models["featureList"], kCrossValidation, nbc)

step1 : build feature
step2: build training dataset from feature
step3: make crossValidate on training
calculate model at index >>  0
make prediction  0
accuracy  0.79
calculate model at index >>  1
make prediction  1
accuracy  0.81
calculate model at index >>  2
make prediction  2
accuracy  0.8025
calculate model at index >>  3
make prediction  3
accuracy  0.8075
calculate model at index >>  4
make prediction  4
accuracy  0.8125
calculate model at index >>  5
make prediction  5
accuracy  0.7775
calculate model at index >>  6
make prediction  6
accuracy  0.8125
calculate model at index >>  7
make prediction  7
accuracy  0.8
calculate model at index >>  8
make prediction  8
accuracy  0.79
calculate model at index >>  9
make prediction  9
accuracy  0.8
step4: retrieve mean accuracy and best model
meanAccuracy  0.8002499999999999
step5: make prediction for test data
step6: output the prediction and write to file


0

In [16]:
#Main method to run MNBC algorithm
mnbc = MNBC()
kCrossValidation = KCrossValidation()

train_models = mnbc_train_model("./trg.csv", kCrossValidation, mnbc)
bestModel = kCrossValidation.retrieve_model_have_max_accuracy(train_models["results"])
mnbc_predict("./tst.csv", "./output/output.csv", bestModel, train_models["featureList"], kCrossValidation, mnbc)

step1 : build feature
step2: build training dataset from feature
step3: make crossValidate on training
calculate model at index >>  0
make prediction  0
accuracy  0.7475
calculate model at index >>  1
make prediction  1
accuracy  0.7375
calculate model at index >>  2
make prediction  2
accuracy  0.765
calculate model at index >>  3
make prediction  3
accuracy  0.755
calculate model at index >>  4
make prediction  4
accuracy  0.7325
calculate model at index >>  5
make prediction  5
accuracy  0.725
calculate model at index >>  6
make prediction  6
accuracy  0.7825
calculate model at index >>  7
make prediction  7
accuracy  0.74
calculate model at index >>  8
make prediction  8
accuracy  0.7875
calculate model at index >>  9
make prediction  9
accuracy  0.75
step4: retrieve mean accuracy and best model
meanAccuracy  0.7522499999999999
step5: make prediction for test data
step6: output the prediction and write to file


0

In [None]:
# main program
import pandas as pd

# build training dataset base on the csv training file
text_df = pd.read_csv("./trg.csv")

print("step1 : build feature")
featureList = get1000MostFrequencyFeatureFrom(text_df[CONSTANT_LABEL_ABSTRACT])
#featureList = retrieveFeatureFrom(text_df["abstract"])

print("step2: build training dataset from feature")
featureDataSet = buildTrainingDataset(featureList, text_df[CONSTANT_LABEL_ABSTRACT])
classDataSet = buildClass(text_df[CONSTANT_LABEL_CLASS])
trainingDataSet = {**featureDataSet, **classDataSet}
trainingDF = pd.DataFrame(trainingDataSet)

print("step3: make crossValidate on training")
results = kCrossValidate(trainingDF, classNameList, 10)

print("step4: retrieve mean accuracy and best model")
meanAccuracy = retrieve_mean_accuracy(results)
bestModel = retrieve_model_have_max_accuracy(results)

print("meanAccuracy ", meanAccuracy)

print("step5: make prediction for test data")
#make prediction for testing data
testData = pd.read_csv("./tst.csv")
testDataSet = buildTestingDataset(featureList, testData[CONSTANT_LABEL_ABSTRACT])
testFeatureDF = pd.DataFrame(testDataSet)
predictionDF = predict(testFeatureDF, classNameList, bestModel)

print("step6: output the prediction and write to file")
outputDF = generate_prediction_output(predictionDF)
outputDF.to_csv("./output/output.csv", index=False)