In [210]:
import pandas as pd

# Column names of the one-hot encoded class labels ("zclass_" + class letter,
# the same prefix that buildClass puts in front of every class value).
labelAConstant = "zclass_A"
labelBConstant = "zclass_B"
labelEConstant = "zclass_E"
labelVConstant = "zclass_V"

# Row label that getTotalInModel looks up in the model dataframe.
totalLabel = "total"

# Column names read from the training/testing CSV files.
CONSTANT_LABEL_ABSTRACT = "abstract"
CONSTANT_LABEL_CLASS = "class"

# Every class column name, in the order used when predicting.
classNameList = [labelAConstant, labelBConstant, labelEConstant, labelVConstant]

# Display the first 10 rows
#text_df = pd.read_csv("./trg.csv")
#text_df.head(10)



In [211]:
from collections import Counter
import numpy as np

# Stop words are kept in a frozenset that is built once at import time, so the
# membership test in retrieveWordListFromAbstract is O(1) per word.
_STOPWORDS = frozenset([
    "ourselves", "hers", "between", "yourself", "but", "again", "there",
    "about", "once", "during", "out", "very", "having", "with", "they", "own",
    "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of",
    "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as",
    "from", "him", "each", "the", "themselves", "until", "below", "are", "we",
    "these", "your", "his", "through", "don", "nor", "me", "were", "her",
    "more", "himself", "this", "down", "should", "our", "their", "while",
    "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when",
    "at", "any", "before", "them", "same", "and", "been", "have", "in", "will",
    "on", "does", "yourselves", "then", "that", "because", "what", "over",
    "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself",
    "has", "just", "where", "too", "only", "myself", "which", "those", "i",
    "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a",
    "by", "doing", "it", "how", "further", "was", "here", "than",
])


def retrieveWordListFromAbstract(abstract):
    """Retrieve the words of an abstract, normalised and without stop words.

    The abstract is split on single space characters; every piece is stripped
    of surrounding whitespace and lower-cased. Empty pieces and stop words are
    dropped. Duplicates and the original word order are kept.

    Args:
        abstract: the abstract text (a string).

    Returns:
        A list of lower-cased words.
    """
    tokens = (token.strip().lower() for token in abstract.split(" "))
    return [token for token in tokens if token and token not in _STOPWORDS]

def createDefaultDictionaryFromFeatureList(featureList):
    """Create the default dictionary for featureList.

    Returns a new dictionary whose keys are the features (in the order they
    first appear in featureList) and whose values are all 0.
    """
    return dict.fromkeys(featureList, 0)

def transformData(abstractData, featureList):
    """Map one abstract to a feature -> 0/1 presence dictionary.

    Args:
        abstractData: the abstract text.
        featureList: the features (words) to look for.

    Returns:
        A dictionary with one key per feature; the value is 1 when the feature
        appears among the words retrieved from abstractData, otherwise 0.
    """
    # Start with every feature marked as absent.
    dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)

    # Look each distinct word up in the dictionary (constant time) instead of
    # scanning featureList once per word.
    for word in set(retrieveWordListFromAbstract(abstractData)):
        if word in dictionaryResult:
            dictionaryResult[word] = 1

    return dictionaryResult

def get1000MostFrequencyFeatureFrom(abstractDataList, featureCount=100):
    """Create a feature list from the most frequent words of the abstracts.

    Despite the function name, the number of features defaults to 100 (the
    value the original implementation hard-coded); pass featureCount to
    choose another size.

    Args:
        abstractDataList: an iterable of abstract texts (for example a pandas
            Series or a list). It is iterated by value, so the index labels of
            a Series do not matter.
        featureCount: how many of the most frequent words to keep.

    Returns:
        A list with at most featureCount words, most frequent first.
    """
    occurenceList = Counter()
    for abstract in abstractDataList:
        occurenceList.update(retrieveWordListFromAbstract(abstract))

    # most_common yields (word, count) pairs; only the words are needed.
    return [word for word, _ in occurenceList.most_common(featureCount)]

def retrieveFeatureFrom(abstractDataList):
    """Create a feature list from every word that appears in the abstracts.

    Args:
        abstractDataList: an iterable of abstract texts (for example a pandas
            Series or a list). It is iterated by value, so the index labels of
            a Series do not matter.

    Returns:
        A sorted list of the distinct words.
    """
    words = []
    for abstract in abstractDataList:
        words.extend(retrieveWordListFromAbstract(abstract))

    # np.unique already returns its result sorted, so no extra sort is needed.
    return list(np.unique(words))


def buildTrainingDataset(featureList, abstractList):
    """Create the training dataset from featureList and abstractList.

    Args:
        featureList: the features (words) that become the columns.
        abstractList: an iterable of abstract texts (for example a pandas
            Series or a list). It is iterated by value, so the index labels of
            a Series do not matter.

    Returns:
        A dictionary mapping each feature name to a list with one 0/1 entry
        per abstract, in the order of abstractList.
    """
    dataFrame = {featureName: [] for featureName in featureList}

    for abstract in abstractList:
        dictionaryForAbstract = transformData(abstract, featureList)
        for featureName, featureValue in dictionaryForAbstract.items():
            dataFrame[featureName].append(featureValue)

    return dataFrame


def buildTestingDataset(featureList, abstractList):
    """Create the testing dataset from featureList and abstractList.

    Testing data is encoded exactly like training data, so this delegates to
    buildTrainingDataset instead of duplicating its body.

    Args:
        featureList: the features (words) that become the columns.
        abstractList: the abstract texts to encode.

    Returns:
        A dictionary mapping each feature name to a list with one 0/1 entry
        per abstract, in the order of abstractList.
    """
    return buildTrainingDataset(featureList, abstractList)

def buildClass(classList):
    """One-hot encode the class labels.

    Args:
        classList: an iterable of class labels (strings), for example a pandas
            Series or a list. It is iterated by value, so the index labels of
            a Series do not matter.

    Returns:
        A dictionary with one key "zclass_<label>" per distinct label (in
        sorted label order). Each value is a list with one entry per input
        element: 1 where the element has that label, otherwise 0.
    """
    classValues = list(classList)
    noClass = len(classValues)

    # One all-zero column per distinct class.
    resultValue = {
        "zclass_" + classValue: [0] * noClass
        for classValue in sorted(set(classValues))
    }

    # Mark the class of every row.
    for index, classValue in enumerate(classValues):
        resultValue["zclass_" + classValue][index] = 1
    return resultValue

In [251]:
import pandas as pd
import numpy as np
import math

def countFeatureAndClassByValue(featureName, featureValue, className, classValue, featureDF, classDF):
    """Count the rows where a feature and a class column have given values.

    Rows are matched by position, so featureDF and classDF must have the same
    number of rows.

    Args:
        featureName: column of featureDF to inspect.
        featureValue: value the feature column must have.
        className: column of classDF to inspect.
        classValue: value the class column must have.
        featureDF: dataframe holding the feature columns.
        classDF: dataframe holding the class columns.

    Returns:
        The number of rows (as an int) where both conditions hold.
    """
    # The original looped over featureDF.shape[1] (the number of columns)
    # and therefore only inspected the first few rows; compare whole columns.
    featureMatches = featureDF[featureName].to_numpy() == featureValue
    classMatches = classDF[className].to_numpy() == classValue
    return int(np.count_nonzero(featureMatches & classMatches))


def calculateProbabilityOf(featureName, featureValue, className, classValue, trainDF):
    """Estimate P(featureName == featureValue | className == classValue).

    Args:
        featureName: feature column of trainDF.
        featureValue: value of the feature (1 or 0).
        className: class column of trainDF.
        classValue: value of the class column to condition on.
        trainDF: dataframe holding both the feature and the class columns.

    Returns:
        The fraction of the rows with className == classValue that also have
        featureName == featureValue, or 0.0 when no row has that class value.
    """
    # Condition on the requested class value. The original always divided by
    # the number of non-zero class entries, which is only right for
    # classValue == 1 on a 0/1 column.
    classMatches = trainDF[className] == classValue
    noClass = int(classMatches.sum())
    if noClass == 0:
        return 0.0

    count = int(((trainDF[featureName] == featureValue) & classMatches).sum())
    return count / noClass
    

def retrieveClassHasValueAtIndex(index, classDF):
    """Return the name of the first class column equal to 1 in a row.

    Args:
        index: position of the row in classDF.
        classDF: dataframe with one column per class.

    Returns:
        The column name of the first column whose value is 1 in that row, or
        0 when no column has the value 1.
    """
    rowValues = classDF.iloc[index]
    activeClasses = (
        name for name in classDF.columns.values if rowValues.at[name] == 1
    )
    return next(activeClasses, 0)

def getTotalInModel(model):
    """Return the sum of the model row labelled totalLabel (total of all classes)."""
    return model.loc[totalLabel].sum()

def findMaxIndexInList(list):
    """Return the position of the first occurrence of the largest element."""
    largest = max(list)
    return list.index(largest)
    
def train(featuresDF, classDF):
    """Train the model: a table of smoothed feature/class co-occurrence counts.

    Args:
        featuresDF: dataframe of 0/1 feature columns, one row per sample.
        classDF: dataframe of 0/1 class columns with the same number of rows;
            rows are matched with featuresDF by position.

    Returns:
        An integer dataframe with one row per "<feature>=0" / "<feature>=1"
        and one column per "<class>=0" / "<class>=1". Each cell is the number
        of samples with that feature value and that class value, plus 1.

    Raises:
        KeyError: when a feature or class value is something other than 0/1.
    """
    classList = classDF.columns.values
    featureList = featuresDF.columns.values

    columnsNameList = [name + "=" + value for name in classList for value in ("0", "1")]
    indexList = [name + "=" + value for name in featureList for value in ("0", "1")]

    # Counts are accumulated in a plain numpy array addressed by position.
    # The original incremented through chained indexing
    # (resultDF[column][row] += 1), which pandas does not guarantee to write
    # back into the dataframe.
    columnPosition = {label: position for position, label in enumerate(columnsNameList)}
    rowPosition = {label: position for position, label in enumerate(indexList)}
    counts = np.zeros((len(indexList), len(columnsNameList)), dtype=int)

    # Pull every column out once instead of indexing the dataframes per cell.
    featureColumns = [(name, featuresDF[name].to_numpy()) for name in featureList]
    classColumns = [(name, classDF[name].to_numpy()) for name in classList]

    for rowIndex in range(featuresDF.shape[0]):
        # One "<class>=<value>" column per class is active for this sample.
        activeColumns = [
            columnPosition[name + "=" + str(values[rowIndex])]
            for name, values in classColumns
        ]
        for name, values in featureColumns:
            activeRow = rowPosition[name + "=" + str(values[rowIndex])]
            counts[activeRow, activeColumns] += 1

    # Increase every occurrence by 1 so that no count is zero (its logarithm
    # is taken when predicting).
    return pd.DataFrame(counts + 1, columns=columnsNameList, index=indexList)

def predict(testFeatureDF, classList, model):
    """Predict the class of every row of testFeatureDF.

    Args:
        testFeatureDF: dataframe of feature columns, one row per sample.
        classList: the class names to choose from.
        model: the model passed on to predictInstance.

    Returns:
        An integer dataframe with one row per sample (default 0..n-1 index)
        and one column per class; the entries are the 0/1 values returned by
        predictInstance for that sample.
    """
    testLen = testFeatureDF.shape[0]
    result = np.zeros((testLen, len(classList)), dtype=int)

    for index in range(testLen):
        # Rows are taken by position so any index labels work.
        instance = testFeatureDF.iloc[index]
        classPrediction = predictInstance(instance, classList, model)

        # Fill the numpy row directly; the original wrote through chained
        # indexing (resultDF[className][index] = ...), which pandas does not
        # guarantee to write back into the dataframe.
        result[index] = [classPrediction[className] for className in classList]

    return pd.DataFrame(result, columns=classList)

def predictInstance(instance, classList, model):
    """Make a prediction for one instance based on the model.

    Args:
        instance: pandas Series taken from the test dataframe; its index holds
            the feature names and its values the feature values.
        classList: the class names to choose from.
        model: dataframe of counts with "<feature>=<value>" rows and
            "<class>=<value>" columns.

    Returns:
        A dictionary with one key per class name: 1 for the class with the
        highest score (the first one on ties), 0 for every other class.
    """
    featureNames = instance.index.tolist()
    featureLen = len(instance)
    totalNo = model.sum().sum()

    scores = []
    for className in classList:
        # Only the "<class>=1" column is used to score a class.
        classColumn = className + "=1"
        logClass = math.log(model[classColumn].sum()) - math.log(totalNo)

        # Sum of log(count / total) over the feature values of the instance.
        logFeatures = 0
        for featureName in featureNames:
            rowLabel = featureName + "=" + str(instance[featureName])
            logFeatures = logFeatures + math.log(model[classColumn][rowLabel]) - math.log(totalNo)

        scores.append(logFeatures - (featureLen - 1) * logClass)

    winner = findMaxIndexInList(scores)

    # 1 for the winning class, 0 for all the others.
    return {
        className: (1 if position == winner else 0)
        for position, className in enumerate(classList)
    }

def calculateAccuracy(predictedDF, originalDF):
    """Calculate the accuracy of the predicted data.

    Rows are compared by position with Series.equals, so a row only counts as
    correct when every column matches.

    Args:
        predictedDF: dataframe of predicted class columns.
        originalDF: dataframe of the true class columns, same shape.

    Returns:
        The fraction of rows of originalDF that equal the row at the same
        position of predictedDF; 0.0 when originalDF has no rows.
    """
    totalNo = originalDF.shape[0]
    if totalNo == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    correctNo = sum(
        1
        for rowIndex in range(totalNo)
        if originalDF.iloc[rowIndex].equals(predictedDF.iloc[rowIndex])
    )
    return correctNo / totalNo

def cross_validation_split(dataset, foldValue):
    """Split a dataset into foldValue random folds of equal size.

    Every fold gets int(len(dataset) / foldValue) items drawn at random
    without replacement; items that are left over belong to no fold.

    Args:
        dataset: the items to distribute (it is copied, not modified).
        foldValue: the number of folds.

    Returns:
        A list with foldValue lists of items.
    """
    # Imported here because the cell that defines this function does not
    # import randrange itself.
    from random import randrange

    remaining = list(dataset)
    fold_size = int(len(dataset) / foldValue)

    dataset_split = []
    for _ in range(foldValue):
        fold = [remaining.pop(randrange(len(remaining))) for _item in range(fold_size)]
        dataset_split.append(fold)
    return dataset_split

def kCrossValidate(trainingDF, classList, foldValue):
    """Do k-fold cross validation on trainingDF.

    For every fold a model is trained on the remaining rows and evaluated on
    the rows of the fold; the accuracy of each fold is printed.

    Args:
        trainingDF: dataframe holding feature columns and class columns. Fold
            members are row positions used as index labels, so trainingDF is
            expected to have the default 0..n-1 index.
        classList: names of the class columns of trainingDF.
        foldValue: the number of folds.

    Returns:
        A list with one dictionary per fold: {"model": ..., "accuracy": ...}.
    """
    featureDF = trainingDF.drop(classList, axis=1)
    classDF = trainingDF[classList]

    # Split the row positions into folds.
    foldIndexList = cross_validation_split(list(range(trainingDF.shape[0])), foldValue)

    result = []
    for foldIndexItemList in foldIndexList:
        # Training data: every row outside the fold, renumbered from 0.
        trainingFeatureDF = featureDF.drop(foldIndexItemList).reset_index(drop=True)
        trainingClassDF = classDF.drop(foldIndexItemList).reset_index(drop=True)

        # Testing data: the rows of the fold, renumbered from 0.
        testFeatureDF = featureDF.loc[foldIndexItemList].reset_index(drop=True)
        testClassDF = classDF.loc[foldIndexItemList].reset_index(drop=True)

        model = train(trainingFeatureDF, trainingClassDF)

        # Predict for the classes this function was called with. The original
        # used the global classNameList here and ignored classList.
        prediction = predict(testFeatureDF, classList, model)
        accuracy = calculateAccuracy(prediction, testClassDF)

        print("accuracy ", accuracy)

        result.append({"model": model, "accuracy": accuracy})

    return result

def retrieve_mean_accuracy(outputs):
    """Return the mean accuracy of a list of cross-validation outputs.

    Args:
        outputs: list of dictionaries that each have an "accuracy" entry.

    Returns:
        The average of the "accuracy" values; 0.0 when outputs is empty.
    """
    if not outputs:
        return 0.0
    # Divide by the number of outputs passed in. The original divided by the
    # length of an unrelated global named `result`.
    return sum(item["accuracy"] for item in outputs) / len(outputs)

def retrieve_model_have_max_accuracy(outputs):
    """Return the "model" of the output with the highest "accuracy".

    When several outputs share the highest accuracy, the first one wins.
    """
    bestOutput = max(outputs, key=lambda item: item["accuracy"])
    return bestOutput["model"]

def generate_prediction_output(predictionDF):
    """Turn a 0/1 prediction dataframe into an (id, class) dataframe.

    For every row the column holding the largest value (the first one on
    ties) decides the class letter: "A", "B" or "E" for the matching label
    constants, "V" for anything else.

    Args:
        predictionDF: dataframe with one column per class and index labels
            0..n-1.

    Returns:
        A dataframe with the columns "id" (0..n-1) and "class" (the letter).
    """
    letterByLabel = {
        labelAConstant: "A",
        labelBConstant: "B",
        labelEConstant: "E",
    }
    rowCount = predictionDF.shape[0]

    predictedClasses = []
    for index in range(rowCount):
        rowItem = predictionDF.loc[index]
        valueList = rowItem.values.tolist()
        winningLabel = rowItem.index.tolist()[valueList.index(max(valueList))]
        # Every label other than A, B and E is reported as "V".
        predictedClasses.append(letterByLabel.get(winningLabel, "V"))

    return pd.DataFrame({"id": list(range(rowCount)), "class": predictedClasses})
    

In [252]:
# Pick the model with the highest cross-validation accuracy.
# `result` and `featureList` are not defined in this cell; they must already
# exist from cells that were run earlier.
bestModel = retrieve_model_have_max_accuracy(result)
#print(bestModel)
# Encode the test abstracts with the same feature list and predict their class.
testData = pd.read_csv("./tst.csv")
testDataSet = buildTestingDataset(featureList, testData[CONSTANT_LABEL_ABSTRACT])
testFeatureDF = pd.DataFrame(testDataSet)
#testFeatureDF

# Convert the 0/1 predictions to class letters, print them and write the CSV.
predictionDF = predict(testFeatureDF, classNameList, bestModel)
outputDF = generate_prediction_output(predictionDF)
print(outputDF)
outputDF.to_csv("./output/output.csv", index=False)


      id class
0      0     B
1      1     E
2      2     V
3      3     E
4      4     E
..   ...   ...
995  995     B
996  996     E
997  997     B
998  998     B
999  999     B

[1000 rows x 2 columns]


In [213]:
# Main program: build features, cross-validate, then predict the test file.
import pandas as pd

# Build the training dataset from the training CSV file.
text_df = pd.read_csv("./trg.csv")

print("step1 : build feature")
# Features are the most frequent words of the training abstracts.
featureList = get1000MostFrequencyFeatureFrom(text_df[CONSTANT_LABEL_ABSTRACT])
# Alternative: use every word as a feature.
#featureList = retrieveFeatureFrom(text_df["abstract"])

print("step2: build training dataset from feature")
# Feature columns and one-hot class columns are merged into one dataframe.
featureDataSet = buildTrainingDataset(featureList, text_df[CONSTANT_LABEL_ABSTRACT])
classDataSet = buildClass(text_df[CONSTANT_LABEL_CLASS])
trainingDataSet = {**featureDataSet, **classDataSet}
trainingDF = pd.DataFrame(trainingDataSet)

print("step3: make crossValidate on training")
# 9-fold cross validation; each fold yields a model and its accuracy.
results = kCrossValidate(trainingDF, classNameList, 9)

print("step4: retrieve mean accuracy and best model")
meanAccuracy = retrieve_mean_accuracy(results)
bestModel = retrieve_model_have_max_accuracy(results)

print("meanAccuracy ", meanAccuracy)

print("step5: make prediction for test data")
# Encode the test abstracts with the same feature list and predict with the best model.
testData = pd.read_csv("./tst.csv")
testDataSet = buildTestingDataset(featureList, testData[CONSTANT_LABEL_ABSTRACT])
testFeatureDF = pd.DataFrame(testDataSet)
predictionDF = predict(testFeatureDF, classNameList, bestModel)

print("step6: output the prediction and write to file")
# Written without the row index.
outputDF = generate_prediction_output(predictionDF)
outputDF.to_csv("./output/output.csv", index=False)

step1 : build feature
step2: build training dataset from feature
step3: make crossValidate on training
accuracy  0.8175675675675675
accuracy  0.8018018018018018
accuracy  0.8063063063063063
accuracy  0.8220720720720721
accuracy  0.8220720720720721
accuracy  0.8108108108108109
accuracy  0.8175675675675675
accuracy  0.8175675675675675
accuracy  0.8063063063063063
step4: retrieve mean accuracy
averageAccuracy  0.8135635635635635


In [199]:
# Run 9-fold cross validation on training_df (loaded in another cell).
result = kCrossValidate(training_df, classNameList, 9)

# Average the per-fold accuracies by hand.
totalAccuracy = 0
for index in range(0, len(result)):
    resultItem = result[index]
    totalAccuracy += resultItem["accuracy"]
averageAccuracy = totalAccuracy / len(result)
print("averageAccuracy ", averageAccuracy)

accuracy  0.3333333333333333
accuracy  0.6666666666666666
accuracy  0.6666666666666666
accuracy  0.3333333333333333
accuracy  0.3333333333333333
accuracy  0.0
accuracy  0.3333333333333333
accuracy  0.0
accuracy  1.0
averageAccuracy  0.4074074074074074


In [151]:
import pandas as pd

# Train a model on the small sample dataset.
training_df = pd.read_csv("./data/test_data.csv")
training_df.shape

cross_validation_training_feature_df = training_df.drop(classNameList, axis=1)
cross_validation_traing_class_df = training_df[classNameList]

model = train(cross_validation_training_feature_df, cross_validation_traing_class_df)

# Evaluate on the frame loaded as test data. The original loaded test_df but
# then took the test features and classes from training_df, leaving test_df
# unused. Both frames currently read the same file.
test_df = pd.read_csv("./data/test_data.csv")
cross_validation_test_feature_df = test_df.drop(classNameList, axis=1)
cross_validation_test_class_df = test_df[classNameList]

prediction = predict(cross_validation_test_feature_df, classNameList, model)
print("prediction")
print(prediction)

accuracy = calculateAccuracy(prediction, cross_validation_test_class_df)
accuracy

instance HOMOLOGY    0
HERE        0
USING       0
Name: 0, dtype: int64
probabilitiesOfFeatureForClass  [-8.46632086104248, -5.885867317044979, -6.096013544762939, -8.296421824247084]
instance HOMOLOGY    1
HERE        0
USING       0
Name: 1, dtype: int64
probabilitiesOfFeatureForClass  [-8.46632086104248, -7.352204385838404, -8.660962902224476, -8.989569004807027]
instance HOMOLOGY    0
HERE        0
USING       0
Name: 2, dtype: int64
probabilitiesOfFeatureForClass  [-8.46632086104248, -5.885867317044979, -6.096013544762939, -8.296421824247084]
instance HOMOLOGY    0
HERE        0
USING       0
Name: 3, dtype: int64
probabilitiesOfFeatureForClass  [-8.46632086104248, -5.885867317044979, -6.096013544762939, -8.296421824247084]
instance HOMOLOGY    0
HERE        0
USING       0
Name: 4, dtype: int64
probabilitiesOfFeatureForClass  [-8.46632086104248, -5.885867317044979, -6.096013544762939, -8.296421824247084]
instance HOMOLOGY    0
HERE        1
USING       0
Name: 5, dtype: int64
pr

0.5517241379310345

In [145]:
model

Unnamed: 0,zclass_A=0,zclass_A=1,zclass_B=0,zclass_B=1,zclass_E=0,zclass_E=1,zclass_V=0,zclass_V=1
HOMOLOGY=0,26,2,15,13,15,13,26,2
HOMOLOGY=1,3,2,2,3,4,1,4,1
HERE=0,24,3,14,13,16,11,25,2
HERE=1,5,1,3,3,3,3,5,1
USING=0,26,2,13,15,17,11,26,2
USING=1,3,2,4,1,2,3,4,1


In [165]:
from random import seed
from random import randrange
 
def cross_validation_split(dataset, folds=3):
    """Split a dataset into `folds` random folds of equal size.

    Every fold receives int(len(dataset) / folds) items drawn at random
    without replacement from a copy of dataset; leftover items belong to no
    fold.
    """
    remaining = list(dataset)
    fold_size = int(len(dataset) / folds)

    dataset_split = []
    for _ in range(folds):
        fold = [remaining.pop(randrange(len(remaining))) for _item in range(fold_size)]
        dataset_split.append(fold)
    return dataset_split
 
# test cross validation split
#seed(1)
#dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
#folds = cross_validation_split(dataset, 3)
#print(folds)


# Try the split on 10 items and 3 folds: the fold size is int(10 / 3) = 3,
# so one item ends up in no fold.
data = range(10)

folds = cross_validation_split(list(data), 3)
print(len(folds))  # number of folds
print(len(folds[0]))  # size of the first fold
print(folds)

3
3
[[5, 4, 6], [8, 2, 9], [7, 3, 1]]


In [167]:
import random

# Seeding with the same value makes random.random() return the same number.
random.seed(10)
print(random.random())

random.seed(10)
print(random.random())

0.5714025946899135
0.5714025946899135


In [66]:
# Ten zeros with a single 1 at position 1; the maximum is therefore 1.0.
aa = np.zeros(10, dtype = float)
aa[1] = 1
aa.max()

1.0

In [143]:
#featureTest = ["a1", "a2", "a3"]
#defaultValues = createDefaultDictionaryFromFeatureList(featureTest)
#defaultValues

# Build the feature list from the most frequent words of the training
# abstracts (text_df is loaded in another cell).
featureList = get1000MostFrequencyFeatureFrom(text_df["abstract"])
#featureList = retrieveFeatureFrom(text_df["abstract"])
#print("featureList", featureList)

# Training dataframe: feature columns plus one-hot class columns.
featureDataSet = buildTrainingDataset(featureList, text_df["abstract"])
classDataSet = buildClass(text_df["class"])
trainingDataSet = {**featureDataSet, **classDataSet}
df = pd.DataFrame(trainingDataSet)

# Testing dataframe: feature columns only.
testData = pd.read_csv("./tst.csv")
testDataSet = buildTestingDataset(featureList, testData["abstract"])
tf = pd.DataFrame(testDataSet)

# Save both encoded datasets without the row index.
df.to_csv("./data/training_100_1_feature_data.csv", index=False)
tf.to_csv("./data/test_100_1_feature_data.csv", index=False)

#df
#df.to_csv("./training_data.csv", index=False)
#print("test", test)

#transform data for abstract data
#dictionaryList = transformData(text_df["abstract"][0], featureList)
#dictionaryList
#df = pd.DataFrame(dictionaryList)

#test = retrieveFeatureFrom(text_df["abstract"])
#len(test)

In [341]:
import numpy as np 
import pandas as pd

# Collect every word of every abstract in the copy of the training file.
text_df1 = pd.read_csv("./trg_copy.csv")
words = []

for i in range(0, text_df1.shape[0]):
    abstract = text_df1["abstract"][i]
    words_abstract = abstract.split(" ") # split on single space characters
    words_abstract = [x.strip() for x in words_abstract]
    words_abstract = [ x for x in words_abstract if x != ""]  # drop empty pieces
    
    words.extend(words_abstract)

# Keep each distinct word once, sorted.
#words = list(np.unique(np.sort(words)))    
words = list(np.sort(np.unique(words)))
words
#text_df1["abstract"]

["'",
 "'87",
 "'a",
 "'a'",
 "'addiction'",
 "'ancient'",
 "'b'",
 "'bird",
 "'birds",
 "'c3-type'",
 "'captive'",
 "'chaperonins'",
 "'classical'",
 "'core'",
 "'cysteine-rich'",
 "'deadh'",
 "'destabilization",
 "'dinosaur",
 "'ets",
 "'feral'",
 "'heat-shock'",
 "'hitch-hiking'",
 "'housekeeping'",
 "'immune'",
 "'inflammatory",
 "'insert'",
 "'inserted'",
 "'king",
 "'lcp30'",
 "'lego",
 "'lightning-strike",
 "'mdr-like'",
 "'motor",
 "'mr",
 "'neutralized'",
 "'occluding",
 "'open",
 "'open'",
 "'plasticity",
 "'processing'",
 "'pstaire'",
 "'shotgun'",
 "'silent'",
 "'tail'",
 "'tissue-specific'",
 "'transcriptional",
 "'ttp'",
 "'uptake'",
 "'wolf'",
 '-',
 '--',
 '-1',
 '-10',
 '-1055',
 '-11',
 '-116',
 '-124',
 '-126',
 '-130',
 '-13548',
 '-1363',
 '-13859',
 '-14',
 '-142',
 '-1421',
 '-147',
 '-149',
 '-14q31',
 '-15',
 '-1561',
 '-16',
 '-17',
 '-18',
 '-180',
 '-19',
 '-1a2',
 '-1a2-like',
 '-2',
 '-20',
 '-200',
 '-21',
 '-210',
 '-213',
 '-215',
 '-219',
 '-220',
 '-2

In [343]:
# One row per abstract, one 0/1 column per word plus four class columns.
text_boolean_df = pd.DataFrame(
    np.zeros((text_df1.shape[0], len(words) + 4), dtype = int),
    columns = words + ["zzA", "zzB", "zzE", "zzV"],
)

# Class letter -> class column; any other letter goes to "zzV".
classColumnByValue = {"A": "zzA", "B": "zzB", "E": "zzE"}

# Only the first row is filled in here (range(0, 1)).
for index in range(0, 1):
    classValue = text_df1["class"][index]
    # Written with .loc, like the word columns below. The original used
    # chained indexing (text_boolean_df["zzA"][index] = 1), which pandas does
    # not guarantee to write back into the dataframe.
    text_boolean_df.loc[index, classColumnByValue.get(classValue, "zzV")] = 1

# Mark every word of the abstract as present (again only the first row).
for i in range(0, 1):
    abstract = text_df1["abstract"][i]
    words_abstract = [x.strip() for x in abstract.split(" ")]  # split on single spaces
    words_abstract = [x for x in words_abstract if x != ""]
    for word in words_abstract:
        text_boolean_df.loc[i, word] = 1
text_boolean_df.head()
#text_boolean_df.to_csv("./test_123.csv", index=False)

Unnamed: 0,','87,'a,'a','addiction','ancient','b','bird,'birds,'c3-type',...,zymogen,zymogen-type,zymogens,zymogram,zymography,zymomonas,zzA,zzB,zzE,zzV
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Small tips for working with Python. This cell is a collection of snippets;
# many of them use names (words, text_df, training_df, model, ...) that are
# defined in other cells.

"""
work with list
"""
# check whether an object is in a list
fruits = ['apple', 'banana', 'cherry']
found = not ("cherry1" in fruits)
found # True here, because "cherry1" is not in fruits

# transform every element of a list
my_string = "blah, lots  ,  of ,  spaces, here "
result = [x.strip() for x in my_string.split(',')]
result # each piece with its leading and trailing white space removed

# sort and remove duplicated values in a list
words = list(np.unique(np.sort(words)))

"""
work with file
"""
# write a dataframe to a csv file
text_df.to_csv("/Users/mac/Desktop/NZ/UOA/COMPSCI_361/assignments/assignment_05/test.csv", index=False) # index=False: do not write the row index
text_df1 = pd.read_csv("./trg_copy.csv") # read a csv file

# find the 100 most frequent words
occurenceList = Counter(words)
most_1000_frequency = occurenceList.most_common(100)

"""
work with dictionary
"""
car = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

x = car.items() # a view of the dictionary's (key, value) tuples

"""
work with list
"""

# using map: build a ["<name>=0", "<name>=1"] pair for every name
arr = ["zclass_A", "zclass_B", "zclass_E", "zclass_V"]
l = list(map(lambda x: [x + "=0", x + "=1"], arr))

# flatten the list of pairs into one list
flat_list = [item for sublist in l for item in sublist]
flat_list

"""
work with panda
"""
print(text_df.shape) # dimensions of the dataframe: (rows, columns)
print(text_df.loc[0]) # Series for the row with index label 0
print(text_df.loc["a"]) # Series for the row with index label "a"
print(text_df.loc[0].index.tolist()) # column names, taken from the index of a row Series
print(text_df["aa"]) # Series for the column named "aa"
text_df["aa"][0] = 1 # assign to column "aa", row 0. NOTE(review): this is chained indexing; pandas documents .loc[row, column] = value as the reliable form
df_1 = pd.DataFrame(result , columns = columnsNameList, index=indexList) # create a dataframe from result with the given column names and row labels

# drop columns or rows from a dataframe: axis=1 drops by column, axis=0 drops by row
training_df.drop(classNameList, axis=1)

# filter a dataframe with a condition on its columns
#filterElm = (training_df["zclass_A"] == 1) & (training_df["HERE"] == 0)
filterElm = (training_df["zclass_B"] == 1)
filter_df = training_df[filterElm]

# calculate the sum of all the data in a dataframe
a = model.sum().sum()
a = model.sum() # Series with the sum of each column

np.zeros((3, 2), dtype = int) # 3 rows by 2 columns array filled with 0