In [1]:
import pandas as pd

text_df = pd.read_csv("./trg.csv")

#define constant
labelAConstant = "zclass_A"
labelBConstant = "zclass_B"
labelEConstant = "zclass_E"
labelVConstant = "zclass_V"

# Display the first 10 rows
text_df.head(10)


# A simple boolean representation of word appearances
#import numpy as np 

# Find all the words
#words = []
#for i in range(0, text_df.shape[0]):
#for i in range(0, 1):    
#    abstract = text_df["abstract"][i]
#    words_abstract = abstract.split(" ") # Split when there is a space
#    words.extend(words_abstract)

    
# Remove duplicate words and sort them alphabetically
#words = list(np.unique(np.sort(words)))
#words = list(np.unique(words))
#words = list(np.sort(np.unique(words)))



Unnamed: 0,id,class,abstract
0,1,B,the 4 202 353 bp genome of the alkaliphilic ba...
1,2,A,the complete 1751377-bp sequence of the genome...
2,3,E,in 1992 we started assembling an ordered libra...
3,4,E,the aim of this study is to measure human mito...
4,5,B,the amino acid sequence of the spirulina maxim...
5,6,B,the genus xanthomonas is a diverse and economi...
6,7,B,the complete nucleotide sequence of the genome...
7,8,B,the complete genome sequence of caulobacter cr...
8,9,V,the complete dna sequence of the a2 strain of ...
9,10,B,the complete genomic sequence of corynebacteri...


In [2]:
from collections import Counter
import numpy as np

"""
Retrieve worlds from abstract. Manipulate words if required
"""
def retrieveWordListFromAbstract(abstract):
    redudancyWords = ["IS", "A", "THE", "OF", "AND", "IN", "TO"]
    words = [x.strip().upper() for x in abstract.split(" ")]
    words = [ x for x in words if x != ""] 
    words = [ x for x in words if not(x in redudancyWords)] 
    
    return words

"""
Create default dictionary data for featureList
return a dictionary with key is feature and value is 0
""" 
def createDefaultDictionaryFromFeatureList(featureList):
    result = {}
    for feature in featureList:
        result[feature] = 0
    return result    

"""
Map abstractData to an dictionary which has key is feature 
and value = 1 if that feature appear in abstractData, otherwise value = 0
"""
#transform each abstract data to attribute list
def transformData(abstractData, featureList):
    words_abstract = retrieveWordListFromAbstract(abstractData)
    unique_words = np.unique(words_abstract)
    
    #create default value for dictionary list
    dictionaryResult = createDefaultDictionaryFromFeatureList(featureList)
    
    # assign the feature which appear in abstract data
    for index in range(0, len(unique_words)):
        word = unique_words[index]
        foundInList = word in featureList
        if foundInList:
            dictionaryResult[word] = 1            
    
    return dictionaryResult;

"""
Create featureList by composing the 1000 most frequency keywords in abstractDataList
"""
def get1000MostFrequencyFeatureFrom(abstractDataList):
    words = []
    for index in range(0, abstractDataList.size):
        abstract = abstractDataList[index]
        words_abstract = retrieveWordListFromAbstract(abstract)
        words.extend(words_abstract)
    occurenceList = Counter(words)
    
    #get 1000 most common objects with their occurences
    #most_1000_frequency = occurenceList.most_common(1000)
    most_1000_frequency = occurenceList.most_common(100)
    
    features = np.empty(len(most_1000_frequency), dtype=object)
    #features = []
    # withraw all the keywords
    for index in range(0, len(most_1000_frequency)):
        features[index] = most_1000_frequency[index][0]
        #features.append(most_1000_frequency[index][0])
    
    #return features
    return features.tolist()

"""
Create featureList by composing all words which appear in abstractList
"""
def retrieveFeatureFrom(abstractDataList):
    words = []
    for index in range(0, abstractDataList.size):
        abstract = abstractDataList[index]
        words_abstract = retrieveWordListFromAbstract(abstract)
        words.extend(words_abstract)

    words = list(np.sort(np.unique(words)))
    
    return words


# Create dataset for training from featureList and abstractList
def buildTrainingDataset(featureList, abstractList):
    dataFrame = {}
    aLength = len(abstractList)
    fLength = len(featureList)
    
    #create default list
    for featureIndex in range(0, fLength):
        featureName = featureList[featureIndex]
        dataFrame[featureName] = []
        #dataFrame[featureName]= np.empty(aLength, dtype=object)
        
    for index in range(0, aLength):
        abstract = abstractList[index]
        dictionaryForAbstract = transformData(abstract, featureList)
        
        for typleFeatureItem in dictionaryForAbstract.items():
            featureName = typleFeatureItem[0]
            featureValue = typleFeatureItem[1]
            
            dataFrame[featureName].append(featureValue)
            
    return dataFrame


def buildTestingDataset(featureList, abstractList):
    dataFrame = {}
    aLength = len(abstractList)
    fLength = len(featureList)
    
    #create default list
    for featureIndex in range(0, fLength):
        featureName = featureList[featureIndex]
        dataFrame[featureName] = []
        #dataFrame[featureName]= np.empty(aLength, dtype=object)
        
    for index in range(0, aLength):
        abstract = abstractList[index]
        dictionaryForAbstract = transformData(abstract, featureList)
        
        for typleFeatureItem in dictionaryForAbstract.items():
            featureName = typleFeatureItem[0]
            featureValue = typleFeatureItem[1]
            
            dataFrame[featureName].append(featureValue)
            
    return dataFrame

def buildClass(classList):
    uniqueClassList = np.unique(classList)
    noClass = len(classList)
    resultValue = {}
    
    #assign default value
    for classValue in uniqueClassList:
        tempList = np.zeros(noClass, dtype = int)
        resultValue["zclass_"+classValue] = tempList.tolist()
        
    #assign the real value for resultValue
    for index in range(0, noClass):
        classValue = classList[index]
        resultValue["zclass_" + classValue][index] = 1
    return resultValue



In [3]:
import pandas as pd
import numpy as np

def countFeatureAndClassByValue(featureName, featureValue, className, classValue, featureDF, classDF):
    nRows = featureDF.shape[1]
    count = 0
    for index in range(0, nRows):
        fValue = featureDF[featureName][index]
        cValue = classDF[className][index]
        if fValue == featureValue and cValue == classValue:
            count = count + 1
    return count


# Find the probability of featureName = featureValue given className = 1 with featuresDF and classDF
# featureValue = 1 or 0
def calculateProbabilityOf(featureName, featureValue, className, classValue, trainDF):
    noClass = np.count_nonzero(trainDF[className])
    condition = (trainDF[featureName] == featureValue) & (trainDF[className] == classValue)
    count = trainDF[condition].shape[0]
    
    return count/noClass;
    

def retrieveClassHasValueAtIndex(index, classDF):
    rows = classDF.iloc[index]
    columnNames = classDF.columns.values
    for className in columnNames:
        if rows.at[className] == 1:
            return className
    return 0
    
# train
def train(featuresDF, classDF):
    #calculate probability of each class
    nTotal = featuresDF.shape[0]

    # calculate
    columnsNameList = classDF.columns.values
    
    
    #calculate indexList
    featureList = featuresDF.columns.values
    indexList = []
    for index in range(0, len(featureList)):
        featureIsZero = featureList[index] + "=0"
        featureIsOne = featureList[index] + "=1"
        indexList.append(featureIsZero)
        indexList.append(featureIsOne)
    indexList.append("total")
    

    #compose the target model
    result = np.zeros((len(indexList), len(columnsNameList)), dtype = int)
    resultDF = pd.DataFrame(result , columns = columnsNameList, index=indexList)
    
    for rowIndex in range(0, nTotal):
        #calculate value for feature 
        for featureName in featureList:
            featureValue = featuresDF[featureName][rowIndex]
            rowName = featureName + "=" + str(featureValue)
            activeClassAtIndex = retrieveClassHasValueAtIndex(rowIndex, classDF)
            resultDF[activeClassAtIndex][rowName] += 1 
            
    #calculate total for class
    for className in columnsNameList:
        count = training_df[(training_df[className] == 1)].shape[0]
        resultDF[className]["total"] = count
    
    return resultDF

def test(testDF):
    return 0



In [4]:
import pandas as pd

training_df = pd.read_csv("./data/test_data.csv", )
training_df.shape

cross_validation_training_feature_df = training_df.drop([labelAConstant, labelBConstant, labelEConstant, labelVConstant], axis=1)
cross_validation_traing_class_df = training_df[[labelAConstant, labelBConstant, labelEConstant, labelVConstant]]
#cross_validation_test_df[labelAConstant]
#model = train(training_df, [labelAConstant, labelBConstant, labelEConstant, labelVConstant])
#countFeatureAndClassByValue("treatment", 1, "zclass_A", 1, cross_validation_training_feature_df, cross_validation_traing_class_df)
#condition = (training_df["treatment"] == 1) & (training_df["zclass_A"] == 1)
#value = training_df[condition]
#value.shape

#classM = retrieveClassHasValueAtIndex(1, cross_validation_traing_class_df)
#classM

model = train(cross_validation_training_feature_df, cross_validation_traing_class_df)
model


Unnamed: 0,zclass_A,zclass_B,zclass_E,zclass_V
HOMOLOGY=0,1,12,12,1
HOMOLOGY=1,1,2,0,0
HERE=0,2,12,10,1
HERE=1,0,2,2,0
USING=0,1,14,10,1
USING=1,1,0,2,0
total,2,14,12,1


In [507]:
#filterElm = (training_df["zclass_A"] == 1) & (training_df["HERE"] == 0)
filterElm = (training_df["zclass_B"] == 1)
filter_df = training_df[filterElm]
print("filter_df")
print(filter_df)
print("shape")
filter_df.shape

filter_df
    HOMOLOGY  HERE  USING  zclass_A  zclass_B  zclass_E  zclass_V
0          0     0      0         0         1         0         0
4          0     0      0         0         1         0         0
5          0     1      0         0         1         0         0
6          0     0      0         0         1         0         0
7          0     0      0         0         1         0         0
9          0     0      0         0         1         0         0
11         0     0      0         0         1         0         0
12         0     0      0         0         1         0         0
14         1     0      0         0         1         0         0
15         0     1      0         0         1         0         0
16         1     0      0         0         1         0         0
23         0     0      0         0         1         0         0
25         0     0      0         0         1         0         0
26         0     0      0         0         1         0         0


(14, 7)

In [334]:
#test = buildClass(text_df["class"])
#tf_test = pd.DataFrame(test)
#tf_test.to_csv("./class_value.csv", index=False)

In [471]:
#featureTest = ["a1", "a2", "a3"]
#defaultValues = createDefaultDictionaryFromFeatureList(featureTest)
#defaultValues

featureList = get1000MostFrequencyFeatureFrom(text_df["abstract"])
#featureList = retrieveFeatureFrom(text_df["abstract"])
#print("featureList", featureList)

featureDataSet = buildTrainingDataset(featureList, text_df["abstract"])
classDataSet = buildClass(text_df["class"])
trainingDataSet = {**featureDataSet, **classDataSet}
df = pd.DataFrame(trainingDataSet)

testData = pd.read_csv("./tst.csv")
testDataSet = buildTestingDataset(featureList, testData["abstract"])
tf = pd.DataFrame(testDataSet)

df.to_csv("./data/training_100_feature_data.csv", index=False)
tf.to_csv("./data/test_100_feature_data.csv", index=False)

#df
#df.to_csv("./training_data.csv", index=False)
#print("test", test)

#transform data for abstract data
#dictionaryList = transformData(text_df["abstract"][0], featureList)
#dictionaryList
#df = pd.DataFrame(dictionaryList)

#test = retrieveFeatureFrom(text_df["abstract"])
#len(test)

In [469]:
#training_df = pd.read_csv("./training_data.csv")
#training_df[0]
#df["zclass_B"]
df["zclass_A"]

0       0
1       1
2       0
3       0
4       0
       ..
3995    0
3996    0
3997    0
3998    0
3999    0
Name: zclass_A, Length: 4000, dtype: int64

In [167]:
training_df[,0]

SyntaxError: invalid syntax (<ipython-input-167-6683d84fb750>, line 1)

In [110]:
import numpy as np
test = np.zeros((2,3), dtype = int)
#features = np.empty(len(most_1000_frequency), dtype=string)
#a = np.array(42)
#b = np.array([1, 2, 3, 4, 5])
#c = np.array([[1, 2, 3], [4, 5, 6]])
#d = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])

#print(len(a))
#print(b.ndim)
#print(c.ndim)
#print(d.ndim)
test[0][0] = 1
print("test[0]", test[0][0])
print("test" , test)

a = np.zeros(10)
type(a)

import numpy as np
np.zeros((2,3), dtype = int)
np.empty(10, dtype= str)


fruits = ['apple', 'banana', 'cherry']

x = "cherry" in fruits
if x:
    print("found")
else:
    print("not found")
type(x)

test[0] 1
test [[1 0 0]
 [0 0 0]]


numpy.ndarray

array(['', '', '', '', '', '', '', '', '', ''], dtype='<U1')

found


bool

In [92]:
car = {
  "brand": "Ford",
  "model": "Mustang",
  "year": 1964
}

x = car.items()

print(x)


dict_items([('brand', 'Ford'), ('model', 'Mustang'), ('year', 1964)])


In [102]:
import pandas as pd

text_df.to_csv("/Users/mac/Desktop/NZ/UOA/COMPSCI_361/assignments/assignment_05/test.csv")

type(text_df["abstract"])
from collections import Counter

In [134]:
my_string = "blah, lots  ,  of ,  spaces, here "
result = [x.strip() for x in my_string.split(',')]
result

['blah', 'lots', 'of', 'spaces', 'here']

In [341]:
import numpy as np 
import pandas as pd

text_df1 = pd.read_csv("./trg_copy.csv")
words = []

for i in range(0, text_df1.shape[0]):
    abstract = text_df1["abstract"][i]
    words_abstract = abstract.split(" ") # Split when there is a space
    words_abstract = [x.strip() for x in words_abstract]
    words_abstract = [ x for x in words_abstract if x != ""]
    
    words.extend(words_abstract)

#words = list(np.unique(np.sort(words)))    
words = list(np.sort(np.unique(words)))
words
#text_df1["abstract"]

["'",
 "'87",
 "'a",
 "'a'",
 "'addiction'",
 "'ancient'",
 "'b'",
 "'bird",
 "'birds",
 "'c3-type'",
 "'captive'",
 "'chaperonins'",
 "'classical'",
 "'core'",
 "'cysteine-rich'",
 "'deadh'",
 "'destabilization",
 "'dinosaur",
 "'ets",
 "'feral'",
 "'heat-shock'",
 "'hitch-hiking'",
 "'housekeeping'",
 "'immune'",
 "'inflammatory",
 "'insert'",
 "'inserted'",
 "'king",
 "'lcp30'",
 "'lego",
 "'lightning-strike",
 "'mdr-like'",
 "'motor",
 "'mr",
 "'neutralized'",
 "'occluding",
 "'open",
 "'open'",
 "'plasticity",
 "'processing'",
 "'pstaire'",
 "'shotgun'",
 "'silent'",
 "'tail'",
 "'tissue-specific'",
 "'transcriptional",
 "'ttp'",
 "'uptake'",
 "'wolf'",
 '-',
 '--',
 '-1',
 '-10',
 '-1055',
 '-11',
 '-116',
 '-124',
 '-126',
 '-130',
 '-13548',
 '-1363',
 '-13859',
 '-14',
 '-142',
 '-1421',
 '-147',
 '-149',
 '-14q31',
 '-15',
 '-1561',
 '-16',
 '-17',
 '-18',
 '-180',
 '-19',
 '-1a2',
 '-1a2-like',
 '-2',
 '-20',
 '-200',
 '-21',
 '-210',
 '-213',
 '-215',
 '-219',
 '-220',
 '-2

In [343]:
text_boolean_df = pd.DataFrame( np.zeros((text_df1.shape[0], len(words) + 4), dtype = int) , columns = words + ["zzA", "zzB", "zzE", "zzV"])
#text_boolean_df["class"] = text_df1["class"]
for index in range(0, 1):
    classValue = text_df1["class"][index]
    if classValue == "A":
        text_boolean_df["zzA"][index] = 1
    elif classValue == "B":
        text_boolean_df["zzB"][index] = 1
    elif classValue == "E":
        text_boolean_df["zzE"][index] = 1
    else:
        text_boolean_df["zzV"][index] = 1
    
#for i in range(0, text_df1.shape[0]):
for i in range(0, 1):    
    abstract = text_df1["abstract"][i]
    words_abstract = abstract.split(" ") # Split when there is a space
    words_abstract = [x.strip() for x in words_abstract]
    words_abstract = [ x for x in words_abstract if x != ""]
    #print("words_abstract ", words_abstract)
    for j in range(0, len(words_abstract)):
        word = words_abstract[j]
        #text_boolean_df[word][i] = 1
        text_boolean_df.loc[i, word] = 1
text_boolean_df.head()
#text_boolean_df.to_csv("./test_123.csv", index=False)

Unnamed: 0,','87,'a,'a','addiction','ancient','b','bird,'birds,'c3-type',...,zymogen,zymogen-type,zymogens,zymogram,zymography,zymomonas,zzA,zzB,zzE,zzV
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [323]:
#text_boolean_df.loc[i, "addiction"] = 1
#text_boolean_df.head()
#text_boolean_df["addiction"][0] = 1
#text_boolean_df.head()
#text_boolean_df["addiction"] = 1
#text_boolean_df["addiction"]
#word = words[4]
#print("word", word)
#text_boolean_df[word][1] = 1
#text_boolean_df.head()

#data123 = np.zeros((2, 3), dtype = int)
words.append("test")
data123 = np.zeros((text_df1.shape[0], len(words)), dtype = int)
df_1 = pd.DataFrame(data123, columns = words)


df_1.loc[0,"'addiction'"] = 1
df_1

#df = pd.DataFrame(data123, columns = words[])
#df["addition"][0] = 1
#df.at[1, "addition"] = 1
#df.head()

Unnamed: 0,','87,'a,'a','addiction','ancient','b','bird,'birds,'c3-type',...,zygomycetes,zygotic,zymogen,zymogen-type,zymogens,zymogram,zymography,zymomonas,test,test.1
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [454]:
fruits = ['apple', 'banana', 'cherry']

found = not ("cherry1" in fruits)
found
#type(found == )

True