# Abstract

# Preface

## Aim

To create a Bayesian learning model that predicts the class of a scientific abstract (Archaea, Bacteria, Eucaryota or Virus), based on the proteins present, from the words in the abstract.

## Method



## Results


## Conclusions

# Code

## Get all required imports

In [72]:
# pandas loads and holds the training/test tables.
import pandas as pd
from pandas import DataFrame
import nltk

# Download the NLTK resources before the stop-word list and tokenizer are used.
nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

# English stop words held in a set; tokens are tested for membership against it.
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liyor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liyor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Data

In [73]:
# Read the training (absTrainData) and test (absTestData) tables and drop
# the "id" column from each.
absTrainData = pd.read_csv("data/trg.csv").drop(columns="id")
absTestData = pd.read_csv("data/tst.csv").drop(columns="id")

# Single-letter class code -> full class name.
classDic = dict(A="Archaea", B="Bacteria", E="Eucaryota", V="Virus")

# The class codes on their own, in dictionary order.
classKeys = [*classDic]

# Print every loaded variable to confirm it looks as expected.
for loadedValue in (absTrainData, absTestData, classDic, classKeys):
    print(loadedValue)


     class                                           abstract
0        B  the 4 202 353 bp genome of the alkaliphilic ba...
1        A  the complete 1751377-bp sequence of the genome...
2        E  in 1992 we started assembling an ordered libra...
3        E  the aim of this study is to measure human mito...
4        B  the amino acid sequence of the spirulina maxim...
...    ...                                                ...
3995     E  we have isolated and characterized two diureti...
3996     E  myotonias are muscle diseases in which the fun...
3997     E  cysteine synthase o-acetylserine sulfhydrylase...
3998     E  a region of 25 nucleotides is highly conserved...
3999     B  thermoanaerobacter tengcongensis is a rod-shap...

[4000 rows x 2 columns]
                                              abstract
0    in a previous work all three components of com...
1    we compared morphology of two geographically c...
2    factor xiii mr 320000 is a blood coagulation f...
3    we rep

## Break Down abstract 

Break down each abstract into separate words so we can analyse the frequency of words. First we will find all the words present.

I will use the NLTK library to do this. From NLTK I will use the tokenizing function (to break the text down into individual words), remove stop words using the library's stop-word list, and finally count the occurrences of each word using the FreqDist class.

In [86]:
# Collect every training abstract, each one preceded by a newline, and glue
# the pieces together into one large text.
abstractChunks = []
for index, abstract in enumerate(absTrainData["abstract"]):
    abstractChunks.append("\n" + abstract)
allWords = "".join(abstractChunks)

# Split the combined text into tokens.
allWordsTokens = word_tokenize(allWords)

# Keep only the tokens whose case-folded form is not a stop word, and report
# how many tokens are left.
filteredWords = [
    token for token in allWordsTokens
    if token.casefold() not in stop_words
]
print(len(filteredWords))

# Number of distinct tokens that survived the filter.
uniqueWords = set(filteredWords)
print(len(uniqueWords))

# Count occurrences per token; print the number of distinct tokens and then
# every individual count, one per line.
frequncyDistribution = FreqDist(filteredWords)
print(len(frequncyDistribution))
for occurrences in frequncyDistribution.values():
    print(occurrences)

452111
31997
31997
262
15
15
975
3729
32
411
321
42
63
1187
17
1042
3496
873
2431
72
12
20
421
34
13
83
872
228
743
14
112
546
36
165
548
310
35
2519
1384
244
12
60
543
284
286
799
24
1211
14
34
6401
308
24
56
528
272
503
161
5098
351
1250
79
483
61
928
94
106
57
15
14
12
32
59
229
27
15
24
1186
12
52
19
35
166
89
356
69
33
876
194
412
361
956
23
170
102
44
87
1129
17
5185
86
68
23
74
20
1009
87
107
545
164
17
851
897
473
574
1645
198
483
266
21
84
265
705
425
345
221
20
87
17
792
18
115
100
759
134
197
167
17
17
17
70
17
417
78
850
416
251
851
612
122
42
125
550
17
255
248
106
2589
36
42
95
190
548
212
221
58
49
395
88
575
359
19
112
22
135
17
275
126
765
44
392
157
51
25
237
365
135
79
338
113
1993
510
233
644
371
351
158
596
328
113
315
355
72
28
17
95
324
119
17
57
32
42
133
336
184
1349
17
534
18
1141
19
491
24
280
541
1028
88
41
17
60
127
122
51
64
18
649
303
36
429
725
340
540
428
375
251
18
691
19
105
202
110
80
313
39
136
103
273
227
29
18
168
30
17
281
29
150
369
54
268
1166


## Creating the functions

In [91]:
class BayesianModel:
    """Word-count based Bayesian classifier for labelled abstracts (work in progress).

    The training data must provide a ``"class"`` column holding the label of
    each row and an ``"abstract"`` column holding its text.

    Attributes:
        class_variables: The class labels the model knows about.
        training_data: The DataFrame the statistics are computed from.
        posterior_probability: Maps each class label to the fraction of
            training rows carrying that label. NOTE: despite the name, this
            is the relative frequency of each class in the training data.
        word_frequncy: Maps each class label to a ``{word: count}`` dict of
            the non-stop-word tokens found in that class's abstracts.
        needLaplaceSmoothing: Flag initialised to ``False``; nothing in this
            class currently sets it.
    """

    def __init__(self, training_data: DataFrame, class_variables,
                 tokenizer=None, ignored_words=None) -> None:
        """Store the inputs and create empty per-class statistics.

        Args:
            training_data: DataFrame with ``"class"`` and ``"abstract"``
                columns.
            class_variables: Iterable of class labels. It is iterated again
                by the other methods, so it must be re-iterable (e.g. a list).
            tokenizer: Optional callable turning a string into a sequence of
                tokens. When omitted, the module-level ``word_tokenize`` is
                used.
            ignored_words: Optional container of words to discard (compared
                against the case-folded token). When omitted, the
                module-level ``stop_words`` is used.
        """
        self.class_variables = class_variables
        self.training_data = training_data
        self.posterior_probability = {}
        self.word_frequncy = {}
        for cls in class_variables:
            self.posterior_probability[cls] = 0
            self.word_frequncy[cls] = {}
        self.needLaplaceSmoothing = False
        self._tokenizer = tokenizer
        self._ignored_words = ignored_words

    def _filtered_tokens(self, cls):
        """Return the non-ignored tokens of every abstract labelled ``cls``."""
        data = self.training_data
        # Select rows with a boolean mask instead of positional integer
        # look-ups, so DataFrames without a default 0..n-1 index work too.
        abstracts = data.loc[data["class"] == cls, "abstract"]
        # Each abstract is preceded by a newline, then all are concatenated.
        text = "".join("\n" + abstract for abstract in abstracts)
        # The defaults are resolved here, at call time, so constructing a
        # model never requires the module-level names to exist.
        tokenize = word_tokenize if self._tokenizer is None else self._tokenizer
        ignored = stop_words if self._ignored_words is None else self._ignored_words
        return [token for token in tokenize(text) if token.casefold() not in ignored]

    def find_all_words(self):
        """Count word occurrences per class, store them and print them.

        For every class label, ``word_frequncy[label]`` is replaced by a
        freshly computed ``{word: count}`` dict (words in first-seen order),
        which is then printed.
        """
        from collections import Counter

        for cls in self.class_variables:
            self.word_frequncy[cls] = dict(Counter(self._filtered_tokens(cls)))
            print(self.word_frequncy[cls])

    def need_laplace(self, uniqueWords):
        """Tell whether some class is missing part of the vocabulary.

        Args:
            uniqueWords: Collection of the distinct words of the whole
                corpus; only its length is used.

        Returns:
            ``True`` as soon as one class has a number of distinct words
            different from ``len(uniqueWords)``, ``False`` otherwise.
        """
        vocabulary_size = len(uniqueWords)
        return any(
            len(set(self._filtered_tokens(cls))) != vocabulary_size
            for cls in self.class_variables
        )

    def test_model(self, test_data) -> list:
        """Placeholder: prediction is not implemented yet.

        Args:
            test_data: Currently unused.

        Returns:
            Always an empty list.
        """
        results = []
        return results

    def calculate_posterior_probabilities(self):
        """Compute, store and print the share of training rows per class.

        Fills ``posterior_probability[label]`` with
        ``rows labelled label / total rows`` for every class label.

        Raises:
            ZeroDivisionError: If the training data has no rows.
        """
        data = self.training_data
        total = len(data)
        labels = data["class"]
        for cls in self.class_variables:
            # int() keeps the stored value a plain Python float.
            self.posterior_probability[cls] = int((labels == cls).sum()) / total
        print(self.posterior_probability)

    def train(self):
        """Placeholder: training is not implemented yet; does nothing."""
        pass

# Build the model from the training table and the class codes.
BM = BayesianModel(training_data=absTrainData, class_variables=classKeys)

# First print the per-class probabilities, then the per-class word counts.
BM.calculate_posterior_probabilities()
BM.find_all_words()




{'A': 0.032, 'B': 0.4005, 'E': 0.536, 'V': 0.0315}
{'complete': 93, '1751377-bp': 17, 'sequence': 209, 'genome': 283, 'thermophilic': 23, 'archaeon': 64, 'methanobacterium': 22, 'thermoautotrophicum': 74, 'deltah': 20, 'determined': 65, 'whole-genome': 30, 'shotgun': 26, 'sequencing': 43, 'approach': 23, 'total': 60, '1855': 17, 'open': 72, 'reading': 73, 'frames': 65, 'orfs': 198, 'identified': 100, 'appear': 23, 'encode': 60, 'polypeptides': 103, '844': 17, '46': 28, 'assigned': 89, 'putative': 69, 'functions': 79, 'based': 54, 'similarities': 25, 'database': 20, 'sequences': 232, '514': 17, '28': 17, 'orf-encoded': 17, 'related': 92, 'unknown': 35, '496': 17, '27': 19, 'little': 34, 'homology': 20, 'public': 33, 'databases': 63, 'comparisons': 45, 'eucarya-': 17, 'bacteria-': 17, 'archaea-specific': 17, 'reveal': 17, '1013': 17, 'gene': 234, 'products': 58, '54': 17, 'similar': 105, 'polypeptide': 19, 'described': 19, 'previously': 28, 'organisms': 55, 'domain': 37, 'archaea': 81, '