# Abstract

# Preface

## Aim

To create a Bayes Learning model to predict the scientific abstract based on proteins present

## Method



## Results


## Conclusions

# Code

## Get all required imports

In [3]:
import pandas as pd
from pandas import DataFrame
import nltk
from math import log

nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\liyor\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\liyor\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Load Data

In [4]:
# abstractTrainingData (absTrainData)
absTrainData = pd.read_csv("data/trg.csv")
absTestData  = pd.read_csv("data/tst.csv")

absTrainData = absTrainData.drop(columns="id")
absTestData  = absTestData.drop(columns="id")


classDic = {"A": "Archaea", "B": "Bacteria", "E": "Eucaryota", "V": "Virus"}

classKeys = list(classDic)


# Check variables to see if its as expected
print(absTrainData)
print(absTestData)
print(classDic)
print(classKeys)


     class                                           abstract
0        B  the 4 202 353 bp genome of the alkaliphilic ba...
1        A  the complete 1751377-bp sequence of the genome...
2        E  in 1992 we started assembling an ordered libra...
3        E  the aim of this study is to measure human mito...
4        B  the amino acid sequence of the spirulina maxim...
...    ...                                                ...
3995     E  we have isolated and characterized two diureti...
3996     E  myotonias are muscle diseases in which the fun...
3997     E  cysteine synthase o-acetylserine sulfhydrylase...
3998     E  a region of 25 nucleotides is highly conserved...
3999     B  thermoanaerobacter tengcongensis is a rod-shap...

[4000 rows x 2 columns]
                                              abstract
0    in a previous work all three components of com...
1    we compared morphology of two geographically c...
2    factor xiii mr 320000 is a blood coagulation f...
3    we rep

## Break Down abstract 

Break Down abstract into seperate words so we can analyse the frequency of words. First we will find all the words present.

I will use the NLTK library to do this. From the NLTK library I will be using the tokenizing funtion (break down each word) and removing stopping words using the list of words from the library and finally counting the number of words using the FreqDist Class.

In [5]:
allWords = ""

for index in range(len(absTrainData)):
  abstract = absTrainData["abstract"][index]
  # print(type(abstract))
  allWords += "\n" + abstract



# print(allWords)

allWordsTokens = word_tokenize(allWords)

# print("Tokens before filter: ")

# print(*allWordsTokens, sep= "\n")

filteredWords = [word for word in allWordsTokens if word.casefold() not in stop_words]
print(len(filteredWords))

# print("Tokens after filter: ")
# print(*filteredWords, sep= "\n")

uniqueWords = set(filteredWords)
print(len(uniqueWords))

frequncyDistribution = FreqDist(filteredWords)
print(len(frequncyDistribution))
# for word in frequncyDistribution.items():
#   print(word[1])

452111
31997
31997


## Creating the functions

In [12]:



class BayesianModel:
    

    def __init__(self, training_data: DataFrame, class_variables) -> None:
        self.class_variables = class_variables
        self.training_data = training_data

        self.posterior_probability = {}
        self.word_frequncy = {}
        self.class_totals = {}
        for cls in class_variables:
            self.posterior_probability[cls] = 0
            self.word_frequncy[cls] = {}
            self.class_totals[cls] = 0
        self.needLaplaceSmoothing = False
        self.uniqueWords = set()
        self.calculate_posterior_probabilities()
        self.count_words()


    def count_words(self):
        data = self.training_data
        
        for cls in self.class_variables:
            wordsInClass = ""
            for index in range(len(data)):
                if cls == data["class"][index]:
                    abstract = data["abstract"][index]
                    wordsInClass += "\n" + abstract
            tokensInClass = word_tokenize(wordsInClass)
            filteredInClass = [word for word in tokensInClass if word.casefold() not in stop_words]
            totalTokensInClass = len(filteredInClass)
            frequncyInClass = FreqDist(filteredInClass)
            for word in frequncyInClass.items():
                self.uniqueWords.add(word)
                self.word_frequncy[cls][word[0]] = word[1] + 1
                self.class_totals[cls] = totalTokensInClass
            print(self.word_frequncy[cls])


        # data = self.training_data
        # allWords = ""
        # for index in range(len(data)):
        #     abstract = data["abstract"][index]
        #     allWords += "\n" + abstract
        
        # allTokens = word_tokenize(allWords)
        # filteredWords = [word for word in allTokens if word.casefold() not in stop_words]

        # uniqueWords = set(filteredWords)


        # self.needLaplaceSmoothing = self.need_laplace(uniqueWords)

        # wordFrequncy = FreqDist(filteredWords)
        
    def need_laplace(self, uniqueWords):
        data = self.training_data

        for cls in self.class_variables:
            wordsInClass = ""
            for index in range(len(data)):
                if cls == data["class"][index]:
                    abstract = data["abstract"][index]
                    wordsInClass += "\n" + abstract
            tokensInClass = word_tokenize(wordsInClass)
            filteredInClass = [word for word in tokensInClass if word.casefold() not in stop_words]
            uniqueInClass = set(filteredInClass)
            if len(uniqueInClass) != len(uniqueWords):
                return True

        return False

    def test(self, test_data: DataFrame) -> list:
        # result list to return
        results = []

        # go through each abstract in the test_data seperately
        for index in range(len(test_data)):

            # copy the dictionary --> previously took the point instead
            cls_p = self.posterior_probability.copy()

            # get abstract and tokenise and remove stop words
            abstract = test_data["abstract"][index]
            tokens = word_tokenize(abstract)
            filtered = [word for word in tokens if word.casefold() not in stop_words]
            freq_filtered = FreqDist(filtered)

            # get the unique words from the filtered list
            set_of_filtered = set(filtered)

            # use union operation to combine sets to find total new words for laplace smoothing
            all_words = set_of_filtered.union(self.uniqueWords)
            numOfUniqueWords = len(all_words)



            # always do a laplace smoothing as it only makes things better
            for cls in self.class_variables:
                cls_p[cls] = log(cls_p[cls])
                cls_t = self.class_totals.copy()
                cls_t[cls] += (len(cls_p)) * numOfUniqueWords
                word_frequncy:dict = self.word_frequncy[cls].copy()
                for word in set(filtered).difference(set(list(word_frequncy.keys()))):
                        word_frequncy[word] = 1

                # could make this faster with the frequncy dist and then add it that many times instead of looping the entire list
                for w_n in freq_filtered.items():
                    cls_p[cls] += w_n[1] * (log(word_frequncy[w_n[0]]/cls_t[cls]))
                

            max_prob = list(cls_p.keys())[0]
            for cls in list(cls_p.keys()):
                if cls_p[cls] > cls_p[max_prob]:
                    max_prob = cls
            results.append(max_prob)

        return results

    def calculate_posterior_probabilities(self):
        data = self.training_data
        total = len(data)
        for cls in self.class_variables:
            self.posterior_probability[cls] = len(data[data['class'] == cls])/total
        print(self.posterior_probability)

    def train(self):
        pass


## Test with some data so we can validate model

In [10]:
# Train with some data

def single_split_validation(
    data_set: DataFrame,
    training_validation_split=0.5,
):
    training_data = data_set.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=training_validation_split, random_state= 200))
    validation_data = data_set.drop(training_data.index)
    training_data = training_data.reset_index(drop=True)
    validation_data = validation_data.reset_index(drop=True)
    BM = BayesianModel(training_data, classKeys)
    predictions = BM.test(validation_data)
    total = len(validation_data)
    correct = 0
    for i in range(len(validation_data)):
        if predictions[i] == validation_data['class'][i]:
            correct+=1
    print(f'{correct/total}')

# 90/10 split
single_split_validation(absTrainData, training_validation_split= 0.9)

# 70/30 split
single_split_validation(absTrainData, training_validation_split= 0.7)

# 50/50 split
single_split_validation(absTrainData, training_validation_split= 0.5)

  training_data = data_set.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=training_validation_split, random_state= 200))


{'A': 0.03194444444444444, 'B': 0.40055555555555555, 'E': 0.5361111111111111, 'V': 0.03138888888888889}
{'complete': 88, 'sequence': 190, 'genome': 255, 'aerobic': 13, 'hyper-thermophilic': 13, 'crenarchaeon': 13, 'aeropyrum': 7, 'pernix': 7, 'k1': 7, 'optimally': 8, 'grows': 8, '95': 6, 'degrees': 19, 'c': 22, 'determined': 59, 'whole': 31, 'shotgun': 24, 'method': 9, 'modifications': 8, 'entire': 25, 'length': 14, '1669695': 6, 'bp': 20, 'authenticity': 13, 'supported': 13, 'restriction': 14, 'analysis': 32, 'long': 25, 'pcr': 20, 'products': 55, 'directly': 15, 'amplified': 14, 'genomic': 58, 'dna': 116, 'potential': 24, 'protein-coding': 31, 'regions': 35, 'total': 59, '2694': 6, 'open': 64, 'reading': 65, 'frames': 60, 'orfs': 182, 'assigned': 86, 'similarity': 53, 'search': 22, 'public': 31, 'databases': 60, '633': 6, '235': 6, 'related': 85, 'genes': 243, 'putative': 66, 'function': 31, '523': 6, '194': 6, 'sequences': 208, 'registered': 15, 'unknown': 33, 'tca': 8, 'cycle': 24,

  training_data = data_set.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=training_validation_split, random_state= 200))


{'entire': 58, 'genome': 1904, 'bacterium': 290, 'mycoplasma': 55, 'pneumoniae': 135, 'm129': 5, 'sequenced': 401, 'size': 131, '816394': 5, 'base': 245, 'pairs': 236, 'average': 37, 'gc': 106, 'content': 181, '400': 7, 'mol': 17, 'predict': 20, '677': 5, 'open': 269, 'reading': 281, 'frames': 187, 'orfs': 173, '39': 26, 'genes': 2564, 'coding': 256, 'various': 94, 'rna': 64, 'species': 344, 'predicted': 367, '759': 6, 'showed': 232, 'significant': 162, 'similarity': 278, 'genesproteins': 11, 'organisms': 105, '99': 15, 'reveal': 13, 'gene': 1396, 'sequences': 552, 'databases': 31, 'permitted': 6, 'us': 20, 'tentatively': 7, 'assign': 11, 'functional': 102, 'classification': 10, 'large': 229, 'number': 171, 'deduce': 6, 'biochemical': 20, 'physiological': 15, 'properties': 38, 'reduction': 37, 'reductive': 20, 'evolution': 106, 'ancestral': 27, 'bacteria': 383, 'explained': 7, 'loss': 74, 'complete': 463, 'anabolic': 6, 'eg': 28, 'amino': 566, 'acid': 487, 'synthesis': 104, 'metabolic'

  training_data = data_set.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=training_validation_split, random_state= 200))


{'entire': 45, 'genome': 1373, 'bacterium': 205, 'mycoplasma': 47, 'pneumoniae': 87, 'm129': 4, 'sequenced': 283, 'size': 94, '816394': 4, 'base': 178, 'pairs': 170, 'average': 26, 'gc': 84, 'content': 133, '400': 5, 'mol': 12, 'predict': 17, '677': 4, 'open': 198, 'reading': 207, 'frames': 141, 'orfs': 129, '39': 17, 'genes': 1806, 'coding': 182, 'various': 65, 'rna': 47, 'species': 249, 'predicted': 276, '759': 4, 'showed': 171, 'significant': 115, 'similarity': 205, 'genesproteins': 8, 'organisms': 71, '99': 10, 'reveal': 9, 'gene': 1036, 'sequences': 395, 'databases': 24, 'permitted': 5, 'us': 11, 'tentatively': 6, 'assign': 8, 'functional': 74, 'classification': 6, 'large': 164, 'number': 127, 'deduce': 5, 'biochemical': 16, 'physiological': 14, 'properties': 28, 'reduction': 28, 'reductive': 13, 'evolution': 80, 'ancestral': 21, 'bacteria': 291, 'explained': 6, 'loss': 49, 'complete': 326, 'anabolic': 4, 'eg': 23, 'amino': 443, 'acid': 376, 'synthesis': 76, 'metabolic': 101, 'pat

## Test with all data once found best hyperparameters

In [13]:
# Train with all data
BM = BayesianModel(absTrainData, classKeys)

predictions = BM.test(absTestData)

results = pd.read_csv("results/946prediction.csv")
total = len(results)
correct =0
for i in range(len(results)):
    if predictions[i] == results['class'][i]:
        correct += 1
print(f"{correct/total}")

file = open("results/predictions.csv", "w")

s = "class,id\n"

id = 1
for p in predictions:
    s+=f"{p},{id}\n"
    id+=1

file.write(s)
file.close()

{'A': 0.032, 'B': 0.4005, 'E': 0.536, 'V': 0.0315}
{'complete': 94, '1751377-bp': 18, 'sequence': 210, 'genome': 284, 'thermophilic': 24, 'archaeon': 65, 'methanobacterium': 23, 'thermoautotrophicum': 75, 'deltah': 21, 'determined': 66, 'whole-genome': 31, 'shotgun': 27, 'sequencing': 44, 'approach': 24, 'total': 61, '1855': 18, 'open': 73, 'reading': 74, 'frames': 66, 'orfs': 199, 'identified': 101, 'appear': 24, 'encode': 61, 'polypeptides': 104, '844': 18, '46': 29, 'assigned': 90, 'putative': 70, 'functions': 80, 'based': 55, 'similarities': 26, 'database': 21, 'sequences': 233, '514': 18, '28': 18, 'orf-encoded': 18, 'related': 93, 'unknown': 36, '496': 18, '27': 20, 'little': 35, 'homology': 21, 'public': 34, 'databases': 64, 'comparisons': 46, 'eucarya-': 18, 'bacteria-': 18, 'archaea-specific': 18, 'reveal': 18, '1013': 18, 'gene': 235, 'products': 59, '54': 18, 'similar': 106, 'polypeptide': 20, 'described': 20, 'previously': 29, 'organisms': 56, 'domain': 38, 'archaea': 82, '