<a href="https://colab.research.google.com/github/nasa-petal/data-collection-and-prep/blob/main/SVM_with_golden.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:

np.random.seed(500)

import json

with open("golden.json", "r") as read_file:
    papers = json.load(read_file)

In [None]:
Corpus = pd.DataFrame(columns = ['text', 'label'])
print(Corpus)

Empty DataFrame
Columns: [text, label]
Index: []


In [None]:
for paper in papers: 
  Corpus.loc[len(Corpus.index)] = [paper['abstract'], paper['isBiomimicry']]

In [None]:
Corpus['label'][0]

'Y'

In [None]:
papers[0]

{'paper': 'W2103410568',
 'mesh_terms': ['Anura',
  'Nesting Behavior',
  'Animals',
  'Anura',
  'Female',
  'Larva',
  'Larva',
  'Lung',
  'Lung',
  'Male',
  'Nesting Behavior',
  'Ovum',
  'Ovum',
  'Viscoelastic Substances'],
 'venue_ids': ['V153317304'],
 'venue_names': ['Europe PMC',
  'Biology Letters',
  "Weird Nature: An Astonishing Exploration of Nature's Strangest Behavior",
  'Proceedings of The Royal Society B: Biological Sciences'],
 'author_ids': ['A2346835213', 'A2098042950'],
 'author_names': ['Laura Dalgetty', 'Malcolm W. Kennedy'],
 'reference_ids': ['W2159311519',
  'W2038086748',
  'W2130285640',
  'W2271809641',
  'W2047714064',
  'W1794681095',
  'W2054319467',
  'W1991801038',
  'W2146144564',
  'W2083811528',
  'W2066345165',
  'W2101222182'],
 'title': 'Building a home from foam—túngara frog foam nest architecture and three-phase construction process',
 'abstract': "frogs that build foam nests floating on water face the problems of over-dispersion of the sec

In [None]:
Corpus['text'][0]

"frogs that build foam nests floating on water face the problems of over-dispersion of the secretions used and eggs being dangerously exposed at the foam : air interface. nest construction behaviour of tungara frogs, engystomops pustulosus, has features that may circumvent these problems. pairs build nests in periodic bursts of foam production and egg deposition, three discrete phases being discernible. the first is characterized by a bubble raft without egg deposition and an approximately linear increase in duration of mixing events with time. this phase may reduce initial over-dispersion of foam precursor materials until a critical concentration is achieved. the main building phase is marked by mixing events and start-to-start intervals being nearly constant in duration. during the final phase, mixing events do not change in duration but intervals between them increase in an exponential-like fashion. pairs joining a colonial nesting abbreviate their initial phase, presumably by explo

In [None]:
# Step - a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    Corpus.loc[index,'text_final'] = str(Final_words)

In [10]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)

In [11]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [12]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
 
print(Tfidf_vect.vocabulary_)
print(Train_X_Tfidf)


{'frog': 1878, 'build': 591, 'foam': 1819, 'nest': 2987, 'water': 4924, 'face': 1700, 'problem': 3523, 'secretion': 4004, 'use': 4811, 'expose': 1671, 'air': 152, 'interface': 2369, 'construction': 961, 'behaviour': 466, 'feature': 1735, 'may': 2706, 'pair': 3213, 'periodic': 3295, 'burst': 598, 'production': 3531, 'egg': 1448, 'deposition': 1204, 'three': 4566, 'discrete': 1296, 'phase': 3323, 'first': 1781, 'characterize': 709, 'bubble': 584, 'without': 4967, 'approximately': 300, 'linear': 2576, 'increase': 2251, 'duration': 1396, 'mix': 2823, 'event': 1596, 'time': 4587, 'reduce': 3722, 'initial': 2310, 'precursor': 3465, 'material': 2693, 'critical': 1062, 'concentration': 901, 'achieve': 56, 'main': 2647, 'building': 592, 'mark': 2679, 'interval': 2387, 'nearly': 2967, 'constant': 954, 'final': 1770, 'change': 702, 'fashion': 1716, 'join': 2449, 'presumably': 3497, 'exploit': 1663, 'thereby': 4543, 'energy': 1524, 'expenditure': 1651, 'predator': 3467, 'finally': 1771, 'deposit':

In [13]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  88.84758364312268


In [14]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  91.17945251774248
