In [1]:
import os

import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words as a set for O(1) membership tests.
# On a fresh environment the NLTK "stopwords" corpus may not be installed yet,
# in which case the lookup raises LookupError; fetch it once and retry so the
# notebook survives Restart & Run All.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

In [2]:
# Build one corpus string (`text`) from the extracted text of every page of
# every source PDF listed below.
fileNames = ['Automated Bird Species Identification using Audio Signal Processing and Neural Networks',
             'BIRD SOUNDS CLASSIFICATION BY LARGE SCALE ACOUSTIC FEATURES AND EXTREME MACHINE LEARNING',
             'Identification Vehicle Movement Detection in Forest Area using MFCC and KNN',
             'WAVELET TRANSFORM DIGITAL SOUND PROCESSING TO IDENTIFY WILD BIRD SPECIES']

# os.path.join picks the platform's separator; the previous hard-coded
# backslashes only resolved on Windows.
pdf_dir = os.path.join('Dataset', 'PlagarisgmPDFFiles')

# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 names (newer PyPDF2 releases renamed them); they are kept so
# the cell keeps working with the version this notebook was run on — confirm.
page_texts = []
for name in fileNames:
    # `with` closes the handle even if PyPDF2 raises while parsing a page.
    with open(os.path.join(pdf_dir, name + '.pdf'), 'rb') as fileObj:
        pdf = PyPDF2.PdfFileReader(fileObj)
        for i in range(pdf.getNumPages()):
            page_texts.append(pdf.getPage(i).extractText())

# Single join instead of re-building the string on every page. The leading ''
# keeps the result identical to the old accumulation, where every page
# (including the first) was preceded by one space.
text = ' '.join([''] + page_texts)

In [3]:
# Split the corpus into sentences, then strip any trailing CR/LF characters
# from each sentence.
tokenized_sentences = nltk.tokenize.sent_tokenize(text)
sentences = [raw_sentence.rstrip('\r\n') for raw_sentence in tokenized_sentences]

print(f"Total number of sentences in the corpus: {len(sentences)}")

Total number of sentences in the corpus: 352


In [4]:
# Flatten the per-sentence word tokens into one list covering the whole corpus.
tokenized_words = []
for sent in sentences:
    tokenized_words.extend(nltk.word_tokenize(sent))

print(f"Total number of tokens in the corpus: {len(tokenized_words)}")

Total number of tokens in the corpus: 8222


In [5]:
# Drop English stop words. NLTK's stop-word list is all lowercase, so compare
# on the lowercased token; otherwise capitalised forms such as 'In' or 'The'
# slip through. The surviving tokens keep their original casing.
filtered_tokens = [w for w in tokenized_words if w.lower() not in stop_words]

In [6]:
## stemming the corpus
# Reduce every token to its Lancaster stem.
# NOTE(review): this runs on `tokenized_words`, not the stop-word-filtered
# `filtered_tokens` — confirm that is intended.
stem = LancasterStemmer()
stem_words = [stem.stem(token) for token in tokenized_words]

print(stem_words)

['autom', 'bird', 'specy', 'id', 'us', 'audio', 'sign', 'process', 'and', 'neur', 'network', 'chandu', 'b', ',', 'akash', 'munikot', ',', 'karthik', 's', 'murthy', ',', 'ganesh', 'murthy', 'v', ',', 'chaitr', 'nagaras', 'bnm', 'institut', 'of', 'technolog', ',', 'electron', 'and', 'commun', 'depart', 'bang', '-', '560070', ',', 'ind', 'chandravardhanb', '@', 'gmail.com', ',', 'akashmunikot', '@', 'gmail.com', 'karthikksm97', '@', 'gmail.com', ',', 'ganeshmurthy2027', '@', 'gmail.com', 'chaitranagras', '@', 'gmail.com', 'abstract', '-', 'in', 'thi', 'pap', ',', 'an', 'au', 'tom', 'bird', 'specy', 'recognit', 'system', 'has', 'been', 'develop', 'and', 'method', 'for', 'their', 'id', 'has', 'been', 'investig', '.', 'autom', 'id', 'of', 'bird', 'sound', 'without', 'phys', 'interv', 'has', 'been', 'a', 'formid', 'and', 'on', 'endeav', 'for', 'sign', 'research', 'on', 'the', 'taxonom', 'and', 'vary', 'oth', 'sub', 'field', 'of', 'ornitholog', '.', 'in', 'thi', 'pap', ',', 'a', 'two-stage', '

In [7]:
# Lemmatize every token with WordNet, using the lemmatizer's default settings
# (no part-of-speech argument is passed).
# NOTE(review): this runs on `tokenized_words`, not the stop-word-filtered
# `filtered_tokens` — confirm that is intended.
lemmatizer = WordNetLemmatizer()
lemma_words = [lemmatizer.lemmatize(token) for token in tokenized_words]

print(lemma_words)

['Automated', 'Bird', 'Species', 'Identification', 'using', 'Audio', 'Signal', 'Processing', 'and', 'Neural', 'Networks', 'Chandu', 'B', ',', 'Akash', 'Munikoti', ',', 'Karthik', 'S', 'Murthy', ',', 'Ganesh', 'Murthy', 'V', ',', 'Chaitra', 'Nagaraj', 'BNM', 'Institute', 'of', 'Technology', ',', 'Electronics', 'and', 'Communication', 'Department', 'Bangalore', '-', '560070', ',', 'India', 'chandravardhanb', '@', 'gmail.com', ',', 'akashmunikoti', '@', 'gmail.com', 'karthikksm97', '@', 'gmail.com', ',', 'ganeshmurthy2027', '@', 'gmail.com', 'chaitranagraj', '@', 'gmail.com', 'Abstract', '-', 'In', 'this', 'paper', ',', 'an', 'au', 'tomatic', 'bird', 'specie', 'recognition', 'system', 'ha', 'been', 'developed', 'and', 'method', 'for', 'their', 'identification', 'ha', 'been', 'investigated', '.', 'Automatic', 'identification', 'of', 'bird', 'sound', 'without', 'physical', 'intervention', 'ha', 'been', 'a', 'formidable', 'and', 'onerous', 'endeavor', 'for', 'significant', 'research', 'on', 

In [8]:
# Fit the TF-IDF vectorizer on the lemmatized tokens so it learns the corpus
# vocabulary; each token is passed in as its own one-word "document".
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# fit() returns the fitted vectorizer itself, so both names refer to one object.
vect_tfid = tfidf_vectorizer.fit(lemma_words)

In [9]:
# Print every learned vocabulary term with its feature index, ordered by term.
vocab_tfid = vect_tfid.vocabulary_
for term, index in sorted(vocab_tfid.items()):
    print(f"{term}:{index}")

00:0
000:1
0040505:2
03:3
04:4
0480:5
05:6
05khz:7
08:8
09:9
10:10
100:11
101:12
102114:13
103:14
1069:15
1080:16
1082:17
1090:18
11:19
1109:20
1157:21
1163:22
117:23
1182:24
12:25
120:26
122:27
123:28
1231m:29
124:30
128:31
1285:32
128kbps:33
128x:34
12max:35
13:36
1302:37
1318authorized:38
1319authorized:39
1320authorized:40
1321authorized:41
137:42
13th:43
14:44
141:45
1459:46
1462:47
15:48
1541:49
1550:50
158:51
1594:52
159iii:53
16:54
160:55
161:56
162:57
163:58
164:59
17:60
18:61
19:62
1997:63
1998:64
1abstractšaudio:65
1khzandtheothersareat22:66
1mitikachaudhary396:67
1to1:68
1w:69
20:70
2002:71
2003:72
2004:73
2006:74
2007:75
2008:76
2009:77
201:78
2010:79
2011:80
2012:81
2013:82
2014:83
2015:84
2016:85
2020:86
203:87
206:88
2093:89
21:90
2185:91
2196:92
22:93
221:94
2252:95
2263:96
227:97
2277:98
23:99
24:100
25:101
26:102
27:103
2740:104
2748:105
2763:106
2766:107
28:108
29:109
2974:110
2984:111
2machinelearninggroup:112
2seconds:113
2vinayvaish:114
30:115
300:116
3089:117
31

thewholenumberofattributesperfeature:1939
theworkby:1940
theworksabovearemainlyfocusedonalimitedscale:1941
they:1942
theycompareddifferentclassiﬁers:1943
theyintroducedamethodbased:1944
theyutilisedbackpropagationand:1945
thezero:1946
theﬁeldnoiseand:1947
this:1948
thisfastandaccuratemethod:1949
thisisespeciallyimportantinourcasewheretheclass:1950
thisisnotabigdatabasefor:1951
thisisthestandardmeasureoftheinterspeech:1952
thiskindofstatistic:1953
thismethodwasnotsuitable:1954
thispaper:1955
thisworkissupportedbychinascholarshipcouncil:1956
thisyear:1957
those:1958
thousand:1959
threat:1960
threatened:1961
three:1962
threshold:1963
thresholding:1964
through:1965
throughout:1966
thus:1967
thusallowingit:1968
tiger:1969
tilsenandk:1970
timber:1971
time:1972
timely:1973
timewarpingandhiddenmarkovmodels:1974
tionandsegmentationofsyllables:1975
tionforbirdsongphraseclassiﬁcationusinglimitedtraining:1976
tionsonsignalprocessing:1977
tmu:1978
to:1979
to83:1980
to93:1981
to95:1982
toasupervised