In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    "Apple is announcing new iphone tomorrow",
    "Tesla is announcing new model-3 tomorrow",
    "Google is announcing new pixel-6 tomorrow",
    "Microsoft is announcing new surface tomorrow",
    "Amazon is announcing new eco-dot tomorrow",
    "I am eating biryani and you are eating grapes"
]

In [2]:
#let's create the vectorizer and fit the corpus and transform them accordingly
v = TfidfVectorizer()
v.fit(corpus)
transform_output = v.transform(corpus)

In [3]:

#let's print the vocabulary

print(v.vocabulary_)

{'thor': 25, 'eating': 10, 'pizza': 22, 'loki': 17, 'is': 16, 'ironman': 15, 'ate': 7, 'already': 0, 'apple': 5, 'announcing': 4, 'new': 20, 'iphone': 14, 'tomorrow': 26, 'tesla': 24, 'model': 19, 'google': 12, 'pixel': 21, 'microsoft': 18, 'surface': 23, 'amazon': 2, 'eco': 11, 'dot': 9, 'am': 1, 'biryani': 8, 'and': 3, 'you': 27, 'are': 6, 'grapes': 13}


In [4]:
#let's print the idf of each word:

all_feature_names = v.get_feature_names_out()

for word in all_feature_names:
    
    #let's get the index in the vocabulary
    indx = v.vocabulary_.get(word)
    
    #get the score
    idf_score = v.idf_[indx]
    
    print(f"{word} : {idf_score}")

already : 2.386294361119891
am : 2.386294361119891
amazon : 2.386294361119891
and : 2.386294361119891
announcing : 1.2876820724517808
apple : 2.386294361119891
are : 2.386294361119891
ate : 2.386294361119891
biryani : 2.386294361119891
dot : 2.386294361119891
eating : 1.9808292530117262
eco : 2.386294361119891
google : 2.386294361119891
grapes : 2.386294361119891
iphone : 2.386294361119891
ironman : 2.386294361119891
is : 1.1335313926245225
loki : 2.386294361119891
microsoft : 2.386294361119891
model : 2.386294361119891
new : 1.2876820724517808
pixel : 2.386294361119891
pizza : 2.386294361119891
surface : 2.386294361119891
tesla : 2.386294361119891
thor : 2.386294361119891
tomorrow : 1.2876820724517808
you : 2.386294361119891


In [5]:
#let's print the transformed output from tf-idf
print(transform_output.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.         0.40286636 0.
  0.         0.         0.         0.24266547 0.11527033 0.24266547
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.         0.5680354  0.         0.26982671 0.
  0.         0.         0.30652086 0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.26982671 0.
  0.         0.5680354  0.30652086 0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.

In [6]:
import pandas as pd

#read the data into a pandas dataframe
df = pd.read_csv("bert_data_withoutClassWord.csv")
print(df.shape)
df.head(5)

(7115, 3)


Unnamed: 0.1,Unnamed: 0,text,class
0,0,this article is about the herbivorous mammals....,Antelope
1,1,"one new world species, the pronghorn of north ...",Antelope
2,2,"the english word ""animal"" first appeared in 14...",Antelope
3,3,"the word talopus and calopus, from latin, came...",Antelope
4,4,animal are not a cladistic or taxonomically de...,Antelope


In [7]:
#check the distribution of labels 
df['class'].unique()

array(['Antelope', 'grizzly+bear', 'killer+whale', 'beaver', 'dalmatian',
       'persian+cat', 'horse', 'german+shepherd', 'blue+whale',
       'siamese+cat', 'skunk', 'mole', 'tiger', 'hippopotamus', 'leopard',
       'moose', 'spider+monkey', 'humpback+whale', 'elephant', 'gorilla',
       'ox', 'fox', 'sheep', 'seal', 'chimpanzee', 'hamster', 'squirrel',
       'rhinoceros', 'rabbit', 'bat', 'giraffe', 'wolf', 'chihuahua',
       'rat', 'weasel', 'otter', 'buffalo', 'zebra', 'giant+panda',
       'deer', 'bobcat', 'pig', 'lion', 'mouse', 'polar+bear', 'collie',
       'Walrus', 'raccoon', 'cow', 'dolphin'], dtype=object)

In [8]:
#Add the new column which gives a unique number to each of these labels 

j = 0
for i in df['class'].unique():
    df.loc[df['class'] == i, ['class_num']] = j
    j += 1

#checking the results 
df.head(50000)

Unnamed: 0.1,Unnamed: 0,text,class,class_num
0,0,this article is about the herbivorous mammals....,Antelope,0.0
1,1,"one new world species, the pronghorn of north ...",Antelope,0.0
2,2,"the english word ""animal"" first appeared in 14...",Antelope,0.0
3,3,"the word talopus and calopus, from latin, came...",Antelope,0.0
4,4,animal are not a cladistic or taxonomically de...,Antelope,0.0
...,...,...,...,...
7110,7110,technology to use sponges as mouth protection ...,dolphin,49.0
7111,7111,"pesticides, heavy metals, plastics, and other ...",dolphin,49.0
7112,7112,"hundreds of orcas, animals and other members o...",dolphin,49.0
7113,7113,captured orcas and animals are confined to tan...,dolphin,49.0


In [11]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

X_train, X_test, y_train, y_test = train_test_split(
    df.text, 
    df.class_num, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.class_num
)



ValueError: Expected 2D array, got 1D array instead:
array=['animal has made a name for himself as a police and military dog, guide and assistance dog, search and rescue dog, and detector dog. he has excelled in every canine sport, including agility, obedience, rally, tracking and, of course, herding.'
 'animals are not ruminants, they have only one stomach, like humans, but unlike humans, they can utilize cellulose, a major component of grass. a 450-kilogram (990 lb) animal will eat 7 to 11 kilograms (15 to 24 lb) of food'
 'animals are solitary animals, and the males and females associate only during the breeding season. they are mainly ground dwellers, but can climb trees with ease and are excellent swimmers. only resident cats with established territories raise litters.'
 ...
 'animal is more an informal classification than a scientific one. experts can often distinguish animal species based merely on the appearance of their horns. some horns form spirals, others are curved, and yet others have ridges.'
 'bushmeat hunters target chimps because they provide more meat than smaller mammals. they sometimes collect their offspring as pets for themselves or to sell into the illegal pet trade. since the 1980s, the ebola virus has killed them in significant numbers.'
 'the scientific name for a black animal is animaltus animaltus. the black animal belongs to the muridae family and the classification mammalia. it is one of the largest animals in the world and is also called a house animal.'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [51]:
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

print(X_train.head())
print(y_train.value_counts())
print(y_test.value_counts())


Shape of X_train:  (5692,)
Shape of X_test:  (1423,)
1237    animal has made a name for himself as a police...
968     animals are not ruminants, they have only one ...
5827    animals are solitary animals, and the males an...
1750    as of 2015, the global wild animal population ...
6528    animals are sensitive and can become depressed...
Name: text, dtype: object
23.0    179
6.0     175
29.0    163
31.0    157
18.0    154
0.0     152
28.0    148
42.0    147
5.0     147
3.0     145
12.0    144
44.0    141
38.0    132
49.0    132
39.0    131
19.0    130
33.0    125
22.0    124
7.0     123
37.0    123
14.0    116
13.0    114
30.0    114
40.0    114
48.0    113
2.0     112
47.0    111
15.0    109
24.0    105
36.0    104
32.0    102
45.0    102
46.0     99
35.0     98
9.0      98
8.0      94
1.0      94
27.0     94
4.0      93
43.0     90
25.0     88
17.0     86
21.0     86
26.0     86
10.0     85
41.0     81
11.0     78
34.0     58
20.0     49
16.0     47
Name: class_num, dtype: int64
2

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),    
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.30      0.42      0.35        38
         1.0       0.53      0.39      0.45        23
         2.0       0.65      0.61      0.63        28
         3.0       0.46      0.57      0.51        37
         4.0       0.33      0.22      0.26        23
         5.0       0.33      0.51      0.40        37
         6.0       0.41      0.61      0.49        44
         7.0       0.35      0.61      0.45        31
         8.0       0.48      0.43      0.45        23
         9.0       0.38      0.36      0.37        25
        10.0       0.46      0.29      0.35        21
        11.0       0.50      0.30      0.37        20
        12.0       0.59      0.47      0.52        36
        13.0       0.74      0.69      0.71        29
        14.0       0.53      0.34      0.42        29
        15.0       0.47      0.56      0.51        27
        16.0       1.00      0.17      0.29        12
        17.0       0.29    

In [43]:
### utlity function for pre-processing the text
import spacy

# load english language model and create nlp object from it
nlp = spacy.load("en_core_web_sm") 

def preprocess(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    
    return " ".join(filtered_tokens)

In [44]:
df['preprocessed_txt'] = df['text'].apply(preprocess) 
df.head()


Unnamed: 0.1,Unnamed: 0,text,class,class_num,preprocessed_txt
0,0,this article is about the herbivorous mammals....,Antelope,0.0,article herbivorous mammal use animal disambig...
1,1,"one new world species, the pronghorn of north ...",Antelope,0.0,new world specie pronghorn north america collo...
2,2,"the english word ""animal"" first appeared in 14...",Antelope,0.0,english word animal appear 1417 derive old fre...
3,3,"the word talopus and calopus, from latin, came...",Antelope,0.0,word talopus calopus latin come heraldry 1607 ...
4,4,animal are not a cladistic or taxonomically de...,Antelope,0.0,animal cladistic taxonomically define group fa...


In [45]:
X_train, X_test, y_train, y_test = train_test_split(
    df.preprocessed_txt, 
    df.class_num,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.class_num
)

In [46]:
#1. create a pipeline object
clf = Pipeline([
     ('vectorizer_tfidf',TfidfVectorizer()),        #using the ngram_range parameter 
     ('Random Forest', RandomForestClassifier())         
])

#2. fit with X_train and y_train
clf.fit(X_train, y_train)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(X_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.47      0.50      0.49        38
         1.0       0.29      0.26      0.27        23
         2.0       0.76      0.68      0.72        28
         3.0       0.51      0.59      0.55        37
         4.0       0.31      0.22      0.26        23
         5.0       0.30      0.43      0.36        37
         6.0       0.41      0.57      0.48        44
         7.0       0.41      0.58      0.48        31
         8.0       0.53      0.39      0.45        23
         9.0       0.43      0.52      0.47        25
        10.0       0.44      0.19      0.27        21
        11.0       0.46      0.30      0.36        20
        12.0       0.30      0.31      0.30        36
        13.0       0.62      0.62      0.62        29
        14.0       0.67      0.41      0.51        29
        15.0       0.56      0.56      0.56        27
        16.0       0.50      0.08      0.14        12
        17.0       0.53    