In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

import contractions
from string import punctuation


In [2]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
stop_punctuation = [p for p in punctuation]
stoppers = stop_words+stop_punctuation
stoppers.remove('not')

def clean_and_tokenize(txt):
    def get_wordnet_pos(tag):
        tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

        return tag_dict.get(tag[0], wordnet.NOUN)

    sentences = sent_tokenize(txt)
    cleaned_txt = []
    for sentence in sentences:
        expanded_sentences = contractions.fix(sentence)
        tokens = [token for token in word_tokenize(expanded_sentences)]
        tagged_tokens = pos_tag(tokens)
        for token, tag in tagged_tokens:
            if len(token)>2:
                lemma_word = lemmatizer.lemmatize(token,get_wordnet_pos(tag))
                if lemma_word not in stoppers:
                    stemmed_word = stemmer.stem(lemma_word)
                    cleaned_txt.append(stemmed_word)
    return " ".join(cleaned_txt)

In [3]:
training_data = np.array([['a dog barks','dog'],
                          ['dogs are friendly animals','dog'],
                          ['her pet won\'t stop barking','dog'],
                          ['bob is waggling his tail. He is very firendly','dog'],
                          ['her dog rarely barks','dog'],
                          ['barking dogs don\'t bite','dog'],
                          ['cats are anti-social animals','cat'],
                          ['a cat snores','cat'], 
                          ['my pet sleeps all day long', 'cat'],
                          ['his pet is not a social animal','cat'],
                          ['tom snores a lot','cat'],
                          ['her cat barks','cat'],
                          ['cats sleep over 12 hours a day','cat']
                          ])

In [4]:
stoppers = stoppers +['pet', 'bob', 'tom']

In [5]:
stemmed_txt = [clean_and_tokenize(text) for text in training_data[:,0]]

In [6]:
tfidf = TfidfVectorizer(analyzer="word",
                        sublinear_tf=True, 
                        use_idf=True)

features = tfidf.fit_transform(stemmed_txt).toarray()

In [7]:
labels = training_data[:,1]

In [8]:
print(f"Total words in dictionary:{len(tfidf.vocabulary_)}")
print(f"Top 10 most frequent words:")
top_10= sorted(tfidf.vocabulary_, key=lambda x: x[1],reverse=True)[:10]
print("\t> "+"\n\t> ".join(top_10))

Total words in dictionary:21
Top 10 most frequent words:
	> stop
	> friendli
	> dog
	> not
	> soci
	> long
	> social
	> lot
	> hour
	> anim


In [9]:
clf = RandomForestClassifier(n_estimators=50,min_samples_split=2,random_state=1123)
cross_validate(clf, features, labels, cv=4)
clf.fit(features, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=1123, verbose=0,
            warm_start=False)

In [10]:
test_data = np.array([
    ['Max always waggles his tail when I arrive home. He is really friendly.','dog'],
    ['Sophie is always sleeping when I arrive home. She is not much into socialization.','cat'],
    ['Bob is so old that he can not do anything else but sleep','dog'],
    ['I like friendly animals. All-day-sleeping pets is not my thing.','cat'],
    ['Bob\'s. snoring is weird. He might be sick.','cat'],
])

In [11]:
X_test = [clean_and_tokenize(text) for text in test_data[:,0]]

In [12]:
X_test = tfidf.transform(X_test)

In [13]:
y_test = test_data[:,1]

In [14]:
y_pred = clf.predict(X_test)

In [15]:
proba = clf.predict_proba(X_test)
for i in range(test_data.shape[0]):
    print(test_data[i][0])
    print(f"Cat:{proba[i][0]}| Dog: {proba[i][1]}\n")


Max always waggles his tail when I arrive home. He is really friendly.
Cat:0.32| Dog: 0.68

Sophie is always sleeping when I arrive home. She is not much into socialization.
Cat:0.76| Dog: 0.24

Bob is so old that he can not do anything else but sleep
Cat:0.54| Dog: 0.46

I like friendly animals. All-day-sleeping pets is not my thing.
Cat:0.74| Dog: 0.26

Bob's. snoring is weird. He might be sick.
Cat:0.76| Dog: 0.24



In [16]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         cat       0.75      1.00      0.86         3
         dog       1.00      0.50      0.67         2

   micro avg       0.80      0.80      0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5



## Approach I: 
#### Add more samples

In [17]:
training_data = np.append(training_data, np.array([['old dogs tend to sleep for longer periods','dog'],
                                  ['cats spend a lot of time grooming.', 'cat']]), axis=0)

In [18]:
stemmed_txt = [clean_and_tokenize(text) for text in training_data[:,0]]
features = tfidf.fit_transform(stemmed_txt)
labels = training_data[:,1]
clf.fit(features, labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=None,
            oob_score=False, random_state=1123, verbose=0,
            warm_start=False)

In [19]:
X_test = [clean_and_tokenize(text) for text in test_data[:,0]]

In [20]:
X_test = tfidf.transform(X_test)

In [21]:
y_pred = clf.predict(X_test)
proba = clf.predict_proba(X_test)
for i in range(test_data.shape[0]):
    print(test_data[i][0])
    print(f"Cat:{proba[i][0]}| Dog: {proba[i][1]}\n")

Max always waggles his tail when I arrive home. He is really friendly.
Cat:0.4| Dog: 0.6

Sophie is always sleeping when I arrive home. She is not much into socialization.
Cat:0.74| Dog: 0.26

Bob is so old that he can not do anything else but sleep
Cat:0.66| Dog: 0.34

I like friendly animals. All-day-sleeping pets is not my thing.
Cat:0.8| Dog: 0.2

Bob's. snoring is weird. He might be sick.
Cat:0.76| Dog: 0.24



In [22]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         cat       0.75      1.00      0.86         3
         dog       1.00      0.50      0.67         2

   micro avg       0.80      0.80      0.80         5
   macro avg       0.88      0.75      0.76         5
weighted avg       0.85      0.80      0.78         5



## Approach II:
#### Extend test data

In [23]:
test_data = np.array([
    ['Max always waggles his tail when I arrive home. He is really friendly.','dog'],
    ['Sophie is always sleeping when I arrive home. She is not much into socialization.','cat'],
    ['Bob is so old that he can not do anything else but sleep. He barely barks.','dog'],
    ['I like friendly animals. All-day-sleeping pets is not my thing.','cat'],
    ['Bob\'s. snoring is weird. He might be sick.','cat'],
])
X_test = [clean_and_tokenize(text) for text in test_data[:,0]]
X_test = tfidf.transform(X_test)
y_pred = clf.predict(X_test)
proba = clf.predict_proba(X_test)
for i in range(test_data.shape[0]):
    print(test_data[i][0])
    print(f"Cat:{proba[i][0]}| Dog: {proba[i][1]}\n")

print(metrics.classification_report(y_test, y_pred))

Max always waggles his tail when I arrive home. He is really friendly.
Cat:0.4| Dog: 0.6

Sophie is always sleeping when I arrive home. She is not much into socialization.
Cat:0.74| Dog: 0.26

Bob is so old that he can not do anything else but sleep. He barely barks.
Cat:0.42| Dog: 0.58

I like friendly animals. All-day-sleeping pets is not my thing.
Cat:0.8| Dog: 0.2

Bob's. snoring is weird. He might be sick.
Cat:0.76| Dog: 0.24

              precision    recall  f1-score   support

         cat       1.00      1.00      1.00         3
         dog       1.00      1.00      1.00         2

   micro avg       1.00      1.00      1.00         5
   macro avg       1.00      1.00      1.00         5
weighted avg       1.00      1.00      1.00         5

