In [16]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

import contractions
from string import punctuation


In [17]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')
stop_punctuation = [p for p in punctuation]
stoppers = stop_words+stop_punctuation
stoppers.remove('not')

def clean_and_tokenize(txt):
    def get_wordnet_pos(tag):
        tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

        return tag_dict.get(tag[0], wordnet.NOUN)

    sentences = sent_tokenize(txt)
    cleaned_txt = []
    for sentence in sentences:
        expanded_sentences = contractions.fix(sentence)
        tokens = [token for token in word_tokenize(expanded_sentences)]
        tagged_tokens = pos_tag(tokens)
        for token, tag in tagged_tokens:
            if len(token)>2:
                lemma_word = lemmatizer.lemmatize(token,get_wordnet_pos(tag))
                if lemma_word not in stoppers:
                    stemmed_word = stemmer.stem(lemma_word)
                    cleaned_txt.append(stemmed_word)
    return " ".join(cleaned_txt)

In [18]:
training_data = np.array([['a dog barks','dog'],
                          ['dogs are friendly animals','dog'],
                          ['her pet won\'t stop barking','dog'],
                          ['bob is waggling his tail','dog'],
                          ['her dog rarely barks','dog'],
                          ['my dog barks','dog'],
                          ['cats are anti-social animals','cat'],
                          ['a cat snores','cat'], 
                          ['my pet sleeps all day long', 'cat'],
                          ['his pet is not really social','cat'],
                          ['tom snores a lot','cat'],
                          ['her cat barks','cat']
                          ])

In [19]:
stoppers = stoppers +['pet', 'bob', 'tom']

In [20]:
stemmed_txt = [clean_and_tokenize(text) for text in training_data[:,0]]

In [21]:
tfidf = TfidfVectorizer(analyzer="word",
                       sublinear_tf=True, 
                        use_idf=True,
                        norm='l2',
                        ngram_range=(1,2))

features = tfidf.fit_transform(stemmed_txt).toarray()

In [22]:
labels = training_data[:,1]

In [23]:
print(f"Total words in dictionary:{len(tfidf.vocabulary_)}")
print(f"Top 10 most frequent words:")
top_10= sorted(tfidf.vocabulary_, key=lambda x: x[1],reverse=True)[:10]
print("\t> "+"\n\t> ".join(top_10))

Total words in dictionary:37
Top 10 most frequent words:
	> stop
	> stop bark
	> friendli
	> friendli anim
	> dog
	> dog bark
	> dog friendli
	> not
	> not stop
	> dog rare


In [24]:
clf = RandomForestClassifier()
clf.fit(features, labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
test_data = np.array([
    ['Max always waggles his tail when I arrive home. He is really friendly.','dog'],
    ['Sophie is always sleeping when I arrive home. She is not much into socialization.','cat'],
    ['My dog is so old that he can not do anything else but sleep.','dog'],
    ['I like friendly animals. All-day-sleeping pets is not my thing.','dog']
])

In [26]:
X_test = [clean_and_tokenize(text) for text in test_data[:,0]]

In [27]:
X_test = tfidf.transform(X_test)

In [28]:
y_test = test_data[:,1]

In [29]:
y_pred = clf.predict(X_test)

In [30]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         cat       0.50      1.00      0.67         1
         dog       1.00      0.67      0.80         3

   micro avg       0.75      0.75      0.75         4
   macro avg       0.75      0.83      0.73         4
weighted avg       0.88      0.75      0.77         4

