## Stemming vs lemmatization

In [1]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
snow_stemmer = SnowballStemmer(language='english')
porter_stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
#list of tokenized words
words = ['cared','university','mice','easily','singing',
	'language','corpora','singer','sportingly','rocks']

#stem's of each word
stem_words_snowball = []
stem_words_porter = []
stem_words_lemma = []

In [3]:
for w in words:
  xs = snow_stemmer.stem(w)
  xp = porter_stemmer.stem(w)
  xl = lemmatizer.lemmatize(w)
  
  stem_words_snowball.append(xs)
  stem_words_porter.append(xp)
  stem_words_lemma.append(xl)

In [4]:
print(f"{'word':15}{'snow_ball':15}{'porter':15}{'lemma':15}")
print('-'*60)
for word,snow_ball,porter,lemma in zip(words,stem_words_snowball,stem_words_porter,stem_words_lemma):
  print(f"{word:15}{snow_ball:15}{porter:15}{lemma:15}")

word           snow_ball      porter         lemma          
------------------------------------------------------------
cared          care           care           cared          
university     univers        univers        university     
mice           mice           mice           mouse          
easily         easili         easili         easily         
singing        sing           sing           singing        
language       languag        languag        language       
corpora        corpora        corpora        corpus         
singer         singer         singer         singer         
sportingly     sport          sportingli     sportingly     
rocks          rock           rock           rock           


In [5]:
corpus = 'Hi Machine learning aspirants. Hope you are doing great. NLP is good.'

sent_tokens = nltk.sent_tokenize(corpus)
word_tokens = nltk.word_tokenize(corpus)

print(sent_tokens,word_tokens)

['Hi Machine learning aspirants.', 'Hope you are doing great.', 'NLP is good.'] ['Hi', 'Machine', 'learning', 'aspirants', '.', 'Hope', 'you', 'are', 'doing', 'great', '.', 'NLP', 'is', 'good', '.']


In [74]:
from nltk.corpus import stopwords
import re

corpus = "You cannot believe in god until you believe in yourself. The greatest sin is to think that you are weak. Believe in yourself and the world will be at your feet." #quotes of swami vivekananda
pattern = r'[^a-zA-Z\s]'

corpus = nltk.sent_tokenize(corpus)      #converting paragraph into sentences

corpus_cleaned = []
words = []
for sentence in corpus:
    sent_cleaned = re.sub(pattern,'',sentence)
    sent = ''
    word_tokens = nltk.word_tokenize(sent_cleaned)
    words_filtered = [word.lower() for word in word_tokens if not word.lower() in stop_words]
    sent = ' '.join(words_filtered)
    corpus_cleaned.append(sent)
    words.extend(words_filtered)

words_set = sorted(set(words))

print(corpus_cleaned)
print(words_set)

bow = dict([(a,words.count(a)) for a in words])
print(bow)

['believe god believe', 'greatest sin think weak', 'believe world feet']
['believe', 'feet', 'god', 'greatest', 'sin', 'think', 'weak', 'world']
{'believe': 3, 'god': 1, 'greatest': 1, 'sin': 1, 'think': 1, 'weak': 1, 'world': 1, 'feet': 1}


### Bag of Words

In [75]:
vectors = {}.fromkeys(corpus_cleaned, [])
for sentence in vectors:
    vector = []
    for word in words_set:
        if word in sentence:
            vector.append(bow[word])
        else:
            vector.append(0)
    vectors[sentence] = vector
vectors['bag of words']= words_set
print(vectors)

{'believe god believe': [3, 0, 1, 0, 0, 0, 0, 0], 'greatest sin think weak': [0, 0, 0, 1, 1, 1, 1, 0], 'believe world feet': [3, 1, 0, 0, 0, 0, 0, 1], 'bag of words': ['believe', 'feet', 'god', 'greatest', 'sin', 'think', 'weak', 'world']}
