# Stemming

### Reducing a word to its word stem or to the roots of the words(lemma)

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:
paragraph = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. The goal is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled 'Computing Machinery and Intelligence' which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence."

In [3]:
#tokanize
sentences = nltk.sent_tokenize(paragraph)

In [4]:
sentences

['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 "The goal is a computer capable of 'understanding' the contents of documents, including the contextual nuances of the language within them.",
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Natural language processing has its roots in the 1950s.',
 "Already in 1950, Alan Turing published an article titled 'Computing Machinery and Intelligence' which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated interpretation and generation of natural language, but at the time not articulated as a problem separate from artificial intelligence."]

In [5]:
sentences_stem = sentences

In [6]:
#stemming
stemmer = PorterStemmer()

#removing stop words and then stemming
for i in range(len(sentences_stem)):
    words = nltk.word_tokenize(sentences_stem[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences_stem[i] = ' '.join(words)

In [7]:
sentences_stem

['natur languag process ( nlp ) subfield linguist , comput scienc , artifici intellig concern interact comput human languag , particular program comput process analyz larg amount natur languag data .',
 "the goal comput capabl 'understand ' content document , includ contextu nuanc languag within .",
 'the technolog accur extract inform insight contain document well categor organ document .',
 'natur languag process root 1950 .',
 "alreadi 1950 , alan ture publish articl titl 'comput machineri intellig ' propos call ture test criterion intellig , task involv autom interpret gener natur languag , time articul problem separ artifici intellig ."]

### In the above output, not all the words make sense. In order to convert out words into their root words while they still make complete sense, we use *lemmatization*.

# Lemmatization

In [8]:
from nltk.stem import WordNetLemmatizer

In [9]:
lemmatizer = WordNetLemmatizer()

In [10]:
sentences_lemmatize = sentences

In [11]:
for i in range(len(sentences_lemmatize)):
    words = nltk.word_tokenize(sentences_lemmatize[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sentences_lemmatize[i] = ' '.join(words)

In [12]:
sentences_lemmatize

['natur languag process ( nlp ) subfield linguist , comput scienc , artifici intellig concern interact comput human languag , particular program comput process analyz larg amount natur languag data .',
 "goal comput capabl 'understand ' content document , includ contextu nuanc languag within .",
 'technolog accur extract inform insight contain document well categor organ document .',
 'natur languag process root 1950 .',
 "alreadi 1950 , alan ture publish articl titl 'comput machineri intellig ' propos call ture test criterion intellig , task involv autom interpret gener natur languag , time articul problem separ artifici intellig ."]