<a href="https://colab.research.google.com/github/lovepreetmultani/python/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**Stemming with stopwords**

In [73]:
paragraph = """Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow,
    Professor Plum has a green plant in his study,
    Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."""

In [74]:
sentences_tokenized = nltk.sent_tokenize(paragraph)

In [75]:
stemmer = PorterStemmer()

In [None]:
stopwords.words('english')

In [77]:
for i in range(len(sentences_tokenized)):
  words = nltk.word_tokenize(sentences_tokenized[i])
  words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
  sentences_tokenized[i] = ' '.join(words)

In [78]:
sentences_tokenized

['mr. green kill colonel mustard studi candlestick .',
 "mr. green nice fellow , professor plum green plant studi , miss scarlett water professor plum 's green plant away offic last week ."]

**Lemmatizer with stopwords**

In [79]:
paragraph = """Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow,
    Professor Plum has a green plant in his study,
    Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."""

In [80]:
sentences_tokenized = nltk.sent_tokenize(paragraph)

In [81]:
lemmatizer = WordNetLemmatizer()

In [82]:
for i in range(len(sentences_tokenized)):
  words = nltk.word_tokenize(sentences_tokenized[i])
  words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
  sentences_tokenized[i] = ' '.join(words)

In [83]:
sentences_tokenized

['Mr. Green killed Colonel Mustard study candlestick .',
 "Mr. Green nice fellow , Professor Plum green plant study , Miss Scarlett watered Professor Plum 's green plant away office last week ."]

**Bag Of Words**

In [89]:
paragraph = """Mr. Green killed Colonel Mustard in the study with the candlestick. Mr. Green is not a very nice fellow,
    Professor Plum has a green plant in his study,
    Miss Scarlett watered Professor Plum's green plant while he was away from his office last week."""

In [90]:
sentences_tokenized = nltk.sent_tokenize(paragraph)

In [91]:
ps = PorterStemmer()
wordnet = WordNetLemmatizer()

In [93]:
corpus = []

In [99]:
for i in range(len(sentences_tokenized)):
    review = re.sub('[^a-zA-Z]', ' ', sentences_tokenized[i]) # removing all symbols
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [100]:
corpus

['mr green kill colonel mustard studi candlestick',
 'mr green kill colonel mustard studi candlestick',
 'mr green nice fellow professor plum green plant studi miss scarlett water professor plum green plant away offic last week',
 'mr green killed colonel mustard study candlestick',
 'mr green nice fellow professor plum green plant study miss scarlett watered professor plum green plant away office last week']

In [105]:
# Creating the Bag of Words model
#from sklearn.feature_extraction.text import CountVectorizer
#cv = CountVectorizer()
#X = cv.fit_transform(corpus).toarray()

# Creating the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
X = cv.fit_transform(corpus).toarray()

In [106]:
X

array([[0.        , 0.39333024, 0.39333024, 0.        , 0.27985771,
        0.47384029, 0.        , 0.        , 0.        , 0.27985771,
        0.39333024, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.39333024, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.39333024, 0.39333024, 0.        , 0.27985771,
        0.47384029, 0.        , 0.        , 0.        , 0.27985771,
        0.39333024, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.39333024, 0.        ,
        0.        , 0.        , 0.        ],
       [0.19518032, 0.        , 0.        , 0.19518032, 0.34582993,
        0.        , 0.        , 0.19518032, 0.19518032, 0.11527664,
        0.        , 0.19518032, 0.24192098, 0.        , 0.39036065,
        0.39036065, 0.39036065, 0.19518032, 0.1620173 , 0.        ,
        0.24192098, 0.        , 0.19518032],
       [0.        , 0.36053075, 0.36053075, 0.   