## <span style = "color:maroon">Natural Language Processing</span>

In [1]:
import nltk

In [162]:
paragraph = """A police inspector in Uttar Pradesh's Deoria was caught masturbating at a woman and her daughter when they visited
the police station to lodge a complaint on June 22. Irate over SHO's repeated misbehaviour over days, she recorded a video that 
went viral on Tuesday.
Superintendent of police, Deoria, Shripati Mishra said that an FIR has been lodged against errant inspector Bhishmpal Singh Yadav 
in Bhatni police station on the victim's complaint under the charges of voyeurism, an act intended to outrage the modesty of a 
woman, and public servant disobeying law. "Yadav was suspended on June 26," said the officer. According to the woman's complaint,
she had gone to the police station on 22 June in a land dispute case with her daughter. During that time, Bhatni SHO Bhishmpal 
Singh Yadav was sitting in his office.
The woman in her complaint further alleged that Yadav started doing vulgar acts while talking about the land dispute, during 
which her daughter made a video of the act and showed it to other members of her family.
Later, a person living in the neighbourhood forwarded the video and it went viral on social media.
The viral clip of the incident has caused controversy in the area and locals called for action against the officer."""

### 1. Tokenization

In [58]:
sent = nltk.sent_tokenize(paragraph)

In [3]:
words = nltk.word_tokenize(paragraph)

### 2. Stemming

In [8]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [8]:
stemmer = PorterStemmer()

In [17]:
# stemming process
for i in range(len(sent)):
    words = nltk.word_tokenize(sent[i])
    words = [stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sent[i] = ' '.join(words)

In [18]:
sent

['the march washington put pressur kennedi administr advanc civil right legisl congress .',
 'the diari arthur M. schlesing jr. , publish posthum 2007 , suggest presid kennedi concern march fail attract larg number demonstr , might undermin civil right effort .',
 'In wake speech march , king name man year time magazin 1963 , 1964 , youngest man ever award nobel peac prize .',
 "[ 54 ] the full speech appear write august 1983 , 15 year king 's death , transcript publish the washington post ."]

### 3. Lemmatization

Run tokenization code again 

In [178]:
from nltk import WordNetLemmatizer

In [179]:
lemmatizer = WordNetLemmatizer()

In [46]:
for i in range(len(sent)):
    words = nltk.word_tokenize(sent[i])
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    sent[i] = ' '.join(words)

### 4. Bag of words implementation

Run the code for tokenization again 

In [7]:
import re

In [61]:
# Import PorterStemmer and WordNetLemmatizer and create its objects
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [64]:
# Implementing bag of words using stemmer
corpus = []
for i in range(len(sent)):
    review = re.sub('[^A-Za-z]', ' ', sent[i])
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [66]:
# Implementing the bag of words using a Lemmatizer
corpus = []
for i in range(len(sent)):
    review = re.sub('[^a-zA-Z]',' ', sent[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [69]:
# Using count vectorizer to create a vector of bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()

In [71]:
X.shape

(4, 49)

### 5. TF-IDF implementation

Run the code for tokenization again 

In [72]:
corpus = []
for i in range(len(sent)):
    review = re.sub('[^A-Za-z]', ' ', sent[i])
    review = review.lower()
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()

### 6. Word2Vec implementation

In [183]:
from gensim.models import Word2Vec

#### 6.1 Data Preprocessing using RegEx

In [169]:
message = re.sub('"',"", paragraph)
message = re.sub(r'\s+', ' ', message)
message = message.lower()
message = re.sub(r'\d', ' ', message)
message = re.sub(r'\s+',' ', message)

#### 6.2 Tokenization 

In [174]:
sent = nltk.sent_tokenize(message)
sent = [nltk.word_tokenize(sen) for sen in sent]

In [180]:
for i in range(len(sent)):
    sent[i] = [lemmatizer.lemmatize(word) for word in sent[i] if word not in set(stopwords.words('english'))]

#### 6.3 Creating the Word2vec model

In [184]:
model = Word2Vec(sent, min_count=1)

In [194]:
words = model.wv.vocab

In [192]:
#vector of the word 'police'
vector = model.wv['police']
vector.shape

In [193]:
# words most similar to the word 'masturbating'
model.wv.most_similar('masturbating')

[('later', 0.2736692428588867),
 ('woman', 0.24795719981193542),
 ('repeated', 0.24496421217918396),
 ('clip', 0.2358691394329071),
 ('office', 0.192185640335083),
 ('officer', 0.18891502916812897),
 ('according', 0.17891228199005127),
 ('.', 0.1730872243642807),
 ('sitting', 0.17226003110408783),
 ('errant', 0.15515246987342834)]