# Natural Language Processing in Python

## 1) Build your own spam detector

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import AdaBoostClassifier

In [3]:
data = pd.read_csv ( r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/spambase.data' ).as_matrix()
data.shape

(4600, 58)

In [5]:
np.random.shuffle(data)

In [6]:
X = data[: , :48]
Y = data[: , -1]

Xtrain = X[:-100,]
Ytrain = Y[:-100,]

Xtest = X[-100:,]
Ytest = Y[-100:,]

In [7]:
model = MultinomialNB()
model.fit(Xtrain, Ytrain)
print ( "Classification Rate : ", model.score(Xtest, Ytest))

Classification Rate :  0.89


In [8]:
model = AdaBoostClassifier()
model.fit(Xtrain, Ytrain)
print ( "Classification Rate : ", model.score(Xtest, Ytest))

Classification Rate :  0.94


## 2) Sentiment Analysis in Python

In [9]:
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from bs4 import BeautifulSoup

In [10]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = set( w.rstrip() for w in open( r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/stopwords.txt' ) )

In [11]:
positive_reviews = BeautifulSoup( open( r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/electronics/positive.review' ).read())
positive_reviews = positive_reviews.findAll('review_text')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [12]:
negative_reviews = BeautifulSoup( open( r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/electronics/negative.review' ).read())
negative_reviews = negative_reviews.findAll('review_text')



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [13]:
# to balance classes remove excess positive reviews
np.random.shuffle(positive_reviews)
positive_reviews = positive_reviews[:len(negative_reviews)]

In [14]:
word_index_map = {}
current_index = 0

positive_tokenized = []
negative_tokenized = []

def my_tokenizer(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [ t for t in tokens if len(t) > 2 ]
    tokens = [ wordnet_lemmatizer.lemmatize(t) for t in tokens ]
    tokens = [ t for t in tokens if t not in stopwords ]
    return(tokens)

for review in positive_reviews:
    tokens = my_tokenizer(review.text)
    positive_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1
            
for review in negative_reviews:
    tokens = my_tokenizer(review.text)
    negative_tokenized.append(tokens)
    for token in tokens:
        if token not in word_index_map:
            word_index_map[token] = current_index
            current_index += 1

In [15]:
def tokens_to_vector(tokens, label):
    x = np.zeros(len(word_index_map) + 1)
    for t in tokens:
        i = word_index_map[t]
        x[i] += 1
    x = x / x.sum()
    x[-1] = label
    return(x)

N = len(positive_tokenized) + len(negative_tokenized)

data = np.zeros( ( N, len(word_index_map) + 1 ) )

i = 0
for tokens in positive_tokenized:
    xy = tokens_to_vector(tokens,1)
    data[i,:] = xy
    i += 1
    
for tokens in negative_tokenized:
    xy = tokens_to_vector(tokens,0)
    data[i,:] = xy
    i += 1
    
np.random.shuffle(data)

X = data[:, :-1]
Y = data[:, -1]

Xtrain = X[:-100,]
Ytrain = Y[:-100,]

Xtest = X[-100:,]
Ytest = Y[-100:,]

In [16]:
model = LogisticRegression()
model.fit(Xtrain, Ytrain)

print("Classification Rate : ", model.score(Xtest, Ytest))

Classification Rate :  0.76


In [17]:
# find which words are important

threshold = 0.5
for word, index in word_index_map.items():
    weight = model.coef_[0][index]
    if weight > threshold or weight < - threshold:
        print(word, weight)

recommend 0.707745558383
memory 0.968073533779
card -0.61323300238
time -0.53426041347
wa -1.59332433715
perfect 1.00131542981
picture 0.617865332471
unit -0.63758900665
expected 0.536954931484
n't -2.10426609708
you 1.09425593195
buy -0.89979593573
pretty 0.734159951149
return -1.11864656115
quality 1.3463306804
bad -0.75021214785
using 0.582630802939
week -0.692866519356
look 0.530768241494
speaker 0.746991608472
doe -1.25473538039
price 2.73198951137
've 0.711045968867
highly 1.00928355649
ha 0.654121256476
excellent 1.1514568555
tried -0.737049057418
paper 0.605521420282
fast 0.859333108815
cable 0.65707258778
lot 0.726713979772
easy 1.77032371407
support -0.875286662718
space 0.576105687709
money -1.08566673061
then -1.014893976
love 1.15798006431
month -0.791614375306
comfortable 0.672198154045
bit 0.640178094671
value 0.567970348459
sound 1.06939770556
home 0.53964547643
little 0.79689666349
video 0.603595745212
fit 0.516472379457
pro 0.517785202908
try -0.698361844297
happy 0.5

## 3) NLTK Exploration

In [18]:
nltk.pos_tag('Bob is great'.split())

[('Bob', 'NNP'), ('is', 'VBZ'), ('great', 'JJ')]

In [19]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('wolves'))

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('wolves'))

wolv
wolf


In [20]:
# Named Entity Recongnition
s = "Nitish Jaipuria was born on 17th september, 1988"

tags = nltk.pos_tag(s.split())
nltk.ne_chunk(tags).draw()

## Latent Semantic Analysis

In [31]:
from sklearn.decomposition import TruncatedSVD

In [32]:
wordnet_lemmatizer = WordNetLemmatizer()

titles = [ line.rstrip() for line in open(r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/all_book_titles.txt')]
stopwords = set ( w.rstrip() for w in open(r'F:/DATA SCIENCE/Udemy/Lazy Programmer Codes/nlp_class/stopwords.txt'))

stopwords = stopwords.union({
    'introduction', 'edition', 'series', 'application',
    'approach', 'card', 'access', 'package', 'plus', 'etext',
    'brief', 'vol', 'fundamental', 'guide', 'essential', 'printed',
    'third', 'second', 'fourth', })

In [33]:
def my_tokenizer_no(s):
    s = s.lower()
    tokens = nltk.tokenize.word_tokenize(s)
    tokens = [ t for t in tokens if len(t) > 2 ]
    tokens = [ wordnet_lemmatizer.lemmatize(t) for t in tokens ]
    tokens = [ t for t in tokens if t not in stopwords ]
    tokens = [ t for t in tokens if not any(c.isdigit()) for c in t ] # removing numbers as well
    return(tokens)

In [34]:
word_index_map = {}
current_index = 0
all_tokens = []
all_titles = []
index_word_map = []

In [35]:
for title in titles:
    try:
        title = title.encode('ascii', 'ignore')
        all_titles.append(title)
        tokens = my_tokenizer_no(title)
        all_tokens.append(tokens)
        for token in tokens:
            if token not in word_index_map:
                word_index_map[token] = current_index
                current_index += 1
                index_word_map.append(token)
    except:
        pass

In [36]:
def tokens_to_vector_nolabel(tokens):
    x = np.zeros(len(word_index_map))
    for t in tokens:
        i = word_index_map[t]
        x[i] = 1
    return(x)

In [38]:
N = len(all_tokens)
D = len(word_index_map)
X = np.zeros((D,N))
i = 0

for token in all_tokens:
    X[:,i] = tokens_to_vector_nolabel(token)
    i += 1

In [None]:
svd = TruncatedSVD()
Z = svd.fit_transform(X)

In [None]:
plt.scatter(Z[:,0], Z[:,1])
for i in xrange(D):
    plt.annotate(s=index_word_map[i], xy=(Z[i,0], Z[i,1]))
plt.show()

## Article Spinning