# Natural language processing
# --------------------------------

#### Tokenization :

In [1]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer

In [3]:
text = "Are you curious about tokenization? Let's see how it works! We need to analyze a couple of sentences with punctuations to see it in action."

In [4]:
sent_tokenize_list = sent_tokenize(text)
print(sent_tokenize_list)

['Are you curious about tokenization?', "Let's see how it works!", 'We need to analyze a couple of sentences with punctuations to see it in action.']


In [5]:
word_tokenize_list = word_tokenize(text)
print(word_tokenize_list)

['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'s", 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


In [6]:
word_punkt_tokenizer = WordPunctTokenizer()
print(word_punkt_tokenizer.tokenize(text))

['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'", 's', 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


# ------------------------------------------------

## Stemming Text Data

In [7]:
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer

words=['table','probably','wolves','playing','is','cats','the','beaches','grounded','Commonly','envision']
stemmers=['PORTER','LANCASTER','SNOWBALL']

lancasterStemmer = LancasterStemmer()
porterStemmer = PorterStemmer()
snowballStemmer = SnowballStemmer('english')

for word in words:
    stemmedWord = [lancasterStemmer.stem(word), porterStemmer.stem(word), snowballStemmer.stem(word)]
    print(word,stemmedWord)

table ['tabl', 'tabl', 'tabl']
probably ['prob', 'probabl', 'probabl']
wolves ['wolv', 'wolv', 'wolv']
playing ['play', 'play', 'play']
is ['is', 'is', 'is']
cats ['cat', 'cat', 'cat']
the ['the', 'the', 'the']
beaches ['beach', 'beach', 'beach']
grounded ['ground', 'ground', 'ground']
Commonly ['common', 'commonli', 'common']
envision ['envid', 'envis', 'envis']


### Lemmatization of word

In [8]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
 
for word in words:
    lemmatizedWord = [wnl.lemmatize(word, pos='v'), wnl.lemmatize(word, pos='n')]
    print(word,"      \t-->", *lemmatizedWord)

table       	--> table table
probably       	--> probably probably
wolves       	--> wolves wolf
playing       	--> play playing
is       	--> be is
cats       	--> cat cat
the       	--> the the
beaches       	--> beach beach
grounded       	--> ground grounded
Commonly       	--> Commonly Commonly
envision       	--> envision envision


### Chunking word

In [9]:
from nltk.corpus import brown
import numpy as np

def splitter(data, n):
    words = data.split(" ")
    output = []
    c_count = 0
    c_words = []
    for word in words:
        c_words.append(word)
        c_count += 1
        if(c_count == n):
            output.append(" ".join(c_words))
            c_count = 0
            c_words = []
    output.append(" ".join(c_words))
    return(output)

data = " ".join(brown.words()[:10000])
# print(data)
n = 1700
output = splitter(data, n)
print(len(output))
# print(output)

6


### Building a text classifier

In [20]:
# WE WILL COVER IT AFTER DEATH


# from sklearn.datasets import fetch_20newsgroups

# categoryMap = {'misc.forsale':'Sales', 'rec.motorcycles' : 'Motorcycles', 'rec.sport.baseball' : 'Baseball', 'sci.crypt':'Cryptography', 'sci.space' : 'Space'}
# trainingData = fetch_20newsgroups(subset='train', categories=categoryMap.keys(), shuffle=True, random_state=7)
# print(trainingData)

In [22]:
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

def gender_features(word,n=2):
    return ({'feature': word[-n:].lower()})
labels=[(name,'male') for name in names.words('male.txt')]+[(name,'female') for name in names.words('female.txt')]
print(labels[1:3])
#random.seed(7)
#random.shuffle(labels)
data=['Leonardo','Amy','Sem','levely','king']


feature_set=[(gender_features(n,3),gender) for (n,gender) in labels]
training,testing=feature_set[500:],feature_set[200:]
classifier=NaiveBayesClassifier.train(training)
print(accuracy(classifier,testing)*100)
for name in data:
    print(name,'-->',classifier.classify(gender_features(name,5)))


[('Aaron', 'male'), ('Abbey', 'male')]
84.81404958677686
Leonardo --> female
Amy --> female
Sem --> female
levely --> female
king --> female


In [29]:
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

def genderFeature(names, n=2):
    return ({'feature' : word[-1:].lower()})

labels = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]
print(labels[1:3])
data=['Leonardo','Amy','Sem','levely','king']
feature_set = [(genderFeature(n, 3),gender) for (n, gender) in labels]
training, testing = feature_set[500:], feature_set[200:]
classifier = NaiveBayesClassifier.train(training)
print(accuracy(classifier, testing)*100)

for name in data:
    print(name, "     \t--> ", classifier.classify(genderFeature(name,5)))

[('Aaron', 'male'), ('Abbey', 'male')]
63.09400826446281
Leonardo      	-->  female
Amy      	-->  female
Sem      	-->  female
levely      	-->  female
king      	-->  female


In [33]:
# Second using random function
import random
from nltk.corpus import names
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy

def genderFeature(word, n=2):
    return ({'feature': word[-n:].lower()})

labels = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')])
random.seed(7)
random.shuffle(labels)
data=['Leonardo','Amy','Sem','levely','king']

for i in range(1,5):
    print("\n\n",i)
    feature_set = [(genderFeature(n,i), gender) for (n, gender) in labels]
    training, testing = feature_set[500:], feature_set[:500]
    classifier = NaiveBayesClassifier.train(training)
    print(accuracy(classifier, testing)*100)
    for name in data:
        print(name, "    \t--> ", classifier.classify(genderFeature(name, i)))



 1
76.2
Leonardo     	-->  male
Amy     	-->  female
Sem     	-->  male
levely     	-->  female
king     	-->  male


 2
78.60000000000001
Leonardo     	-->  male
Amy     	-->  female
Sem     	-->  male
levely     	-->  female
king     	-->  male


 3
76.6
Leonardo     	-->  male
Amy     	-->  female
Sem     	-->  female
levely     	-->  female
king     	-->  male


 4
70.8
Leonardo     	-->  male
Amy     	-->  female
Sem     	-->  female
levely     	-->  female
king     	-->  male


## Positive || Negative

In [41]:
import nltk.classify.util 
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

def extract_features(wordlist):
    return dict([(word,True) for word in wordlist])

#retrieving positive and negative fileids from nltk.movie_reviews
p_fields=movie_reviews.fileids('pos')
n_fields=movie_reviews.fileids('neg')

#saperating fields into positive and negative reviews
f_positive=[(extract_features(movie_reviews.words(fileids=[f])),"Positive") for f in p_fields]
f_negative=[(extract_features(movie_reviews.words(fileids=[f])),"Negative") for f in n_fields]

#dividing data into training and testing datasets
factor=0.8
fPositive=int(factor*len(f_positive))
fNegative=int(factor*len(f_negative))

#extract the features
f_train=f_positive[:fPositive] + f_negative[:fNegative]
f_test=f_positive[fPositive:] + f_negative[fNegative:]

#using naivebase classifier for classification
classifier=NaiveBayesClassifier.train(f_train)
print("Accuracy if a classifier : ",end=" ")
print(nltk.classify.util.accuracy(classifier,f_test))
print("="*80)

#extracting most informative words  from classifier
print("Top 10 Most Informative Words:")
for i in classifier.most_informative_features()[:10]:
    print(i[0])
print("="*80)
    
data=['it is an amazing movie','this is a dull movie. i will never recommend it to anyone.',
      'the cinemetography is pretty great in this movie.',
      'the direction was terrible and the story was all over the place.','SAHO is the best movie.',
     'She is bad girl.']
for i in data:
    prob_list=classifier.prob_classify(extract_features(i.split(" ")))
    pred_sentiment=prob_list.max()
    print(i," \t=>",pred_sentiment)

Accuracy if a classifier :  0.735
Top 10 Most Informative Words:
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
affecting
animators


AttributeError: 'str' object has no attribute 'color'