In [None]:
import nltk

In [None]:
import sys
import sklearn

print('Python: {}'.format(sys.version))
print('nltk: {}'.format(nltk.__version__))
print('Sklearn: {}'.format(sklearn.__version__))

 ### corpus
 (literally Latin for body) refers to a collection of texts.
 ### lexicon 
 words and their meaning
 ### token
 Python breaks each logical line into a sequence of elementary lexical components known as tokens. Each token corresponds to a substring of the logical line. The normal token types are identifiers, keywords, operators, delimiters, and literals

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

text ='Hello students, how are you doing today? The olympics are inspiring and python is awesome. You look great today.'
print(sent_tokenize(text))

In [None]:
print(word_tokenize(text))

In [None]:
#removing stop words -useless data
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

In [None]:
example = 'This is some sample text, showing off some stopwords filtration.'
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example)
sentence = [w for w in word_tokens if not w in  stop_words]
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)
        
print(word_tokens)
print(sentence)
print(filtered_sentence)

In [None]:
#stemming words with NLTK
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = ['ride', 'riding','rider','rides']
for w in example_words:
    print(ps.stem(w))

In [None]:
#stemming an entire sentence
new_text = 'When riders are riding their horses, thry often think of cowboys rode horses.'
words = word_tokenize(new_text)

for w in words:
    print(ps.stem(w))

In [None]:
from nltk.corpus import stopwords

In [None]:
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

In [None]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer


train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')


In [None]:
print(train_text)

In [None]:
# now we have some tet, we can train the Punktsentencetokenizer

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [None]:
#now lets tokenize th sample text

tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [None]:
print(tokenized)


In [None]:
#define a function that will tag each tokenized word with a part of speech

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
            
    except Exception as e:
        print(str(e))
        
process_content()        

In [None]:
nltk.help.upenn_tagset()

In [None]:
nltk.download()

In [None]:
nltk.help.upenn_tagset()

In [None]:
#chunking with NLTK

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

#define a function that will tag each tokenized word with a part of speech

def process_content():
    try:
        for i in tokenized[:50]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            #combine the part of speech tag with regular expression
            chunkGram = r"""Chunk: {<RB.?>*<VG.?>*<NNP>+<NN>?}"""
            chunkParser = nltk.RegexpParser(chunkGram) 
            chunked = chunkParser.parse(tagged)
            
            #print the nltk tree
            for subtree in chunked.subtrees(filter = lambda t: t.label() == 'Chunk'):
                print(subtree)
            #draw the chunks with nltk
            chunked.draw()
            
            
    except Exception as e:
        print(str(e))
        
process_content()    

In [None]:
# chinking the NLTK
train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

#define a function that will tag each tokenized word with a part of speech

def process_content():
    try:
        for i in tokenized[:20]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            #combine the part of speech tag with regular expression
            chunkGram = r"""Chunk: {<.*>+}
                                          }<VG.?|IN|DT|TO>+{"""
            chunkParser = nltk.RegexpParser(chunkGram) 
            chunked = chunkParser.parse(tagged)
            
            #print the nltk tree
             # print(chunked)
            for subtree in chunked.subtrees(filter = lambda t: t.label() == 'Chunk'):
                print(subtree)
            #draw the chunks with nltk
            chunked.draw()
            
            
    except Exception as e:
        print(str(e))
        
process_content()    

In [None]:
def process_content():
    try:
        for i in tokenized[:20]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary = False)
            
            #draw the chunks with nltk
            namedEnt.draw()
            
            
    except Exception as e:
        print(str(e))
        
process_content()    

### Text classification using NLTK

In [3]:
import random
import nltk
from nltk.corpus import movie_reviews


In [4]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [None]:
# build list of documents
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

#shuffle the documents
random.shuffle(documents)

print('Number of Documents: {}'.format(len(documents)))
print('First Reviews: {}'.format(documents[0]))

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower)
    
all_words = nltk.FreqDist(all_words)

print('most common words: {}'.format(all_words.most_common(15)))
print('The word happy: {}'.format(all_words["happy"]))


In [6]:
print(len(all_words))

1583820


In [8]:
#we will use the 4000 most common words as features
word_features = list(all_words)[:4000]


In [18]:
#Build a find_features function that will determine which of  the 4000 words features are contained in the review
def find_features(document):
    words = set(document)
    features = {}
    
    
    for w in word_features:
        features[w] = (w in words)
        
    return features

#let use an example from a negative review

features = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key, value in features.items():
    if value == True:
        print(key)

In [None]:
print(features)

In [21]:
#now lets do it for all the documents
featuresets = [(find_features(rev),category) for(rev, category) in documents]

In [23]:
#we can split the featuresets into training and testing dataset using sklearn
from sklearn import model_selection

#define a seed for reproducibility
seed = 1

#split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [24]:
print(len(training))
print(len(testing))

1500
500


In [29]:
#how  we use sklearn algorithm in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [33]:
model = SklearnClassifier(SVC(kernel = 'linear'))

In [None]:
model.train(training)

In [None]:
#test on the training dataset
accuracy = nltk.classify.accuracy(model, testing)
print('SVC Accuracy: {}'.format(accuracy))