In [None]:
# Natural Language Toolkit (NLTK) for Python
#
# Installation
#
# sudo pip install nltk
#
# import nltk
# nltk.download()

# Steps in text processing include:
#     * Noise Removal
#     * Lexicon Normalization
#     * Object Standardization

# Raw text --> stopwords, hashtags, URI, etc removal --> tokenization, lemmatisation & stemming --> regex & table lookups --> cleaned text 


In [1]:
# Stopwords removal

# Tokens treated as noise; matching is exact and case-sensitive.
noise_list = ["is", "a", "this", "..."]


def _remove_noise(input_text):
    """Strip noise tokens from a piece of text.

    The text is split on whitespace, every token that appears in the
    module-level ``noise_list`` is discarded, and the survivors are glued
    back together with single spaces.

    Args:
        input_text: String to clean.

    Returns:
        The cleaned string (runs of whitespace collapse to one space).
    """
    kept = []
    for token in input_text.split():
        if token in noise_list:
            continue
        kept.append(token)
    return " ".join(kept)

_remove_noise("this is a sample text")

'sample text'

In [2]:
# hashtag removal using regex

import re 

def _remove_regex(input_text, regex_pattern):
    """Delete every match of ``regex_pattern`` from ``input_text``.

    Only the non-whitespace core of each match is removed: whitespace that
    the pattern matched at the very start or end of a match is kept, and a
    match consisting solely of whitespace is left untouched.

    The substitution is done in a single pass over the text, so the matched
    text is never re-interpreted as a regular expression (matches containing
    ``?``, ``(``, ``.`` etc. are removed literally) and a short match is not
    removed from inside a longer one.

    Args:
        input_text: String to clean.
        regex_pattern: Pattern (string or compiled) describing what to drop.

    Returns:
        ``input_text`` with the matched spans removed.
    """
    def _blank_out(match):
        matched = match.group()
        if not matched.strip():
            # Empty or whitespace-only match: nothing to remove.
            return matched
        leading = matched[:len(matched) - len(matched.lstrip())]
        trailing = matched[len(matched.rstrip()):]
        return leading + trailing

    return re.sub(regex_pattern, _blank_out, input_text)

# Raw string so that ``\w`` reaches the regex engine instead of being treated
# as an (invalid) string escape sequence.
regex_pattern = r"#[\w]*"

_remove_regex("remove this #hashtag please", regex_pattern)

'remove this  please'

In [5]:
# Stemming is a rudimentary rule-based process of stripping the suffixes (“ing”, “ly”, “es”, “s” etc) from a word.

# “play”, “player”, “played”, “plays” and “playing” are different variations of the word “play”. Although their meanings differ, they are contextually similar. This step converts all such variants of a word into a normalized form: lemmatization returns the dictionary form (the lemma), whereas stemming returns a truncated stem that need not be a real word.

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# `lem`, `stem` and `word` are notebook-level names, so they are kept as-is.
lem, stem = WordNetLemmatizer(), PorterStemmer()
word = "multiplying"

# The lemma is printed explicitly because only the last expression of a cell
# is displayed; "v" asks the lemmatizer to treat the word as a verb.
print('Lemma: ' + lem.lemmatize(word, "v"))

# Last expression of the cell: the Porter stem is shown as the cell output.
stem.stem(word)

Lemma: multiply


u'multipli'

In [9]:
# Object standardisation

# Lower-cased shorthand -> replacement text.
lookup_dict = {'rt':'Retweet', 'dm':'direct message', "awsm" : "awesome", "luv" :"love"}


def _lookup_words(input_text):
    """Expand known shorthand tokens in ``input_text``.

    Each whitespace-separated token is lower-cased and looked up in the
    module-level ``lookup_dict``; a hit is replaced by the mapped text, a
    miss is kept exactly as written.

    Args:
        input_text: String to standardise.

    Returns:
        The tokens re-joined with single spaces.
    """
    return " ".join(
        lookup_dict.get(token.lower(), token) for token in input_text.split()
    )

_lookup_words("RT this is a retweeted tweet by OCL")

'Retweet this is a retweeted tweet by OCL'

In [11]:
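# Feature identification & Part of Speech tagging

# The same word can play different grammatical roles depending on context, and part-of-speech tagging makes that role explicit. For example, "book" is a verb in the first sentence below and a noun in the second:

# "Please book my flight for Delhi"

# "I am going to read this book in the flight"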

from nltk import word_tokenize, pos_tag

text = "I am learning Natural Language Processing with Suraj Jana"
tokens = word_tokenize(text)

# print is called as a function: the statement form `print x` is a
# SyntaxError on Python 3, while this form prints the same list of
# (token, tag) pairs on both Python 2 and Python 3.
print(pos_tag(tokens))

[('I', 'PRP'), ('am', 'VBP'), ('learning', 'VBG'), ('Natural', 'NNP'), ('Language', 'NNP'), ('Processing', 'VBG'), ('with', 'IN'), ('Suraj', 'NNP'), ('Jana', 'NNP')]


In [13]:
# List of all pos_tags
import nltk

# Prints each tag of the tagset followed by a short description and example
# tokens (see the captured output below); the listing is long.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [14]:
# Levenshtein Distance

def levenshtein(s1,s2):
    """Compute the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions, deletions
    and substitutions needed to turn one sequence into the other. Only two
    rows of the dynamic-programming table are held at any time.

    Args:
        s1: First sequence (anything supporting ``len`` and iteration).
        s2: Second sequence.

    Returns:
        The edit distance as an integer.
    """
    # Lay the shorter sequence along the row so each row stays small.
    shorter, longer = (s2, s1) if len(s1) > len(s2) else (s1, s2)

    previous_row = list(range(len(shorter) + 1))
    for row_number, longer_item in enumerate(longer, 1):
        current_row = [row_number]
        for column, shorter_item in enumerate(shorter):
            if shorter_item != longer_item:
                # Cheapest of substitution, deletion and insertion, plus one edit.
                cheapest = min(
                    previous_row[column],
                    previous_row[column + 1],
                    current_row[-1],
                )
                current_row.append(cheapest + 1)
            else:
                # Items agree: carry the diagonal cost over unchanged.
                current_row.append(previous_row[column])
        previous_row = current_row
    return previous_row[-1]

print(levenshtein("analyze","analyse"))

1


In [27]:
# NLTK NER Chunker

from nltk import word_tokenize, pos_tag, ne_chunk
 
sentence = "Mark and Rahul are working at OCL."

# Tokenise -> POS-tag -> named-entity chunk, then print the result.
# print is called as a function because the statement form `print x` is a
# SyntaxError on Python 3; the parenthesised form works on Python 2 as well.
print(ne_chunk(pos_tag(word_tokenize(sentence))))

(S
  (PERSON Mark/NNP)
  and/CC
  (PERSON Rahul/NNP)
  are/VBP
  working/VBG
  at/IN
  (ORGANIZATION OCL/NNP)
  ./.)


In [30]:
# N-grams

def generate_ngrams(text, n):
    """Return every run of ``n`` consecutive whitespace-separated words.

    Args:
        text: String to split on whitespace.
        n: Size of each n-gram; must be at least 1.

    Returns:
        A list of n-grams in order of appearance, each one a list of ``n``
        words. The list is empty when ``text`` has fewer than ``n`` words.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        # A non-positive size would otherwise slice the word list into empty
        # or irregular pieces without any error.
        raise ValueError("n must be a positive integer, got %r" % (n,))
    words = text.split()
    return [words[start:start + n] for start in range(len(words) - n + 1)]

generate_ngrams('this is a sample text', 2)

[['this', 'is'], ['is', 'a'], ['a', 'sample'], ['sample', 'text']]