### Lexical Preprocessing

#### Regular Expression

In [None]:
import re

In [None]:
# Scan the sentence for the first place where the literal pattern "is" occurs.
match = re.search("is", "Pawan is learning ML which is a good thing")

In [None]:
# Show the result of the search above (a Match object, or None when nothing matched).
print(match)

In [None]:
def str_pattern(text, pattern):
    """Return the first substring of ``text`` matched by ``pattern``.

    Args:
        text: String to search.
        pattern: Regular expression (pattern string or compiled pattern).

    Returns:
        The matched substring, or the literal string "not found" when the
        pattern does not occur anywhere in ``text``.
    """
    # Search once and reuse the Match object instead of scanning the text twice.
    found = re.search(pattern, text)
    if found is None:
        return "not found"
    return found.group()

In [None]:
# "*" means zero or more
# "?" means zero or one
# "+" means one or more
# "{m,n}" means matches at least m times and at most n times
# Anchors
# "^" start of the string
# "$" end of the string
# Wildcard
# "." any character

### Meta sequences

| Pattern  | Equivalent to    |
|----------|------------------|
| \s       | [ \t\n\r\f\v]    |
| \S       | [^ \t\n\r\f\v]   |
| \d       | [0-9]            |
| \D       | [^0-9]           |
| \w       | [a-zA-Z0-9_]     |
| \W       | [^a-zA-Z0-9_]    |

In [None]:
# Raw string so the backslash in \d reaches the regex engine untouched; in a
# plain string literal "\d" is an invalid escape sequence that newer Python
# versions warn about.
print(str_pattern("1", r"\d")) #^ neither of these three

match() looks for the pattern only at the start of the string; otherwise nothing is found

findall() Find all the substrings where the RE matches, and return them as a list

finditer() Find all substrings where the RE matches and return them as an iterator

sub() Find all substrings where the RE matches and substitute them with the given string

To summarise, Zipf's law (discovered by the linguist-statistician George Zipf) states
that the frequency of a word is inversely proportional to the rank of the word, where rank 1 is given to the most frequent
word, 2 to the second most frequent and so on. This is also called the power law distribution.

#### Removing Stopwords

In [None]:
import nltk

In [None]:
# Fetch the stop-word corpus read by stopwords.words('english') in the cells below.
nltk.download('stopwords')

In [None]:
# Fetch the Punkt tokenizer models (presumably what word_tokenize / sent_tokenize
# rely on later in the notebook -- verify against the installed NLTK version).
nltk.download('punkt')

In [None]:
# Fetch the WordNet data used by WordNetLemmatizer in the lemmatization section.
nltk.download('wordnet')

In [None]:
# Import the corpus reader here, at its first use: no other cell in the
# notebook brings `stopwords` into scope, so a fresh kernel would otherwise
# stop with a NameError.
from nltk.corpus import stopwords

# Display the English stop-word list.
stopwords.words('english')

In [None]:
# Sample sentence used to demonstrate stop-word removal.
text = "Need to create a universal summarization of text which IS very innovative project for us"

In [None]:
# Normalise case, then split on whitespace to get a list of word tokens.
words = text.lower().split()

In [None]:
# Show the lower-cased tokens before any stop words are removed.
print(words)

In [None]:
# Build the stop-word lookup once: calling stopwords.words() inside the
# comprehension rebuilds the list for every token, and a set gives constant-time
# membership tests.
english_stopwords = set(stopwords.words('english'))
sample_text = [word for word in words if word not in english_stopwords]
print(sample_text)

#### Tokenization

This technique is called tokenisation - a technique that’s used to split the text into smaller elements. 
These elements can be characters, words, sentences, or even paragraphs depending on the application you’re working on.

In [None]:
#word tokenization
document = "At nine o'clock I visited him myself. It looks like religious mania, and he'll soon think that he himself is God."
print(document)

In [None]:
# Split the sample document into word-level tokens and show them.
from nltk.tokenize import word_tokenize
word = word_tokenize(document)
print(word)

In [None]:
# Sentence tokenization: split the same document into a list of sentences.
from nltk.tokenize import sent_tokenize
sentence = sent_tokenize(document)
print(sentence)

In [None]:
# Tweet tokenization: a tokenizer aimed at social-media text (emoticons, emoji, hashtags).
from nltk.tokenize import TweetTokenizer

message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"
tweet_words = TweetTokenizer()
print(tweet_words.tokenize(message))

In [None]:
# Regex tokenization: keep only the substrings that match a custom pattern.
message = "i recently watched this show called mindhunters:). i totally loved it 😍. it was gr8 <3. #bingewatching #nothingtodo 😎"
from nltk.tokenize import regexp_tokenize
# Raw string so \w is passed to the regex engine as written (a plain "\w" is an
# invalid escape sequence). The pattern matches "#" followed by word characters,
# i.e. hashtags.
pattern = r"#[\w]+"
regexp_tokenize(message, pattern)

#### Bag-of-words model

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Three short documents that form the toy corpus for the bag-of-words example.
documents = ["Gangs of Wasseypur is a great movie.", "The success of a movie depends on the performance of the actors.", "There are no new movies releasing this week."]
print(documents)

In [None]:
def preprocessing(document):
    """Lower-case a document, tokenize it and drop English stop words.

    Args:
        document: Raw text of a single document.

    Returns:
        The remaining tokens joined back together with single spaces.
    """
    # Build the stop-word set once per call; the list used to be rebuilt for
    # every token inside the comprehension.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(document.lower())
    kept = [token for token in tokens if token not in english_stopwords]
    return ' '.join(kept)

In [None]:
# Clean every document in the corpus; the result is a list of strings that
# later cells read under the name `document`.
document = list(map(preprocessing, documents))

In [None]:
# Show the cleaned documents that will be vectorised next.
print(document)

In [None]:
# Learn the vocabulary from the cleaned documents and count each term per document.
counter_var = CountVectorizer()
bow_model = counter_var.fit_transform(document)
# Convert the document-term matrix to a dense array for printing.
print(bow_model.toarray())

#### Stemming and Lemmatization

In [None]:
#stemming  -- rule bases
from nltk.stem.porter import PorterStemmer

In [None]:
# Sample sentence for the stemming and lemmatization examples (rebinds `text`).
text = "Very orderly and methodical he looked, with a hand on each knee, and a loud watch ticking a sonorous sermon under his flapped newly bought waist-coat, as though it pitted its gravity and longevity against the levity and evanescence of the brisk fire."
print(text)

In [None]:
# Lower-case the sentence and split it into word tokens.
texts = word_tokenize(text.lower())

In [None]:
# Reduce every token to its Porter stem.
stemmer = PorterStemmer()
stemmer_word = [stemmer.stem(token) for token in texts]

In [None]:
# Show the stemmed tokens.
print(stemmer_word)

In [None]:
# Lemmatization -- not rule based. If POS tagging is unreliable, fall back to
# stemming; otherwise pass the POS tag of each word to the lemmatizer for
# better results.
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()
# No POS tag is passed here, so every token is lemmatized with the lemmatizer's default.
lemma_words = [lemma.lemmatize(token) for token in texts]

In [None]:
# Show the lemmatized tokens for comparison with the stems printed earlier.
print(lemma_words)

#### TF-IDF Model (TF is term freq and IDF is inverse doc freq)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 

In [None]:
from nltk.stem.porter import PorterStemmer
def docpreprcess(document):
    """Lower-case, tokenize, remove English stop words and Porter-stem a document.

    Args:
        document: Raw text of a single document.

    Returns:
        The stemmed, stop-word-free tokens joined with single spaces.
    """
    # Build the stop-word set once per call; the list used to be rebuilt for
    # every token inside the comprehension.
    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    tokens = word_tokenize(document.lower())
    # Filter on the unstemmed token first, then stem only what survives.
    stems = [stemmer.stem(token) for token in tokens if token not in english_stopwords]
    return " ".join(stems)

In [None]:
# Three short reviews that form the toy corpus for the TF-IDF example (rebinds `documents`).
documents = ["Vapour, Bangalore has a really great terrace seating and an awesome view of the Bangalore skyline",
             "The beer at Vapour, Bangalore was amazing. My favorites are the wheat beer and the ale beer.",
             "Vapour, Bangalore has the best view in Bangalore."]
print(documents)

In [None]:
# Clean and stem every review; later cells read the resulting list as `document`.
document = list(map(docpreprcess, documents))

In [None]:
# Show the stemmed reviews that will be fed to the TF-IDF vectorizer.
print(document)

In [None]:
# Fit a TF-IDF vectorizer with default settings on the preprocessed reviews
# and transform them in the same step.
tfidf = TfidfVectorizer()
tfidfmodel = tfidf.fit_transform(document)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# One row per document, one column per vocabulary term.
pd.DataFrame(tfidfmodel.toarray(), columns=feature_names)

#### Canonicalisation

In [None]:
#Soundex Algorithm

In [None]:
# Edit distance algorithm
from nltk.metrics.distance import edit_distance

# Distance between the two spellings, with the transpositions option switched off.
edit_distance("advise", "advice", transpositions=False)

### Syntactical Analysis

In [None]:
import nltk
# A bare nltk.download() opens the interactive downloader and stalls
# "Restart & Run All"; request only the Treebank sample that the tagging cells
# read through nltk.corpus.treebank.
nltk.download('treebank')

Different levels of syntactical analysis:
~Part-of-speech tagging
    ~Lexicon Tagger(unigram)
    ~Rule based Tagger
    ~Stochastic tagger
    ~Deep learning tagger
~Constituency parsing
~Dependency parsing

In [None]:
# Lexicon and Rule based tagger
import nltk
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

In [None]:
# reading the Treebank tagged sentences
# Materialise the tagged Treebank sentences as a list; the comprehension below
# treats each sentence as a sequence of tagged-word tuples.
wsj = list(nltk.corpus.treebank.tagged_sents())

In [None]:
# Flatten the list of sentences into a single list of tagged-word tuples.
tagged_word = [tup for sent in wsj for tup in sent]

In [None]:
# Total number of tagged tokens in the corpus.
print(len(tagged_word))

In [None]:
# train_test_split draws from NumPy's random state, not Python's `random`
# module, so random.seed() alone does not make the split repeatable.
# Passing random_state pins the shuffle.
random.seed(123)
train, test = train_test_split(wsj, test_size=0.3, random_state=123)

In [None]:
# Number of sentences in the training split.
print(len(train))

In [None]:
# Number of sentences in the held-out test split.
print(len(test))

In [None]:
# Rule based tagger
# Ordered (regex, tag) rules, adapted from the NLTK book. Order matters: the
# catch-all '.*' rule must stay last so it only applies when nothing else matched.
# NOTE(review): 'NN$' looks like a Brown-corpus tag, while the evaluation data
# here is the Penn Treebank sample -- confirm whether 'POS' was intended.
patterns = [
    (r'.*ing$', 'VBG'),                # gerund
    (r'.*ed$', 'VBD'),                 # past tense
    (r'.*es$', 'VBZ'),                 # 3rd singular present
    (r'.*ould$', 'MD'),                # modals
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # The decimal point is escaped: a bare "." matches any character, which
    # tagged tokens such as "3x14" as cardinal numbers.
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                      # nouns (default)
]

In [None]:
# Build a tagger driven purely by the regex rules defined above.
regexp_tagger = nltk.RegexpTagger(patterns)

In [None]:
# Tagger evaluate() is deprecated in recent NLTK releases in favour of
# accuracy(); use the new name when it exists and fall back on older installs.
if hasattr(regexp_tagger, "accuracy"):
    regexp_accuracy = regexp_tagger.accuracy(test)
else:
    regexp_accuracy = regexp_tagger.evaluate(test)

# Fraction of test tokens tagged correctly by the regex rules alone.
regexp_accuracy

In [None]:
# rule based tagger
rule_based_tagger = nltk.RegexpTagger(patterns)
# lexicon backed up by the rule-based tagger: the regex rules are the backoff
# for the unigram tagger trained on `train`
lexicon_tagger = nltk.UnigramTagger(train, backoff=rule_based_tagger)

# Tagger evaluate() is deprecated in recent NLTK releases in favour of
# accuracy(); use the new name when it exists and fall back on older installs.
if hasattr(lexicon_tagger, "accuracy"):
    lexicon_accuracy = lexicon_tagger.accuracy(test)
else:
    lexicon_accuracy = lexicon_tagger.evaluate(test)
lexicon_accuracy

~Stochastic models such as the Hidden Markov Model (HMM)
The two main problems in building an HMM for POS tagging are the learning problem (learning the probabilities) and the explanation problem (solved using the Viterbi algorithm).

~Deep-learning, RNN-based models

Parsing 
~Constituency Parsing 

Context-Free Grammars (CFG)

In [None]:
#Top Down Parsing ( recursive descent parser)-- We should avoid the problem of left-recursion VP -> VP NP| V

In [None]:
#Specification of CFG
import nltk 

# Toy context-free grammar: the first block holds the phrase-structure rules
# (S, NP, VP, PP), the second block the lexicon (Det, N, V, P).
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
VP -> V | V NP | V NP PP
PP -> P NP

Det -> 'a' | 'an' | 'the'
N -> 'man' | 'park' | 'dog' | 'telescope'
V -> 'saw' | 'walked'
P -> 'in' | 'with'
""")

In [None]:
# Sentence to parse with the grammar above.
# NOTE(review): this name shadows the built-in `str` type for the rest of the
# session. Later cells read it as `str.split()`, so a rename has to be applied
# to all of those cells together.
str = "the man saw a dog in the park with a telescope"

In [None]:
from nltk.parse import RecursiveDescentParser

In [None]:
#Using a top-down parser
rdstr = RecursiveDescentParser(grammar)

In [None]:
# Parse the whitespace-split sentence and print every tree the parser yields.
for tree in rdstr.parse(str.split()):
    print(tree)

In [None]:
# Launch NLTK's recursive-descent parser demo application.
# NOTE(review): presumably an interactive GUI that blocks until closed --
# confirm, and consider skipping this cell in automated runs.
nltk.app.rdparser()

In [None]:
# Bottom-up parsing with a shift-reduce parser.
# The sentence and grammar are re-declared with the same values as in the
# top-down cells, so this cell can be run on its own.
str = "the man saw a dog in the park with a telescope"
from nltk.parse import ShiftReduceParser
grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
VP -> V | V NP | V NP PP
PP -> P NP

Det -> 'a' | 'an' | 'the'
N -> 'man' | 'park' | 'dog' | 'telescope'
V -> 'saw' | 'walked'
P -> 'in' | 'with'
""")
shift = ShiftReduceParser(grammar)

In [None]:
# Parse the whitespace-split sentence bottom-up and print each tree that is yielded.
for t in shift.parse(str.split()):
    print(t)

In [None]:
# Launch NLTK's shift-reduce parser demo application.
# NOTE(review): presumably an interactive GUI that blocks until closed --
# confirm, and consider skipping this cell in automated runs.
nltk.app.srparser()

Two more topics learned: 1. PCFG and 2. Chomsky Normal Form (CNF), plus a little about dependency parsing (it is used for free-word-order languages).

#### Information Extraction

To summarise, a generic IE pipeline is as follows:

Preprocessing

Sentence Tokenization: sequence segmentation of text.

Word Tokenization: breaks down sentences into tokens

POS tagging - assigning POS tags to the tokens. The POS tags can be helpful in defining what words could form an entity.

Entity Recognition

Rule-based models

Probabilistic models

In entity recognition, every token is tagged with an IOB label and then nearby tokens are combined together basis their labels.

Relation Recognition is the task of identifying relationships between the named entities. Using entity recognition, we can identify places (pl), organisations (o), persons (p). Relation recognition will find the relation between (pl,o), such that o is located in pl. Or between (o,p), such that p is working in o, etc.
Record Linkage refers to the task of linking two or more records that belong to the same entity. For example, Bangalore and Bengaluru refer to the same entity.

Conditional Random Fields (CRFs) should be used for NER or POS tagging; they are a better approach than the ones above.

### Semantic Processing

We use the Lesk algorithm for unsupervised word sense disambiguation. It uses WordNet to find the sense of an ambiguous word such as "duck" or "bank".

Word2vec

In [38]:
import nltk   
import numpy as np
from gensim.models import Word2Vec 
from gensim.models import word2vec #need to use '1.19.2' version of numpy
import matplotlib.pyplot as plt
import os

In [10]:
# Five-sentence toy corpus used to train a first, tiny Word2Vec model.
TextCorpus  = ["I like Upgrad",
               "Upgrad has a good ML program",
               "Upgrad has good faculty",
               "Rahim is that good faculty",
               "I like ML"
]

In [13]:
# Split each sentence on whitespace into a list of tokens; case is left unchanged.
text_tokens = [sent.split() for sent in TextCorpus]
# Preview the first two tokenized sentences.
print(text_tokens[:2])

[['I', 'like', 'Upgrad'], ['Upgrad', 'has', 'a', 'good', 'ML', 'program']]


In [17]:
# Train on the toy corpus with min_count lowered to 1, so words that occur
# only once still get a vector.
model = Word2Vec(text_tokens, min_count=1)

In [21]:
# Look up the learned embedding vector for the token "ML".
model.wv['ML']

array([-8.2426788e-03,  9.2993546e-03, -1.9766092e-04, -1.9672776e-03,
        4.6036290e-03, -4.0953159e-03,  2.7431131e-03,  6.9399667e-03,
        6.0654259e-03, -7.5107957e-03,  9.3823504e-03,  4.6718074e-03,
        3.9661191e-03, -6.2435055e-03,  8.4599778e-03, -2.1501661e-03,
        8.8251876e-03, -5.3620026e-03, -8.1294207e-03,  6.8245577e-03,
        1.6711927e-03, -2.1985101e-03,  9.5135998e-03,  9.4938539e-03,
       -9.7740479e-03,  2.5052286e-03,  6.1566923e-03,  3.8724565e-03,
        2.0227861e-03,  4.3050051e-04,  6.7363022e-04, -3.8206363e-03,
       -7.1402504e-03, -2.0888734e-03,  3.9238976e-03,  8.8186832e-03,
        9.2591504e-03, -5.9759379e-03, -9.4026709e-03,  9.7643761e-03,
        3.4297847e-03,  5.1661157e-03,  6.2823440e-03, -2.8042626e-03,
        7.3227026e-03,  2.8302716e-03,  2.8710032e-03, -2.3803711e-03,
       -3.1282497e-03, -2.3701428e-03,  4.2764354e-03,  7.6057913e-05,
       -9.5842788e-03, -9.6655441e-03, -6.1481954e-03, -1.2856961e-04,
      

In [22]:
# Dimensionality of the embedding (100 in the recorded output).
len(model.wv['ML'])

100

In [23]:
# Five nearest neighbours of "ML"; the recorded scores are low, as expected
# for a corpus of only five sentences.
model.wv.most_similar("ML",topn=5)

[('like', 0.13149002194404602),
 ('is', 0.07497299462556839),
 ('Upgrad', 0.0679759532213211),
 ('I', 0.04157735034823418),
 ('a', 0.04130808264017105)]

In [43]:
# Location of the text8 corpus file. Set the TEXT8_PATH environment variable
# to point elsewhere instead of editing the notebook; the default is the
# original local path.
text8_path = os.environ.get('TEXT8_PATH', 'C:/Users/pawasharma/Desktop/Python/NLP/text8')
sentences = word2vec.Text8Corpus(text8_path)

In [44]:
# Inspect the corpus iterator built in the previous cell (`sentences`), not the
# `sentence` list produced by sent_tokenize earlier in the notebook.
sentences

<gensim.models.word2vec.Text8Corpus at 0x1cc66be3788>

In [45]:
# Train on the text8 corpus with default settings; this rebinds `model`,
# replacing the toy-corpus model trained earlier.
model = Word2Vec(sentences)

In [46]:
# Nearest neighbours of "happiness" in the text8-trained embedding space.
model.wv.most_similar("happiness")

[('humanity', 0.7819305658340454),
 ('pleasure', 0.7682881355285645),
 ('perfection', 0.7613503336906433),
 ('compassion', 0.7477684617042542),
 ('goodness', 0.7453301548957825),
 ('dignity', 0.7431254386901855),
 ('fear', 0.7417725324630737),
 ('mankind', 0.7215771079063416),
 ('righteousness', 0.7204358577728271),
 ('desires', 0.7135218381881714)]