<a href="https://colab.research.google.com/github/michaellaic/nlp2/blob/main/Copy_of_Text_Processing_and_Feature_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.utils.extmath import randomized_svd

import re


nltk.download('punkt')
nltk.download('wordnet')
!python -m nltk.downloader stopwords


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Text Processing

## Create Corpus

In [None]:
import requests
from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Neuro-linguistic_programming'

response = requests.get(url)
soup_text = ''
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text from paragraph tags
    paragraphs = soup.find_all('p')
    for para in paragraphs:
        soup_text += para.get_text()
        #print(para.get_text())
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

corpus = soup_text

In [None]:
corpus = corpus.replace('\n', ' ')
corpus = corpus.lower()

pattern = r'\[\d+\]'
corpus = re.sub(pattern, '', corpus)

## Tokenization

In [None]:
tokens = nltk.word_tokenize(corpus)
stopwords = nltk.corpus.stopwords.words('english')
filtered_tokens = [token.lower() for token in tokens if token.lower() not in stopwords and token.isalpha()]
filtered_tokens

['programming',
 'nlp',
 'pseudoscientific',
 'approach',
 'communication',
 'personal',
 'development',
 'psychotherapy',
 'first',
 'appeared',
 'richard',
 'bandler',
 'john',
 'grinder',
 'book',
 'structure',
 'magic',
 'nlp',
 'asserts',
 'connection',
 'neurological',
 'processes',
 'language',
 'acquired',
 'behavioral',
 'patterns',
 'changed',
 'achieve',
 'specific',
 'goals',
 'life',
 'according',
 'bandler',
 'grinder',
 'nlp',
 'treat',
 'problems',
 'phobias',
 'depression',
 'tic',
 'disorders',
 'psychosomatic',
 'illnesses',
 'allergy',
 'common',
 'cold',
 'learning',
 'disorders',
 'often',
 'single',
 'session',
 'also',
 'say',
 'nlp',
 'model',
 'skills',
 'exceptional',
 'people',
 'allowing',
 'anyone',
 'acquire',
 'b',
 'nlp',
 'adopted',
 'hypnotherapists',
 'well',
 'companies',
 'run',
 'seminars',
 'marketed',
 'leadership',
 'training',
 'businesses',
 'government',
 'agencies',
 'scientific',
 'evidence',
 'supporting',
 'claims',
 'made',
 'nlp',
 'ad

## Normalization

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
lemmatized_tokens

['programming',
 'nlp',
 'pseudoscientific',
 'approach',
 'communication',
 'personal',
 'development',
 'psychotherapy',
 'first',
 'appeared',
 'richard',
 'bandler',
 'john',
 'grinder',
 'book',
 'structure',
 'magic',
 'nlp',
 'asserts',
 'connection',
 'neurological',
 'process',
 'language',
 'acquired',
 'behavioral',
 'pattern',
 'changed',
 'achieve',
 'specific',
 'goal',
 'life',
 'according',
 'bandler',
 'grinder',
 'nlp',
 'treat',
 'problem',
 'phobia',
 'depression',
 'tic',
 'disorder',
 'psychosomatic',
 'illness',
 'allergy',
 'common',
 'cold',
 'learning',
 'disorder',
 'often',
 'single',
 'session',
 'also',
 'say',
 'nlp',
 'model',
 'skill',
 'exceptional',
 'people',
 'allowing',
 'anyone',
 'acquire',
 'b',
 'nlp',
 'adopted',
 'hypnotherapists',
 'well',
 'company',
 'run',
 'seminar',
 'marketed',
 'leadership',
 'training',
 'business',
 'government',
 'agency',
 'scientific',
 'evidence',
 'supporting',
 'claim',
 'made',
 'nlp',
 'advocate',
 'called',

# Feature Extraction

## BOW

In [None]:
vectorizer = CountVectorizer(stop_words='english')
bow_matrix = vectorizer.fit_transform([corpus])
df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df

Unnamed: 0,00,000,10,150,1970s,1973,1975,1978,1979,1980,...,works,workshop,world,writes,writing,written,wrote,year,years,yields
0,1,5,2,1,1,1,2,1,1,2,...,3,2,3,5,2,1,1,1,2,1


## Prepare the corpus

In [None]:
pattern = r'[.,]'
corpus_segments = re.split(pattern, corpus)
corpus_segments = [segment.strip() for segment in corpus_segments]

In [None]:
data = []

for i in nltk.sent_tokenize(corpus):
    temp = []

    # tokenize the sentence into words
    for j in nltk.word_tokenize(i):
        temp.append(j.lower())

    data.append(temp)
data

[['neuro-linguistic',
  'programming',
  '(',
  'nlp',
  ')',
  'is',
  'a',
  'pseudoscientific',
  'approach',
  'to',
  'communication',
  ',',
  'personal',
  'development',
  'and',
  'psychotherapy',
  ',',
  'that',
  'first',
  'appeared',
  'in',
  'richard',
  'bandler',
  'and',
  'john',
  'grinder',
  "'s",
  '1975',
  'book',
  'the',
  'structure',
  'of',
  'magic',
  'i.',
  'nlp',
  'asserts',
  'that',
  'there',
  'is',
  'a',
  'connection',
  'between',
  'neurological',
  'processes',
  ',',
  'language',
  'and',
  'acquired',
  'behavioral',
  'patterns',
  ',',
  'and',
  'that',
  'these',
  'can',
  'be',
  'changed',
  'to',
  'achieve',
  'specific',
  'goals',
  'in',
  'life',
  '.'],
 ['according',
  'to',
  'bandler',
  'and',
  'grinder',
  ',',
  'nlp',
  'can',
  'treat',
  'problems',
  'such',
  'as',
  'phobias',
  ',',
  'depression',
  ',',
  'tic',
  'disorders',
  ',',
  'psychosomatic',
  'illnesses',
  ',',
  'near-sightedness',
  ',',
  '[

## TF - IDF

In [None]:
tfidf = TfidfVectorizer(stop_words='english')
result = tfidf.fit_transform(corpus_segments)

In [None]:
for ele1, ele2 in zip(tfidf.get_feature_names_out() , tfidf.idf_):
    print(ele1, ':', ele2)

00 : 6.295814236329918
000 : 5.197201947661808
10 : 5.890349128221754
150 : 6.295814236329918
1970s : 6.295814236329918
1973 : 6.295814236329918
1975 : 5.890349128221754
1978 : 6.295814236329918
1979 : 6.295814236329918
1980 : 5.890349128221754
1980s : 6.295814236329918
1981 : 5.890349128221754
1990s : 6.295814236329918
1996 : 6.295814236329918
1997 : 5.890349128221754
200 : 6.295814236329918
2000 : 5.890349128221754
2009 : 6.295814236329918
2012 : 6.295814236329918
25 : 6.295814236329918
270 : 6.295814236329918
29 : 6.295814236329918
800 : 6.295814236329918
able : 5.602667055769973
abstracted : 6.295814236329918
academic : 5.890349128221754
accept : 6.295814236329918
accepting : 6.295814236329918
according : 4.504054767101863
account : 5.890349128221754
accounted : 6.295814236329918
achieve : 5.890349128221754
achieved : 5.890349128221754
achieving : 6.295814236329918
acknowledged : 5.890349128221754
acknowledging : 6.295814236329918
acquire : 6.295814236329918
acquired : 6.2958142363

## Word2Vec

In [None]:
# Import necessary libraries
import nltk
#nltk.download('brown')
from gensim.models import Word2Vec
from nltk.corpus import brown

# Define corpus
corpus_Word2Vec = [nltk.word_tokenize(sentence.lower()) for sentence in corpus_segments]
# Train the Word2Vec model
model = Word2Vec(
    sentences=corpus_Word2Vec,      # The corpus to train the model on
    vector_size=100,       # The size of the word vectors to be learned
    window=5,              # The size of the window of words to be considered
    min_count=5,           # The minimum frequency required for a word to be included in the vocabulary
    sg=0,                  # 0 for CBOW, 1 for skip-gram
    negative=5,            # The number of negative samples to use for negative sampling
    ns_exponent=0.75,      # The exponent used to shape the negative sampling distribution
    alpha=0.03,            # The initial learning rate
    min_alpha=0.0007,      # The minimum learning rate to which the learning rate will be linearly reduced
    epochs=30,             # The number of epochs (iterations) over the corpus
    workers=4,             # The number of worker threads to use for training the model
    seed=42,               # The seed for the random number generator
    max_vocab_size=None    # The maximum vocabulary size (None means no limit)
)


vector = model.wv['people']
similar_words = model.wv.most_similar('people')

print("Vector for 'people' :", vector)
print("Most similar words to 'people':" , similar_words)



Vector for 'people' : [ 0.10640279  0.03249435 -0.01005604  0.03142631 -0.07566714  0.02518857
 -0.04741451  0.07210536 -0.14970101  0.01185372 -0.24049033  0.16769168
  0.03406207  0.07235    -0.01241909 -0.06714013 -0.03837318 -0.26321366
 -0.12627652 -0.14316937 -0.07045165  0.11519354  0.16758814  0.16744709
  0.2292357  -0.07704052 -0.12368877 -0.00947078 -0.17789692 -0.03104203
 -0.12639113 -0.06135835 -0.00655372 -0.09305259 -0.00871388 -0.14353026
  0.151145   -0.27534023 -0.00113429  0.12998554  0.10613095 -0.04567363
  0.27419916 -0.16688547  0.04470463  0.01643938 -0.01240461  0.17220564
  0.28226513 -0.08200391  0.03176908 -0.22785439 -0.06171776 -0.0154609
  0.07836267 -0.04150826 -0.06011703  0.08165786 -0.02106999  0.17979898
  0.07739906  0.03865779 -0.13747647  0.075017    0.05929295  0.07425531
 -0.01674764 -0.1641821  -0.11889978 -0.02299377 -0.05291957 -0.0527339
  0.02173797 -0.08643277  0.02449337 -0.15815638  0.08866736 -0.13041236
  0.01065101 -0.06757936  0.009

## GloVe

GloVe (Global Vectors for Word Representation) is an unsupervised learning algorithm for obtaining vector representations for words. It is a type of word embedding that maps words into a high-dimensional space where the distance between words reflects their semantic similarity.

In [None]:
# download glove and unzip it in Notebook.
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2024-07-10 10:03:03--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-07-10 10:03:03--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-07-10 10:03:04--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
def load_glove_model(glove_file):
    print("Loading Glove Model")
    model = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            model[word] = embedding
    print("Done. {} words loaded!".format(len(model)))
    return model

glove_model = load_glove_model('glove.6B.100d.txt')


Loading Glove Model
Done. 400000 words loaded!


In [None]:
# code for Glove word embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

x = corpus.split()

# create the dict.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

# number of unique words in dict.
print("Number of unique words in dictionary=",
	len(x))
print("Dictionary is = ", x)


# vocab: 'the': 1, mapping of words with
# integers in seq. 1,2,3..
# embedding: 1->dense vector
def embedding_for_vocab(filepath, word_index,
						embedding_dim):
	vocab_size = len(word_index) + 1

	embedding_matrix_vocab = np.zeros((vocab_size,
									embedding_dim))

	with open(filepath, encoding="utf8") as f:
		for line in f:
			word, *vector = line.split()
			if word in word_index:
				idx = word_index[word]
				embedding_matrix_vocab[idx] = np.array(
					vector, dtype=np.float32)[:embedding_dim]

	return embedding_matrix_vocab


# matrix for vocab: word_index
embedding_dim = 50
embedding_matrix_vocab = embedding_for_vocab(
	'/content/glove.6B.50d.txt', tokenizer.word_index,
embedding_dim)

print("Dense vector for first word is => ",
	embedding_matrix_vocab[1])

Number of unique words in dictionary= 3566
Dictionary is =  ['neuro-linguistic', 'programming', '(nlp)', 'is', 'a', 'pseudoscientific', 'approach', 'to', 'communication,', 'personal', 'development', 'and', 'psychotherapy,', 'that', 'first', 'appeared', 'in', 'richard', 'bandler', 'and', 'john', "grinder's", '1975', 'book', 'the', 'structure', 'of', 'magic', 'i.', 'nlp', 'asserts', 'that', 'there', 'is', 'a', 'connection', 'between', 'neurological', 'processes,', 'language', 'and', 'acquired', 'behavioral', 'patterns,', 'and', 'that', 'these', 'can', 'be', 'changed', 'to', 'achieve', 'specific', 'goals', 'in', 'life.', 'according', 'to', 'bandler', 'and', 'grinder,', 'nlp', 'can', 'treat', 'problems', 'such', 'as', 'phobias,', 'depression,', 'tic', 'disorders,', 'psychosomatic', 'illnesses,', 'near-sightedness,[a]', 'allergy,', 'the', 'common', 'cold,[a]', 'and', 'learning', 'disorders,', 'often', 'in', 'a', 'single', 'session.', 'they', 'also', 'say', 'that', 'nlp', 'can', 'model', 'th

In [None]:
def cyk_parse(sentence, grammar):
    #s = len(sentence)
    #table = [[set() for _ in range(n+1)] for _ in range(n+1)]

    for s in sentence:
        # Step 1: Tokenization
        tokens = s.split()
        n = len(tokens)
        table = [[set() for _ in range(n+1)] for _ in range(n+1)]

        # Step 2: Initialization
        for i in range(1, n+1):
            for rule in grammar:
                if rule[1] == tokens[i-1]:
                    table[i][i].add(rule[0])

        # Step 3: Rule Application
        for length in range(2, n+1):
            for i in range(1, n-length+2):
                j = i + length - 1
                for k in range(i, j):
                    for rule in grammar:
                        if len(rule) == 3:
                            for left in table[i][k]:
                                for right in table[k+1][j]:
                                    if rule[1] in left and rule[2] in right:
                                        table[i][j].add(rule[0])

        # Step 4: Backtracking
        if 'S' in table[1][n]:
          print("Input sentence: ", s)
          print("Parse table: ")
          for row in table:
            print(row)
        else:
          print("Input sentence: ", s)
          print("Sentence not parsed.")


# Example usage:

# Define the context-free grammar in CNF
grammar = [
    ('S', 'NP', 'VP'),
    ('S', 'NP', 'NP'),
    ('S', 'S', 'S'),
    ('NP', 'Det', 'Noun'),
    ('NP', 'Det', 'Verb'),
    ('VP', 'Verb', 'NP'),
    ('Det', 'this'),
    ('Det', 'a'),
    ('Det', 'that'),
    ('Det', 'so'),
    ('Det', 'is'),
    ('Det', 'the'),
    ('Noun', 'dog'),
    ('Noun', 'cat'),
    ('Noun', 'pizza'),
    ('Noun', 'good'),
    ('Verb', 'chased'),
    ('Verb', 'tast'),
    ('Verb', 'ate')
]

# Input sentence to be parsed
sentences = [
    "this dog is good",
    "the dog ate the pizza",
    "the dog chased a cat",
    "the pizza tast so good",
    "the cat chased the dog that ate the pizza"
  ]

# Call the CYK parser
cyk_parse(sentences, grammar)

# Print the parse table and whether the sentence was parsed or not


Input sentence:  this dog is good
Parse table: 
[set(), set(), set(), set(), set()]
[set(), {'Det'}, {'NP'}, set(), {'S'}]
[set(), set(), {'Noun'}, set(), set()]
[set(), set(), set(), {'Det'}, {'NP'}]
[set(), set(), set(), set(), {'Noun'}]
Input sentence:  the dog ate the pizza
Parse table: 
[set(), set(), set(), set(), set(), set()]
[set(), {'Det'}, {'NP'}, set(), set(), {'S'}]
[set(), set(), {'Noun'}, set(), set(), set()]
[set(), set(), set(), {'Verb'}, set(), {'VP'}]
[set(), set(), set(), set(), {'Det'}, {'NP'}]
[set(), set(), set(), set(), set(), {'Noun'}]
Input sentence:  the dog chased a cat
Parse table: 
[set(), set(), set(), set(), set(), set()]
[set(), {'Det'}, {'NP'}, set(), set(), {'S'}]
[set(), set(), {'Noun'}, set(), set(), set()]
[set(), set(), set(), {'Verb'}, set(), {'VP'}]
[set(), set(), set(), set(), {'Det'}, {'NP'}]
[set(), set(), set(), set(), set(), {'Noun'}]
Input sentence:  the pizza tast so good
Parse table: 
[set(), set(), set(), set(), set(), set()]
[set(), {'