## **Install required libraries**


In [None]:
!pip install spacy
!pip install contractions
!pip install nltk

In [None]:
# Download the small English spaCy pipeline that spacy.load("en_core_web_sm")
# needs later in this notebook.
!python -m spacy download en_core_web_sm

## **Import required libraries**


In [None]:
import re
import spacy
import nltk
import contractions
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import ngrams
from nltk.corpus import stopwords



## **Case Folding**


In [None]:
# Sample sentence containing mixed-case acronyms (MPLS, IPv6) for the case-folding demo
txt = "Segment routing works either on top of a MPLS network or on an IPv6 network."
print(txt)

: 

In [None]:
# casefold() returns a copy of the text normalised for case-insensitive comparison
x = txt.casefold()
print(x)

# **Special Character Removal**


In [None]:

# Text containing punctuation and symbols to strip
input_str = "hello how are you $$*doing?"

# Pre-compiled pattern: any character that is not an ASCII letter, an ASCII digit or whitespace
special_chars = re.compile(r"[^a-zA-Z0-9\s]")

# Delete every match to remove the special characters
clear_str = special_chars.sub("", input_str)

print(clear_str)

hello how are you doing


In [None]:
# Load spaCy's small English pipeline (used below to tokenize the cleaned text)

nlp = spacy.load("en_core_web_sm")

#Input string
input_str = "hello how are you $$*doing?"

#Function to clean the string
def clean_text(text):
  """Remove special characters from ``text`` and re-join its spaCy tokens.

  Args:
    text: The string to clean.

  Returns:
    A string built from the letters, digits and whitespace of ``text``,
    tokenized with the ``nlp`` pipeline and with the token texts joined by
    single spaces.
  """
  # isalnum() rather than isalpha(): digits are not special characters, so
  # terms such as "IPv6" must survive the cleaning, exactly as they do in the
  # regex-based and NLTK-based variants of this demo.
  cleaned_text = ''.join(char for char in text if char.isalnum() or char.isspace())
  doc = nlp(cleaned_text)
  return ' '.join(token.text for token in doc)

# Get the final output
clean_str = clean_text(input_str)
print(clean_str)


hello how are you doing


In [None]:

# Tokenizer data used by nltk.word_tokenize below
nltk.download("punkt_tab")

# Text to clean
input_str = "hello how are you $$*doing?"

# Split the text into word and punctuation tokens
tokens = nltk.word_tokenize(input_str)
print(tokens)

# Keep only the purely alphanumeric tokens, then rebuild the sentence
clean_tokens = list(filter(str.isalnum, tokens))
clean_str = ' '.join(clean_tokens)
print(clean_str)


['hello', 'how', 'are', 'you', '$', '$', '*', 'doing', '?']
hello how are you doing


[nltk_data] Downloading package punkt_tab to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


# **Handling contractions**


In [None]:

# Sample text containing several English contractions
txt = "I can't go to walk it's raining. I haven't found what I'm looking for"

# contractions.fix() returns the text with each contraction written out in full
expanded_txt = contractions.fix(txt)

print(expanded_txt)

I cannot go to walk it is raining. I have not found what I am looking for


In [None]:

def expand_contractions(text):
  """Expand a small, fixed set of English contractions in ``text``.

  Matching is case-insensitive and limited to whole words, so "it's" is
  expanded but the "it's" inside "bit's" is left alone. When the matched
  contraction starts with a capital letter, the expansion does too
  ("Can't" -> "Cannot").

  Args:
    text: The string to process.

  Returns:
    ``text`` with every supported contraction replaced by its expansion.
  """
  contractions_pattern = {
      "can't": "cannot",
      "won't": "will not",
      "it's": "it is",
      "weren't": "were not",
      "I'm": "I am",
      "haven't": "have not"
  }

  def _with_case(expansion):
    """Build a replacement callback that keeps a leading capital letter."""
    def _replace(match):
      if match.group(0)[0].isupper():
        return expansion[0].upper() + expansion[1:]
      return expansion
    return _replace

  # The loop variable is deliberately not called `contractions`, which is the
  # name of the third-party module imported by this notebook.
  for contraction, expansion in contractions_pattern.items():
    # \b on both sides restricts the match to a whole word.
    pattern = r"(?i)\b" + re.escape(contraction) + r"\b"
    text = re.sub(pattern, _with_case(expansion), text)

  return text


txt = "I can't go to walk it's raining. I haven't found what I'm looking for"
expanded_text = expand_contractions(txt)
print(expanded_text)

I cannot go to walk it is raining. I have not found what I am looking for


# **TOKENIZATION**


In [None]:

# Two-sentence sample used for both tokenizers
txt = "BGP used for routing within an autonomous system is called Interior Border Gateway Protocol (iBGP). In contrast, the Internet application of the protocol is called Exterior Border Gateway Protocol (EBGP)."

# Word-level tokens: print the count, then the tokens themselves
words = word_tokenize(txt)
print(len(words), words, sep="\n")

# Sentence-level split: print the count, then the sentences
sent = sent_tokenize(txt)
print(len(sent), sent, sep="\n")



37
['BGP', 'used', 'for', 'routing', 'within', 'an', 'autonomous', 'system', 'is', 'called', 'Interior', 'Border', 'Gateway', 'Protocol', '(', 'iBGP', ')', '.', 'In', 'contrast', ',', 'the', 'Internet', 'application', 'of', 'the', 'protocol', 'is', 'called', 'Exterior', 'Border', 'Gateway', 'Protocol', '(', 'EBGP', ')', '.']
2
['BGP used for routing within an autonomous system is called Interior Border Gateway Protocol (iBGP).', 'In contrast, the Internet application of the protocol is called Exterior Border Gateway Protocol (EBGP).']


# **Stop words removal**


In [None]:
# Sample sentence used as input for the stop-word filtering demo
sentence = "This is a sample sentence, showing off the stop words filtration"

In [None]:
# Fetch the stop-word corpus
nltk.download('stopwords')

# Tokenize the sentence
words = word_tokenize(sentence)

# Build the stop-word set once: calling stopwords.words('english') inside the
# comprehension would re-evaluate it for every token and search a list each time.
english_stopwords = set(stopwords.words('english'))

# Filter stop words (tokens are lower-cased only for the membership check)
new_sentence = [word for word in words if word.lower() not in english_stopwords]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show the original sentence, then the tokens left after stop-word filtering
print(sentence, new_sentence, sep="\n")

This is a sample sentence, showing off the stop words filtration
['sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration']


# **N-Grams**


In [None]:

# Fetch the "punkt" tokenizer data.
# NOTE(review): an earlier cell in this notebook downloads "punkt_tab" before
# calling word_tokenize — confirm which of the two resources the installed
# NLTK version actually needs.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def generate_ngrams(text, n):
  """Return the word-level n-grams of ``text`` as a list of tuples.

  Args:
    text: The string to tokenize with ``word_tokenize``.
    n: Number of consecutive tokens in each n-gram.

  Returns:
    A list of ``n``-token tuples, in order of appearance.
  """
  return list(ngrams(word_tokenize(text), n))

txt = "SRv6 is replacing MPLS"

# Build the 1-, 2- and 3-gram lists from the same sentence
unigrams, bigrams, trigrams = (generate_ngrams(txt, size) for size in (1, 2, 3))

for grams in (unigrams, bigrams, trigrams):
  print(grams)

[('SRv6',), ('is',), ('replacing',), ('MPLS',)]
[('SRv6', 'is'), ('is', 'replacing'), ('replacing', 'MPLS')]
[('SRv6', 'is', 'replacing'), ('is', 'replacing', 'MPLS')]
