<a href="https://colab.research.google.com/github/nabuulek/Natural-language-processing/blob/main/NLP_Basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part One

This first session will cover nltk library to handle pronunciation, stemmatization, lemmatization, part of speech tagging, amd any natural language tasks.

In [1]:
import nltk #calling upon the nltk library
#download packages for tokenization, lemmatization, pronounciation, and POS tagging
nltk.download('punkt') #for tokenization
nltk.download('wordnet') #for lemmatization
nltk.download('cmudict') #for pronounciation
nltk.download('averaged_perceptron_tagger') #for POS tagging

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import cmudict

In [3]:
text = "When I grow older, I will be stronger"

In [4]:
#initialize an nltk lemmatizer
lemmatizer = WordNetLemmatizer()

In [5]:
#mophology analysis
def morphology_analysis(text):
  tokens=word_tokenize(text)
  lemmas=[lemmatizer.lemmatize(token) for token in tokens]
  return tokens, lemmas

In [6]:
tokens, lemmas = morphology_analysis(text)
print(f"Tokens:{tokens}")
print(f"Lemmas: {lemmas}")

Tokens:['When', 'I', 'grow', 'older', ',', 'I', 'will', 'be', 'stronger']
Lemmas: ['When', 'I', 'grow', 'older', ',', 'I', 'will', 'be', 'stronger']


In [7]:
#pronunciation with the cmu dictionary
pronouncing_dict = cmudict.dict()
#start by defining a function get_pronounciation
def get_pronounciation(word):
  try:
    return pronouncing_dict[word.lower()]#returns the pronounciations after converting the word to small letters
  except KeyError:
    return['No pronounciation found']

#pronounciation modelling
word = "Hello"
print(f"Pronounciation of '{word}' is:{get_pronounciation(word)}")

Pronounciation of 'Hello' is:[['HH', 'AH0', 'L', 'OW1'], ['HH', 'EH0', 'L', 'OW1']]


In [8]:
#part of speech tagging using nltk
def pos_tagging(text):
  tokens = word_tokenize(text)
  pos_tags = pos_tag(tokens)
  return pos_tags

In [9]:
text = "The students of Refactory in the AI class seemed to be bored."
pos_tags = pos_tagging(text)
print(f"POS Tags: {pos_tags}")

POS Tags: [('The', 'DT'), ('students', 'NNS'), ('of', 'IN'), ('Refactory', 'NNP'), ('in', 'IN'), ('the', 'DT'), ('AI', 'NNP'), ('class', 'NN'), ('seemed', 'VBD'), ('to', 'TO'), ('be', 'VB'), ('bored', 'VBN'), ('.', '.')]


In [13]:
#lets downlaod the latest version of spacy
!pip install spacy



In [15]:
#downloading from the terminal from CLI
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [16]:
import spacy
from spacy import displacy #for visuals in spacy

In [18]:
#syntactic parsing using spacy
nlp = spacy.load("en_core_web_sm")
#start by defining a fuction
def syntactic_parsing(text):
  doc = nlp(text) #converting text into one sentence that can be handled in nlp
  #the syntactic parse tree
  displacy.render(doc, style ='dep', jupyter=True, options ={'distance':90})
  return doc

In [19]:
doc = syntactic_parsing(text)