# Natural Language Processing

## Part-of-Speech (POS) Tagging

In [None]:
pip install -U pip setuptools wheel

In [None]:
pip install -U spacy

In [None]:
pip install numpy==1.26.4, pandas==2.2.1, pydantic==2.7.4

In [None]:
import spacy
import pandas as pd

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the model package must already be
# installed in this environment.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Sample text for POS tagging, stored lower-cased and without punctuation
# (presumably the opening of Jane Austen's "Emma" -- confirm the source).
emma_ja = "emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own"
print(emma_ja)

In [None]:
# Run the loaded pipeline over the sample text; the resulting document is
# iterated token by token in the cells below.
spacy_doc = nlp(emma_ja)

In [None]:
data = []
for token in spacy_doc:
    data.append({
        "token": token.text,
        "pos_tag": token.pos_
    })

pos_df = pd.DataFrame(data)

In [None]:
pos_df.head(15)

In [None]:
pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [None]:
pos_df_counts.head(10)

In [None]:
pos_df_poscounts = pos_df_counts.groupby(['pos_tag'])['token'].count().sort_values(ascending=False)

In [None]:
pos_df_poscounts.head(10)

In [None]:
nouns = pos_df_counts[pos_df_counts.pos_tag == "NOUN"][:10]
nouns

In [None]:
adj = pos_df_counts[pos_df_counts.pos_tag == "ADJ"][:10]
adj

## Named Entity Recognition (NER)

In [None]:
import spacy
from spacy import displacy
from spacy import tokenizer
import re

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the model package must already be
# installed in this environment.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Sample text for NER, kept with its original casing and punctuation
# (a cleaned copy is derived from it further down for comparison).
google_text = "Google was founded on September 4, 1998, by American computer scientists Larry Page and Sergey Brin while they were PhD students at Stanford University in California. Together, they own about 14% of its publicly listed shares and control 56% of its stockholder voting power through super-voting stock. The company went public via an initial public offering (IPO) in 2004. In 2015, Google was reorganized as a wholly owned subsidiary of Alphabet Inc. Google is Alphabet's largest subsidiary and is a holding company for Alphabet's internet properties and interests. Sundar Pichai was appointed CEO of Google on October 24, 2015, replacing Larry Page, who became the CEO of Alphabet. On December 3, 2019, Pichai also became the CEO of Alphabet."
print(google_text)

In [None]:
# Run the pipeline over the unmodified text; its entities are read from
# spacy_doc.ents below.
spacy_doc = nlp(google_text)

In [None]:
for word in spacy_doc.ents:
    print(word.text, word.label_)

In [None]:
displacy.render(spacy_doc, style='ent', jupyter=True)

In [None]:
google_text_clean = re.sub(r'[^\w\s]', '', google_text).lower()
print(google_text_clean)

In [None]:
spacy_doc_clean = nlp(google_text_clean)

In [None]:
for word in spacy_doc_clean.ents:
    print(word.text, word.label_)

In [None]:
displacy.render(spacy_doc_clean, style='ent', jupyter=True)

## Real-Life Example

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
bbc_data = pd.read_csv('bbc_news.csv')

In [None]:
bbc_data.head()

In [None]:
bbc_data.info()

In [None]:
titles = pd.DataFrame(bbc_data['title'])

In [None]:
titles.head()

### Clean Data

In [None]:
#lowercase
titles['lowercase'] = titles['title'].str.lower()

In [None]:
#stop word removal
en_stopwords = stopwords.words('english')
titles['no_stopwords'] = titles['lowercase'].apply(lambda x: ' '.join([word for word in x.split() if word not in en_stopwords]))

In [None]:
#punctuation removal
titles['no_stopwords_no_punct'] = titles.apply(lambda x: re.sub(r'[^\w\s]', '', x['no_stopwords']), axis=1)

In [None]:
#tokenize
titles['tokens_raw'] = titles.apply(lambda x: word_tokenize(x['title']), axis=1)
titles['tokens_clean'] = titles.apply(lambda x: word_tokenize(x['no_stopwords_no_punct']), axis=1)

In [None]:
#lemmatizing
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmatized'] = titles['tokens_clean'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

In [None]:
titles.head()

In [None]:
#lists for our tokens
# Flatten the per-title token lists into one list each, in a single pass.
# sum(lists, []) builds a new, longer list for every title, which is quadratic
# in the total number of tokens; the result here is the same list.
tokens_raw_list = [tok for toks in titles['tokens_raw'] for tok in toks]
tokens_clean_list = [tok for toks in titles['tokens_clean_lemmatized'] for tok in toks]

### POS Tagging

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline; the model package must already be
# installed in this environment.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Join all raw title tokens with single spaces and process them as ONE
# document, so title boundaries are not visible to the pipeline.
# NOTE(review): spaCy caps the length of a single text (nlp.max_length);
# confirm the joined titles stay below that limit for this dataset.
spacy_doc = nlp(' '.join(tokens_raw_list))

In [None]:
# One record per token in the document: its text and its `pos_` tag.
data = [
    {"token": tok.text, "pos_tag": tok.pos_}
    for tok in spacy_doc
]

pos_df = pd.DataFrame(data)

In [None]:
pos_df_counts = pos_df.groupby(['token', 'pos_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)
pos_df_counts.head(10)

In [None]:
nouns = pos_df_counts[pos_df_counts.pos_tag == "NOUN"][:10]
nouns

In [None]:
verbs = pos_df_counts[pos_df_counts.pos_tag == "VERB"][:10]
verbs

In [None]:
adj = pos_df_counts[pos_df_counts.pos_tag == "ADJ"][:10]
adj

### Named Entity Recognition

In [None]:
data = []
for token in spacy_doc.ents:
    data.append({
        "token": token.text,
        "ner_tag": token.label_
    })

ner_df = pd.DataFrame(data)

In [None]:
ner_df.head()

In [None]:
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='counts').sort_values(by='counts', ascending=False)

In [None]:
ner_df_counts.head()

In [None]:
people = ner_df_counts[ner_df_counts.ner_tag == 'PERSON'][:10]
people