In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy
import re
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Load data

In [None]:
# Read the BBC news CSV from the working directory and preview its first rows.
bbc_data = pd.read_csv(filepath_or_buffer='bbc_news.csv')
bbc_data.head(n=5)

In [None]:
# Print a concise summary of the loaded frame (columns, non-null counts, dtypes)
# to spot missing values before cleaning.
bbc_data.info()

In [None]:
# Work on a one-column frame holding only the headline text; the derived
# cleaning columns are added to this frame in later cells.
titles = bbc_data['title'].to_frame()
titles.head(n=5)

# Clean Data

In [None]:
# lowercase
titles['lowercase'] = titles['title'].str.lower()

# stopwords removal
# NOTE: requires the NLTK 'stopwords' corpus to be available locally.
en_stopwords = stopwords.words('english')
# Membership tests against a set are O(1); the list is kept under its original
# name and a separate set is used only for the lookup.
en_stopwords_set = set(en_stopwords)
titles['no_stopwords'] = titles['lowercase'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in en_stopwords_set)
)

# punctuation removal
# Strip every character that is neither a word character nor whitespace, using
# the vectorised string accessor instead of a row-wise apply.
titles['no_stopwords_no_punch'] = titles['no_stopwords'].str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# tokenize
# Each call only needs one column, so map over that Series directly rather than
# applying row-wise over the whole frame (which builds a row object per title).
# tokens_raw comes from the untouched headline; tokens_clean from the
# lowercased, stopword- and punctuation-free text.
titles['tokens_raw'] = titles['title'].map(word_tokenize)
titles['tokens_clean'] = titles['no_stopwords_no_punch'].map(word_tokenize)

In [None]:
# lemmatizing
# Run every cleaned token through the WordNet lemmatizer (default settings),
# keeping one list of lemmas per title.
lemmatizer = WordNetLemmatizer()
titles['tokens_clean_lemmatized'] = titles['tokens_clean'].map(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

titles.head()

In [None]:
# create list for just our tokens
# Flatten the per-title token lists into single corpus-wide lists. A nested
# comprehension is linear in the number of tokens, whereas sum(lists, [])
# copies the growing accumulator once per title.
tokens_raw_list = [token for tokens in titles['tokens_raw'] for token in tokens]
tokens_clean_list = [token for tokens in titles['tokens_clean_lemmatized'] for token in tokens]

# Preview only the first tokens; printing the whole list floods the output.
print(tokens_clean_list[:50])

# POS Tagging
- Part-of-speech (POS) tagging is the process of assigning a part of speech (such as noun, verb, or adjective) to each word in a given text, based on the word's definition and its context.

In [None]:
# Load spaCy's small English pipeline and run it over the whole cleaned corpus,
# joined into one space-separated string.
# NOTE(review): spaCy rejects texts longer than the pipeline's max_length
# setting - confirm the joined corpus stays below it.
npl = spacy.load("en_core_web_sm")
spacy_doc = npl(" ".join(tokens_clean_list))

# Collect (text, POS tag) pairs first and build the frame once; concatenating a
# one-row frame per token re-copies the accumulated frame on every iteration.
pos_records = [(token.text, token.pos_) for token in spacy_doc]
pos_df = pd.DataFrame(pos_records, columns=['token', 'pos_tag'])

pos_df.head(10)

In [None]:
# Count how often each (token, POS tag) pair occurs, most frequent first.
pos_df_counts = (
    pos_df
    .groupby(['token', 'pos_tag'])
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
pos_df_counts.head(10)

In [None]:
# Most frequent tokens tagged as nouns.
nouns = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('NOUN')]
nouns.head(n=10)

In [None]:
# Most frequent tokens tagged as adjectives.
adjectives = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('ADJ')]
adjectives.head(n=10)

In [None]:
# Most frequent tokens tagged as verbs.
verbs = pos_df_counts.loc[pos_df_counts['pos_tag'].eq('VERB')]
verbs.head(n=10)

# NER - Named Entity Recognition
- Named Entity Recognition (NER) is a subtask of information extraction that locates named entities mentioned in unstructured text and classifies them into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, and percentages.

In [None]:
# Collect one record per entity span found by spaCy, then build the frame in a
# single step; concatenating a one-row frame per entity re-copies the
# accumulated frame on every iteration.
# The label check uses `not pd.isna(...)` rather than an identity comparison
# with False, which depends on the exact boolean type that is returned.
ner_records = [
    {'token': ent.text, 'ner_tag': ent.label_}
    for ent in spacy_doc.ents
    if not pd.isna(ent.label_)
]
ner_df = pd.DataFrame(ner_records, columns=['token', 'ner_tag'])

In [None]:
# Preview the first 10 extracted entities with their labels.
ner_df.head(10)

In [None]:
# get most common named entities
ner_df_counts = ner_df.groupby(['token', 'ner_tag']).size().reset_index(name='count').sort_values(by='count', ascending=False)
ner_df_counts.head(10)

In [None]:
# Most frequently mentioned entities labelled PERSON.
people = ner_df_counts.loc[ner_df_counts['ner_tag'].eq('PERSON')]
people.head(n=10)

In [None]:
# Render the recognised entities as highlighted markup. With jupyter=False,
# displacy returns the markup as a string rather than displaying it itself, so
# it is wrapped in HTML() and shown explicitly.
svg = displacy.render(docs=spacy_doc, style='ent', jupyter=False)
display(HTML(data=svg))