In [2]:
# ============================================
# STEP 1 — LOAD LIBRARIES & NLP MODEL
# ============================================

import spacy
import pandas as pd

In [3]:
# Load the spaCy English model once; `nlp` is the pipeline object that the
# later cells call on the raw text.
nlp = spacy.load(
    name='en_core_web_sm',
)

In [4]:
# ============================================
# STEP 2 — RAW TEXT INPUT
# (text taken from Jane Austen — already lowercased and with punctuation removed)
# ============================================

# The passage is stored as one lowercase string with no punctuation, so it
# reaches the NLP pipeline without capitalization or sentence-ending marks.
emma_ja =('emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of authority being now long passed away they had been living together as friend and friend very mutually attached and emma doing just what she liked highly esteeming miss taylors judgment but directed chiefly by her own')
# Echo the raw text so the reader can see exactly what is fed to the pipeline.
print(emma_ja)

emma woodhouse handsome clever and rich with a comfortable home and happy disposition seemed to unite some of the best blessings of existence and had lived nearly twentyone years in the world with very little to distress or vex her she was the youngest of the two daughters of a most affectionate indulgent father and had in consequence of her sisters marriage been mistress of his house from a very early period her mother had died too long ago for her to have more than an indistinct remembrance of her caresses and her place had been supplied by an excellent woman as governess who had fallen little short of a mother in affection sixteen years had miss taylor been in mr woodhouses family less as a governess than a friend very fond of both daughters but particularly of emma between them it was more the intimacy of sisters even before miss taylor had ceased to hold the nominal office of governess the mildness of her temper had hardly allowed her to impose any restraint and the shadow of auth

In [31]:
# ============================================
# STEP 3 — RUN THE NLP PIPELINE (TOKENIZATION + POS TAGGING)
# ============================================

# Calling the pipeline on the raw string returns the processed document.
spacy_doc = nlp(emma_ja)

# Preview the first 10 tokens as: text, lemma, coarse POS, fine-grained tag.
# Attribute names ending in "_" (lemma_, pos_, tag_) give the human-readable
# string; the same name without the trailing underscore gives the numeric ID
# that the library uses internally.
for tok in spacy_doc[:10]:
    fields = (tok.text, tok.lemma_, tok.pos_, tok.tag_)
    print(*fields)

emma emma PROPN NNP
woodhouse woodhouse PROPN NNP
handsome handsome ADJ JJ
clever clever ADJ JJ
and and CCONJ CC
rich rich ADJ JJ
with with ADP IN
a a DET DT
comfortable comfortable ADJ JJ
home home NOUN NN


In [32]:
# ============================================
# STEP 4 — BUILD A DATAFRAME OF TOKEN + POS TAG
# (Approach 2 — FASTEST + MOST READABLE)
# ============================================

# One record per token, in document order: the token text and its coarse POS tag.
rows = [{'token': token.text, 'pos_tag': token.pos_} for token in spacy_doc]

# Build the frame in a single call from the list of records. The columns are
# named explicitly so that 'token' and 'pos_tag' exist even when the document
# has no tokens; without `columns=` an empty `rows` list produces a frame with
# no columns, and any later cell selecting these columns raises a KeyError.
pos_df = pd.DataFrame(rows, columns=['token', 'pos_tag'])

# pos_df now looks like:
# token     pos_tag
# emma      PROPN
# woodhouse PROPN
# handsome  ADJ
# clever    ADJ
# and       CCONJ
# ...
pos_df.head(10)

Unnamed: 0,token,pos_tag
0,emma,PROPN
1,woodhouse,PROPN
2,handsome,ADJ
3,clever,ADJ
4,and,CCONJ
5,rich,ADJ
6,with,ADP
7,a,DET
8,comfortable,ADJ
9,home,NOUN


In [19]:
# ============================================
# STEP 5 — COUNT OCCURRENCES OF EACH (TOKEN, POS) PAIR
# (Approach 2 — value_counts is faster)
# ============================================

# Number of rows in pos_df for every distinct (token, pos_tag) combination;
# the result is indexed by a (token, pos_tag) MultiIndex.
pair_counts = pos_df.value_counts(['token', 'pos_tag'])

# Turn the MultiIndex back into ordinary columns and name the count column.
pair_counts_flat = pair_counts.reset_index(name='counts')

# Highest-frequency pairs first.
pos_df_counts = pair_counts_flat.sort_values('counts', ascending=False)

pos_df_counts.head(10)

# pos_df_counts now contains one row per distinct (token, pos_tag) pair:
# token      pos_tag     counts
# of         ADP         14
# her        PRON        9
# had        AUX         9
# ...

Unnamed: 0,token,pos_tag,counts
0,of,ADP,14
2,her,PRON,9
1,had,AUX,9
3,the,DET,8
4,and,CCONJ,8
5,a,DET,6
6,to,PART,5
7,in,ADP,4
8,been,AUX,4
9,very,ADV,4


In [20]:
# ============================================
# STEP 6 — COUNT HOW MANY UNIQUE WORDS BELONG TO EACH POS
# ============================================

# pos_df_counts has one row per distinct (token, pos_tag) pair, so counting
# its rows per pos_tag gives the number of distinct tokens seen with each POS.
pos_column = pos_df_counts['pos_tag']
distinct_tokens_per_pos = pos_column.value_counts()

# Largest POS classes first.
pos_df_poscounts = distinct_tokens_per_pos.sort_values(ascending=False)

pos_df_poscounts

# This gives output like:
# NOUN    35
# VERB    19
# ADV     18
# ADJ     18
# ...
# (tags with equal counts, e.g. ADV and ADJ, are not placed in any
#  particular order by this cell)


pos_tag
NOUN     35
VERB     19
ADV      18
ADJ      18
PRON      9
ADP       8
PROPN     6
DET       5
AUX       4
CCONJ     3
SCONJ     3
NUM       3
PART      1
Name: count, dtype: int64

In [21]:
# ============================================
# STEP 7 — FILTER TOP NOUNS (OR ANY POS)
# ============================================

# Boolean mask selecting the rows whose POS tag is NOUN; swap the tag string
# to inspect a different part of speech.
noun_mask = pos_df_counts['pos_tag'].eq('NOUN')

# pos_df_counts is already sorted by 'counts' descending (Step 5), so the
# first 10 matching rows are the 10 most frequent nouns.
top_nouns = pos_df_counts.loc[noun_mask].head(10)

top_nouns

# Result:
# friend      NOUN   3
# governess   NOUN   3
# mother      NOUN   2
# sisters     NOUN   2
# ...

Unnamed: 0,token,pos_tag,counts
11,friend,NOUN,3
10,governess,NOUN,3
23,mother,NOUN,2
19,sisters,NOUN,2
14,years,NOUN,2
15,emma,NOUN,2
17,daughters,NOUN,2
95,authority,NOUN,1
88,blessings,NOUN,1
87,caresses,NOUN,1
