# Session 3 | Introduction to `SpaCy`

In [43]:
import os
import spacy
nlp = spacy.load("en_core_web_md")

In [9]:
input_string = "My name is Ross and I have family in New York City."

In [11]:
doc = nlp(input_string)
print(doc)

My name is Ross and I have family in New York City.


## Tokenizing text using `SpaCy``

In [12]:
for token in doc:
    print(token.text)

My
name
is
Ross
and
I
have
family
in
New
York
City
.


In [14]:
for token in doc:
    print(token.i, token.text, token.tag_, token.pos_)


0 My PRP$ PRON
1 name NN NOUN
2 is VBZ AUX
3 Ross NNP PROPN
4 and CC CCONJ
5 I PRP PRON
6 have VBP VERB
7 family NN NOUN
8 in IN ADP
9 New NNP PROPN
10 York NNP PROPN
11 City NNP PROPN
12 . . PUNCT


In [15]:
for ent in doc.ents:
    print(ent.text, ent.label)

Ross 380
New York City 384


## Count distribution of linguistic features

In [49]:
# Define filepath
filename = os.path.join(
    "..",
    "..",
    "..",
    "cds-lang-data",
    "USEcorpus",
    "USEcorpus",
    "a1",
    "0100.a1.txt"
)

In [50]:
# Load text file
with open(filename, "r", encoding="utf-8") as file:
    text = file.read()

In [51]:
doc = nlp(text)

In [58]:
entities = []
for ent in doc.ents:
    entities.append(ent.text)

In [64]:
print(set(entities))

{'Brittish English', 'American', 'US', 'today', 'Eight years ago', 'English', 'four', 'the years', 'my days', 'two years'}


In [67]:
adjective_count = 0
for token in doc:
    if token.pos_ == "ADJ":
        adjective_count += 1

print(adjective_count)

57


In [73]:
rel_freq = (adjective_count/len(doc)) * 10000
round(rel_freq, 2)

794.98

### Using `pandas` with `spaCy`

In [75]:
import pandas as pd

In [76]:
doc = nlp(input_string)

In [84]:
# create new list & append doc items (tokens)
annotations = []
for token in doc:
    annotations.append((token.text, token.pos_))

annotations

[('My', 'PRON'),
 ('name', 'NOUN'),
 ('is', 'AUX'),
 ('Ross', 'PROPN'),
 ('and', 'CCONJ'),
 ('I', 'PRON'),
 ('have', 'VERB'),
 ('family', 'NOUN'),
 ('in', 'ADP'),
 ('New', 'PROPN'),
 ('York', 'PROPN'),
 ('City', 'PROPN'),
 ('.', 'PUNCT')]

In [81]:
# convert annotations list to pandas dataframe
data = pd.DataFrame(annotations, columns=["Tokens", "POS"])
data

Unnamed: 0,Tokens,POS
0,My,PRON
1,name,NOUN
2,is,AUX
3,Ross,PROPN
4,and,CCONJ
5,I,PRON
6,have,VERB
7,family,NOUN
8,in,ADP
9,New,PROPN


In [87]:
# save dataframe to output dir
output_dir = os.path.join(
    "..",
    "dataframes",
    "annotations.csv"
)
data.to_csv(output_dir)