# PART A:  Linguistic analysis using spaCy 

In the first part of the assignment, we focus on an analysis of the sentences in the training data. 

In [1]:
import spacy
import pandas as pd
import string
from collections import Counter

nlp = spacy.load("en_core_web_sm")
punctuations = string.punctuation

In [2]:
df_file = open("../datas/sentences.txt", encoding='utf-8')
data = df_file.read()
# print(data)

In [3]:
doc = nlp(data) # nlp object is used to create documents with linguistic annotations
# print(doc)

### 1. Tokenization

In [4]:
def tokenization(text):
    
    token_list = []
    
    for token in text:
        token_list.append(token.text)
        
    return token_list

In [5]:
 # tokenization(doc)
"""
['children',
 'are',
 'thought',
 'to',
 'be',
 'aged',
 'three',...]
"""

"\n['children',\n'are',\n'thought',\n'to',\n'be',\n'aged',\n'three',...]\n"

In [6]:
def num_of_tokens(text):
    
    total_token = 0
    for token in text:
        total_token += 1
    
    return total_token

In [7]:
num_of_tokens(doc) # 16130 tokens

16130

In [8]:
def num_of_types(text):
    
    """
    Type is different from the number of actual occurrences which would be known as tokens.
    """
    
    token_list = set(tokenization(text))
    return len(token_list)

In [9]:
num_of_types(doc) # 3746 types

3746

In [10]:
def num_of_words(text):
    
    """
    A word is a speech sound or a combination of sounds, or its representation in writing, 
    that symbolizes and communicates a meaning and may consist of a single morpheme or a combination of morphemes.
    """
    
    total_words = 0
    words = []
    
    for token in text:
        if token.text not in punctuations and token.text != '\n':
            total_words += 1
            words.append(token.text)
    
    return total_words

In [11]:
def list_of_words(text):
    
    """
    A word is a speech sound or a combination of sounds, or its representation in writing, 
    that symbolizes and communicates a meaning and may consist of a single morpheme or a combination of morphemes.
    """
    
    total_words = 0
    words = []
    
    for token in text:
        if token.text not in punctuations and token.text != '\n':
            total_words += 1
            words.append(token.text)
    
    return words

In [12]:
# num_of_words(doc)# 13265 words
# list_of_words(doc)

In [13]:
def average_num_of_words_per_sentence(text):
    
    sentence_count = 0
    
    for sent in text.sents:
        sentence_count += 1


    avg_num_words = (num_of_words(text))  / sentence_count
    print(num_of_words(text),sentence_count)
    
    return round(avg_num_words,2) # 23.63 number of words per sentence

In [14]:
average_num_of_words_per_sentence(doc)

13265 718


18.47

In [15]:
def average_word_length(text):

    words_list = list_of_words(text)
    avg_num_words = 0
    
    avg_num_words = sum(len(word) for word in words_list) / len(words_list)            
    
    return round(avg_num_words,2) # 4.9 average word length

In [16]:
average_word_length(doc)

4.9

### 2. Word Classes 

Run the default part-of-speech tagger on the dataset and identify the ten most frequent `POS tags`. Complete the table for these ten tags.

In [17]:
print(f"TOKEN\tPOS\tTAG".format('Token','Lemma','Pos'))
print("-"*50)

for token in doc:
    #print(token.text, token.pos_, token.tag_)
    

    """

TOKEN    POS    TAG
--------------------------------------------------
children NOUN NNS
are AUX VBP
thought VERB VBN
to PART TO
be AUX VB
aged VERB VBN
    """

TOKEN	POS	TAG
--------------------------------------------------


In [18]:
tag_frequencies = Counter()

for sentence in doc.sents:
    tags = []
    for token in sentence: 
        #if not token.is_punct:
        tags.append(token.tag_)
    tag_frequencies.update(tags)
    
print(tag_frequencies)

Counter({'NN': 2074, 'NNP': 2063, 'IN': 1745, 'DT': 1378, 'JJ': 868, 'NNS': 774, ',': 699, 'VBD': 660, '.': 655, '_SP': 653, 'VBN': 500, 'RB': 451, 'CD': 357, 'CC': 347, 'PRP': 338, 'VB': 328, 'VBZ': 301, 'VBG': 296, "''": 258, 'VBP': 202, 'TO': 182, 'PRP$': 145, 'POS': 111, 'HYPH': 105, 'MD': 93, 'WDT': 74, ':': 63, '-LRB-': 57, '-RRB-': 57, 'WRB': 46, 'RP': 40, 'WP': 38, 'NNPS': 35, 'JJS': 27, 'JJR': 23, 'RBS': 18, 'RBR': 18, 'EX': 14, 'NFP': 8, 'UH': 7, 'XX': 6, '$': 5, 'SYM': 4, 'PDT': 3, 'FW': 2, '``': 2})


In [19]:
token_frequencies = Counter()

for sentence in doc.sents:
    tokens = []
    token_list = ['NN','NNP','IN','DT','JJ','NNS', ',','VBD','.','_SP','VBN','RB','CD']
    for token in sentence: 
        if token.tag_ in token_list:
            tokens.append((token.tag_,token.text))
    token_frequencies.update(tokens)
    
#print(token_frequencies)



# Counter({('DT', 'the'): 723, (',', ','): 697, ('_SP', '\n'): 653, ('.', '.'): 638, ('IN', 'of'): 352, ('IN', 'in'): 281,
#('DT', 'a'): 279, ('NNP', '\\'): 211, ('IN', 'to'): 153, ('DT', 'The'): 124, ('VBD', 'was'): 111, ('IN', 'on'): 110, 
# ('IN', 'for'): 102, ('IN', 'with'): 85, ('IN', 'at'): 75, ('IN', 'as'): 69, ('IN', 'from'): 65, ('VBD', 'were'): 62})

In [20]:
taglist_frequencies = Counter()

for sentence in doc.sents:
    tag_list = []
    for token in sentence:
        tag_list.append(token.tag_)
    taglist_frequencies.update(tag_list)

#print(tag_list)
#print(taglist_frequencies)


counts = [(token.tag_, tag_list.count(token.tag_) / len(taglist_frequencies)) for token.tag_ in set(token_list)] 
print(counts)

[('IN', 0.021739130434782608), ('JJ', 0.021739130434782608), ('DT', 0.021739130434782608), ('NNP', 0.0), ('NNS', 0.0), ('VBN', 0.0), ('NN', 0.021739130434782608), ('CD', 0.0), ('VBD', 0.0), ('_SP', 0.043478260869565216), ('RB', 0.0), (',', 0.0), ('.', 0.021739130434782608)]


In [21]:
# TODO : find the 3 most frequent ones

In [22]:
spacy.explain('NN')

'noun, singular or mass'

### 3. N-Grams

N-grams are a combination of N tokens that usually co-occur. For example, the word new occurs in a lot of contexts, but the word york frequently occurs with new. So we combine the two and get `new york` to give better information. Combining 2 tokens(unigrams) gives us a **bigram**. Higher order **n-grams** are formed using 2 (n-1)-grams. 2 bigrams give a **trigram**, 2 trigrams form a quadgram and so on.

Calculate the distribution of n-grams and provide the 3 most frequent.
 - `Token bigrams`
 - `Token trigrams`
 - `POS bigrams`
 - `POS trigrams`

In [23]:
#for chunk in doc.noun_chunks:
#    print(chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text)

In [24]:
def n_grams(text,tokens,n):
    tokens = [token.text for token in text]
    
    return [tokens[i:i+n] for i in range(len(tokens)-n+1)]

In [25]:
token_bigrams = n_grams(doc,tokens,2)
token_trigrams = n_grams(doc,tokens,4)
#print(token_bigrams) # ['were', 'hospitalised'], ['military', 'presence']
#print(token_trigrams) # ['children', 'are', 'thought', 'to']

In [26]:
def most_frequent(List):
    counter = 0
    item = List[0]
     
    for i in List:
        curr_frequency = List.count(i)
        if(curr_frequency > counter):
            counter = curr_frequency
            item = i
 
    return item

In [27]:
most_frequent(token_bigrams)

['.', '\n']

In [28]:
most_frequent(token_trigrams)

['.', '\\', '"', '\n']

In [29]:
# TODO : find the POS bigrams and POS trigrams

### 4. Lemmatization  

Provide an example for a lemma that occurs in `more than two inflections` in the dataset.

In [30]:
def lemmatization(text):
    
    for token in text:
        if token.pos_ == 'VERB':
            print('{} -> {}'.format(token, token.lemma_))

In [31]:
#lemmatization(doc)

"""
thought -> think
aged -> age
mixed -> mix
plated -> plate
grow -> grow
feel -> feel
represented -> represent
suffering -> suffer
concerns -> concern
"""

'\nthought -> think\naged -> age\nmixed -> mix\nplated -> plate\ngrow -> grow\nfeel -> feel\nrepresented -> represent\nsuffering -> suffer\nconcerns -> concern\n'

# 5. Named Entity Recognition

Analyze the `named entities` in the *first five sentences*. Are they identified correctly? If not, explain your answer and propose a better decision

In [36]:
def ner(text):
    list_of_ent = []
    
    for ent in text.ents:
        list_of_ent.append(ent.label_)
        print(ent.text, ent.label_)
    print(len(list_of_ent)) # 1627 labels

In [39]:
#ner(doc)


# three , eight DATE
# ten years DATE
# eighteen-month-old DATE
# ROS GPE
# third ORDINAL
# three CARDINAL
# \ ORG

In [44]:
def ner_different(text):
    list_of_diff_ent = []
    
    for ent in text.ents:
        list_of_diff_ent.append(ent.label_)
    print(len(set(list_of_diff_ent))) 
    
# {'LAW', 'GPE', 'QUANTITY', 'MONEY', 'NORP', 'WORK_OF_ART', 'ORDINAL', 'TIME', 'CARDINAL', 'DATE', 
# 'ORG', 'PRODUCT', 'PERSON', 'EVENT', 'FAC', 'PERCENT', 'LOC'} 

# 17 different entity labels

In [45]:
ner_different(doc)

17


In [34]:
spacy.explain('ORG') # not correct for "\"

'Companies, agencies, institutions, etc.'

### Analyze the named entities in the first five sentences. Are they identified correctly?

In [66]:
def first_five_sentence(text):
    
    first_five_sentence = []
    for sentence in doc.sents:
        first_five_sentence.append(sentence)

    
    return(first_five_sentence[:5])

In [73]:
first_five_sentence_list = str(first_five_sentence(doc))
first_five_sentence_list

'[children are thought to be aged three , eight , and ten years , alongside an eighteen-month-old baby ., \nWe mixed different concentrations of ROS with the spores , plated them out on petridishes with an agar-solution where fungus can grow on ., \nThey feel they are under-represented in higher education and are suffering in a regional economic downturn ., \nEspecially as it concerns a third party building up its military presence near our borders ., \nPolice said three children were hospitalised for \\" severe dehydration \\" .]'

In [74]:
type(first_five_sentence_list)

str

In [81]:
ner_first_five = []
doc_5 = nlp(first_five_sentence_list)
for ent in doc_5.ents:
    ner_first_five.append((ent.text,ent.label_))
        
print(ner_first_five)
print(len(ner_first_five))

[('three , eight', 'DATE'), ('ten years', 'DATE'), ('eighteen-month-old', 'DATE'), ('ROS', 'GPE'), ('third', 'ORDINAL'), ('three', 'CARDINAL'), ('\\', 'ORG')]
7
