# PART A:  Linguistic analysis using spaCy 

In the first part of the assignment, we focus on an analysis of the sentences in the training data. 

In [1]:
import spacy
import pandas as pd
import string
from collections import Counter
nlp = spacy.load("en_core_web_sm")
punctuations = string.punctuation

In [2]:
df_file = open("../datas/sentences.txt", encoding='utf-8')
data = df_file.read()
print(data)

children are thought to be aged three , eight , and ten years , alongside an eighteen-month-old baby .
We mixed different concentrations of ROS with the spores , plated them out on petridishes with an agar-solution where fungus can grow on .
They feel they are under-represented in higher education and are suffering in a regional economic downturn .
Especially as it concerns a third party building up its military presence near our borders .
Police said three children were hospitalised for \" severe dehydration \" .
Virginia Álvarez , who wrote the report , noted , \" instead of listening to their demands , instead of starting a dialogue , authorities are doing everything they can to impede people from protesting \" .
US troops and tanks in Poland in 2015 , as part of an earlier deployment under Operation Atlantic Resolve .
Furthermore , Everson argues the Fox News criteria are not \" objective , \" as Title 11 requires , because they fail to define the terms \" consistently \" and \" re

In [3]:
doc = nlp(data) # nlp object is used to create documents with linguistic annotations

### 1. Tokenization

In [4]:
def tokenization(text):
    
    token_list = []
    
    for token in text:
        token_list.append(token.text)
        
    return token_list

In [5]:
tokenization(doc)

['children',
 'are',
 'thought',
 'to',
 'be',
 'aged',
 'three',
 ',',
 'eight',
 ',',
 'and',
 'ten',
 'years',
 ',',
 'alongside',
 'an',
 'eighteen',
 '-',
 'month',
 '-',
 'old',
 'baby',
 '.',
 '\n',
 'We',
 'mixed',
 'different',
 'concentrations',
 'of',
 'ROS',
 'with',
 'the',
 'spores',
 ',',
 'plated',
 'them',
 'out',
 'on',
 'petridishes',
 'with',
 'an',
 'agar',
 '-',
 'solution',
 'where',
 'fungus',
 'can',
 'grow',
 'on',
 '.',
 '\n',
 'They',
 'feel',
 'they',
 'are',
 'under',
 '-',
 'represented',
 'in',
 'higher',
 'education',
 'and',
 'are',
 'suffering',
 'in',
 'a',
 'regional',
 'economic',
 'downturn',
 '.',
 '\n',
 'Especially',
 'as',
 'it',
 'concerns',
 'a',
 'third',
 'party',
 'building',
 'up',
 'its',
 'military',
 'presence',
 'near',
 'our',
 'borders',
 '.',
 '\n',
 'Police',
 'said',
 'three',
 'children',
 'were',
 'hospitalised',
 'for',
 '\\',
 '"',
 'severe',
 'dehydration',
 '\\',
 '"',
 '.',
 '\n',
 'Virginia',
 'Álvarez',
 ',',
 'who',
 '

In [6]:
def num_of_tokens(text):
    
    total_token = 0
    for token in text:
        total_token += 1
    
    return total_token

In [7]:
num_of_tokens(doc) # 16130 tokens

16130

In [8]:
def num_of_types(text):
    
    """
    Type is different from the number of actual occurrences which would be known as tokens.
    """
    
    token_list = set(tokenization(text))
    return len(token_list)

In [9]:
num_of_types(doc) # 3746 types

3746

In [10]:
def num_of_words(text):
    
    """
    A word is a speech sound or a combination of sounds, or its representation in writing, 
    that symbolizes and communicates a meaning and may consist of a single morpheme or a combination of morphemes.
    """
    
    total_words = 0
    words = []
    
    for token in text:
        if token.text not in punctuations and token.text != '\n':
            total_words += 1
            words.append(token.text)
    
    return total_words, words

In [11]:
num_of_words(doc)# 13265 words

(13265,
 ['children',
  'are',
  'thought',
  'to',
  'be',
  'aged',
  'three',
  'eight',
  'and',
  'ten',
  'years',
  'alongside',
  'an',
  'eighteen',
  'month',
  'old',
  'baby',
  'We',
  'mixed',
  'different',
  'concentrations',
  'of',
  'ROS',
  'with',
  'the',
  'spores',
  'plated',
  'them',
  'out',
  'on',
  'petridishes',
  'with',
  'an',
  'agar',
  'solution',
  'where',
  'fungus',
  'can',
  'grow',
  'on',
  'They',
  'feel',
  'they',
  'are',
  'under',
  'represented',
  'in',
  'higher',
  'education',
  'and',
  'are',
  'suffering',
  'in',
  'a',
  'regional',
  'economic',
  'downturn',
  'Especially',
  'as',
  'it',
  'concerns',
  'a',
  'third',
  'party',
  'building',
  'up',
  'its',
  'military',
  'presence',
  'near',
  'our',
  'borders',
  'Police',
  'said',
  'three',
  'children',
  'were',
  'hospitalised',
  'for',
  'severe',
  'dehydration',
  'Virginia',
  'Álvarez',
  'who',
  'wrote',
  'the',
  'report',
  'noted',
  'instead',

In [12]:
def average_num_of_words_per_sentence(text):

    words_list = tokenization(text)

    terminals = set([".", "?", "!"])
    terminal_count = 0

    for word in words_list:
        if word in terminals:
            terminal_count += 1


    avg_num_words = (len(words_list) - terminal_count)  / float(terminal_count)
    
    return round(avg_num_words,2) # 23.63 number of words per sentence

In [13]:
average_num_of_words_per_sentence(doc)

23.63

In [14]:
def average_word_length(text):

    words_list = tokenization(text)
    avg_num_words = 0
    
    avg_num_words = sum(len(word) for word in words_list) / len(words_list)            
    
    return round(avg_num_words,2) # 4.21 average word length

In [15]:
average_word_length(doc)

4.21

### 2. Word Classes 

Run the default part-of-speech tagger on the dataset and identify the ten most frequent `POS tags`. Complete the table for these ten tags.

In [21]:
print(f"TOKEN\tPOS\tTAG".format('Token','Lemma','Pos'))
print("-"*50)

for token in doc:
    print(token.text, token.pos_, token.tag_)

TOKEN	POS	TAG
--------------------------------------------------
children NOUN NNS
are AUX VBP
thought VERB VBN
to PART TO
be AUX VB
aged VERB VBN
three NUM CD
, PUNCT ,
eight NUM CD
, PUNCT ,
and CCONJ CC
ten NUM CD
years NOUN NNS
, PUNCT ,
alongside ADP IN
an DET DT
eighteen NUM CD
- PUNCT HYPH
month NOUN NN
- PUNCT HYPH
old ADJ JJ
baby NOUN NN
. PUNCT .

 SPACE _SP
We PRON PRP
mixed VERB VBD
different ADJ JJ
concentrations NOUN NNS
of ADP IN
ROS PROPN NNP
with ADP IN
the DET DT
spores NOUN NNS
, PUNCT ,
plated VERB VBD
them PRON PRP
out ADP RP
on ADP IN
petridishes NOUN NNS
with ADP IN
an DET DT
agar NOUN NN
- PUNCT HYPH
solution NOUN NN
where SCONJ WRB
fungus NOUN NN
can AUX MD
grow VERB VB
on ADP RP
. PUNCT .

 SPACE _SP
They PRON PRP
feel VERB VBP
they PRON PRP
are AUX VBP
under ADV RB
- PUNCT HYPH
represented VERB VBN
in ADP IN
higher ADJ JJR
education NOUN NN
and CCONJ CC
are AUX VBP
suffering VERB VBG
in ADP IN
a DET DT
regional ADJ JJ
economic ADJ JJ
downturn NOUN NN
. PUNCT 

In [22]:
tag_frequencies = Counter()

for sentence in doc.sents:
    tags = []
    for token in sentence: 
        if not token.is_punct:
            tags.append(token.tag_)
    tag_frequencies.update(tags)
    
print(tag_frequencies)

Counter({'NN': 2008, 'NNP': 1851, 'IN': 1745, 'DT': 1378, 'JJ': 853, 'NNS': 774, 'VBD': 660, '_SP': 653, 'VBN': 499, 'RB': 451, 'CD': 357, 'CC': 346, 'PRP': 338, 'VB': 326, 'VBZ': 301, 'VBG': 295, 'VBP': 195, 'TO': 182, 'PRP$': 145, 'POS': 94, 'MD': 93, 'WDT': 74, 'WRB': 46, 'RP': 40, 'WP': 38, 'NNPS': 35, 'JJS': 27, 'JJR': 23, 'RBS': 18, 'RBR': 18, 'EX': 14, 'UH': 7, '$': 4, 'PDT': 3, 'XX': 2, 'FW': 2})


In [23]:
token_frequencies = Counter()

for sentence in doc.sents:
    tokens = []
    for token in sentence: 
        if not token.is_punct:
            tokens.append(token.text)
    token_frequencies.update(tokens)
    
print(token_frequencies)

Counter({'the': 723, '\n': 653, 'of': 352, 'to': 333, 'and': 286, 'in': 285, 'a': 279, 'The': 124, 'on': 116, 'was': 111, "'s": 110, 'for': 102, 'with': 85, 'at': 84, 'is': 84, 'that': 79, 'as': 76, 'from': 65, 'has': 65, 'were': 62, 'it': 58, 'said': 56, 'he': 55, 'have': 55, 'his': 53, 'by': 52, 'are': 50, 'an': 49, 'not': 49, 'be': 48, 'this': 47, 'been': 46, 'ants': 42, 'they': 37, 'which': 37, 'their': 34, 'also': 34, 'had': 33, 'or': 31, 'who': 30, 'one': 30, 'about': 30, 'would': 30, 'year': 29, 'police': 27, 'other': 26, 'I': 26, 'but': 25, 'It': 25, 'our': 24, 'report': 24, 'US': 24, 'after': 24, 'He': 24, 'time': 23, 'In': 23, 'We': 22, 'two': 22, 'more': 22, 'we': 22, 'before': 20, 'Russian': 20, 'This': 20, 'can': 19, 'military': 19, 'troops': 19, 'Republican': 19, 'President': 19, 'do': 19, 'U.S.': 19, 'government': 18, 'Minister': 18, 'Wikinews': 18, 'any': 18, 'presidential': 18, 'its': 17, 'least': 17, 'many': 17, 'into': 16, 'some': 16, 'out': 15, 'people': 15, 'News':

In [None]:
# TODO : find the 3 most frequent ones

### 3. N-Grams

N-grams are a combination of N tokens that usually co-occur. For example, the word new occurs in a lot of contexts, but the word york frequently occurs with new. So we combine the two and get `new york` to give better information. Combining 2 tokens(unigrams) gives us a **bigram**. Higher order **n-grams** are formed using 2 (n-1)-grams. 2 bigrams give a **trigram**, 2 trigrams form a quadgram and so on.

Calculate the distribution of n-grams and provide the 3 most frequent.
 - `Token bigrams`
 - `Token trigrams`
 - `POS bigrams`
 - `POS trigrams`

In [None]:
#for chunk in doc.noun_chunks:
#    print(chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text)

In [None]:
def n_grams(text,tokens,n):
    tokens = [token.text for token in text]
    
    return [tokens[i:i+n] for i in range(len(tokens)-n+1)]

In [None]:
token_bigrams = n_grams(doc,tokens,2)
token_trigrams = n_grams(doc,tokens,4)
#print(token_bigrams)
print(token_trigrams) # ['children', 'are', 'thought', 'to']

In [None]:
def most_frequent(List):
    counter = 0
    item = List[0]
     
    for i in List:
        curr_frequency = List.count(i)
        if(curr_frequency > counter):
            counter = curr_frequency
            item = i
 
    return item

In [None]:
most_frequent(token_bigrams)

In [None]:
most_frequent(token_trigrams)

In [None]:
# TODO : find the POS bigrams and POS trigrams

### 4. Lemmatization  

Provide an example for a lemma that occurs in `more than two inflections` in the dataset.

In [27]:
def lemmatization(text):
    
    for token in text:
        if token.pos_ == 'VERB':
            print('{} -> {}'.format(token, token.lemma_))

In [28]:
lemmatization(doc)

thought -> think
aged -> age
mixed -> mix
plated -> plate
grow -> grow
feel -> feel
represented -> represent
suffering -> suffer
concerns -> concern
building -> build
said -> say
hospitalised -> hospitalise
wrote -> write
noted -> note
listening -> listen
starting -> start
doing -> do
impede -> impede
protesting -> protest
argues -> argue
requires -> require
fail -> fail
define -> define
recognized -> recognize
referring -> refer
challenged -> challenge
reported -> report
evacuated -> evacuate
landed -> land
left -> leave
came -> come
avoiding -> avoid
falling -> fall
accused -> accuse
moving -> move
left -> leave
flying -> fly
relates -> relate
signs -> sign
announced -> announce
signed -> sign
working -> work
called -> call
known -> know
filed -> file
obtained -> obtain
held -> hold
citing -> cite
want -> want
extended -> extend
threaten -> threaten
used -> use
described -> describe
deliver -> deliver
challenging -> challenge
issue -> issue
starts -> start
analysed -> analyse
shows -

# 5. Named Entity Recognition

Analyze the `named entities` in the *first five sentences*. Are they identified correctly? If not, explain your answer and propose a better decision

In [87]:
def ner(text):
    
    for ent in text.ents:
        print(ent.text, ent.label_)

In [88]:
ner(doc)

"""
three , eight DATE
ten years DATE
eighteen-month-old DATE
ROS GPE
third ORDINAL
three CARDINAL
\ ORG
"""

three , eight DATE
ten years DATE
eighteen-month-old DATE
ROS GPE
third ORDINAL
three CARDINAL
\ ORG
Virginia Álvarez PERSON
US GPE
Poland GPE
2015 DATE
Operation Atlantic Resolve ORG
Everson PERSON
Fox News ORG
\ ORG
Title 11 LAW
the High Court ORG
South Korea GPE
about 80 CARDINAL
Yeoncheon ORG
at least one CARDINAL
170 tonnes QUANTITY
One CARDINAL
34 CARDINAL
ten CARDINAL
Austria GPE
Serbia GPE
Oksana GPE
two CARDINAL
Ukraine GPE
mid-December DATE
France GPE
Thursday DATE
Austria GPE
Robben PERSON
Bayern German NORP
Bayern Munich ORG
yesterday DATE
Dutch NORP
Arjen Robben PERSON
one CARDINAL
June 2018 DATE
Anastasia Slonina PERSON
Moscow GPE
Teatr.doc ORG
Pavlensky PERSON
Oksana ORG
Bos PERSON
the University of Copenhagen ORG
Hungary GPE
72 hours TIME
Bianchi PERSON
Philippe PERSON
daily DATE
Next week DATE
January 24 DATE
the Supreme Court ORG
Article 50 LAW
Brexit PERSON
Parliament ORG
Maldives ORG
Malaysian NORP
Hishamuddin Hussein PERSON
three CARDINAL
239 CARDINAL
Bianchi PERSO