# PART A:  Linguistic analysis using spaCy 

In the first part of the assignment, we focus on an analysis of the sentences in the training data. 

In [1]:
import spacy
import pandas as pd
import string
from collections import Counter
nlp = spacy.load("en_core_web_sm")
punctuations = string.punctuation

In [2]:
df_file = open("../datas/sentences.txt", encoding='utf-8')
data = df_file.read()
# print(data)

In [3]:
doc = nlp(data) # nlp object is used to create documents with linguistic annotations

### 1. Tokenization

In [4]:
def tokenization(text):
    
    token_list = []
    
    for token in text:
        token_list.append(token.text)
        
    return token_list

In [5]:
 # tokenization(doc)
"""
['children',
 'are',
 'thought',
 'to',
 'be',
 'aged',
 'three',...]
"""

"\n['children',\n'are',\n'thought',\n'to',\n'be',\n'aged',\n'three',...]\n"

In [6]:
def num_of_tokens(text):
    
    total_token = 0
    for token in text:
        total_token += 1
    
    return total_token

In [7]:
num_of_tokens(doc) # 16130 tokens

16130

In [8]:
def num_of_types(text):
    
    """
    Type is different from the number of actual occurrences which would be known as tokens.
    """
    
    token_list = set(tokenization(text))
    return len(token_list)

In [9]:
num_of_types(doc) # 3746 types

3746

In [10]:
def num_of_words(text):
    
    """
    A word is a speech sound or a combination of sounds, or its representation in writing, 
    that symbolizes and communicates a meaning and may consist of a single morpheme or a combination of morphemes.
    """
    
    total_words = 0
    words = []
    
    for token in text:
        if token.text not in punctuations and token.text != '\n':
            total_words += 1
            words.append(token.text)
    
    return total_words, words

In [11]:
# num_of_words(doc)# 13265 words

In [12]:
def average_num_of_words_per_sentence(text):

    words_list = tokenization(text)

    terminals = set([".", "?", "!"])
    terminal_count = 0

    for word in words_list:
        if word in terminals:
            terminal_count += 1


    avg_num_words = (len(words_list) - terminal_count)  / float(terminal_count)
    
    return round(avg_num_words,2) # 23.63 number of words per sentence

In [13]:
average_num_of_words_per_sentence(doc)

23.63

In [14]:
def average_word_length(text):

    words_list = tokenization(text)
    avg_num_words = 0
    
    avg_num_words = sum(len(word) for word in words_list) / len(words_list)            
    
    return round(avg_num_words,2) # 4.21 average word length

In [15]:
average_word_length(doc)

4.21

### 2. Word Classes 

Run the default part-of-speech tagger on the dataset and identify the ten most frequent `POS tags`. Complete the table for these ten tags.

In [16]:
print(f"TOKEN\tPOS\tTAG".format('Token','Lemma','Pos'))
print("-"*50)

for token in doc:
    #print(token.text, token.pos_, token.tag_)
    

    """

TOKEN    POS    TAG
--------------------------------------------------
children NOUN NNS
are AUX VBP
thought VERB VBN
to PART TO
be AUX VB
aged VERB VBN
    """

TOKEN	POS	TAG
--------------------------------------------------


In [17]:
tag_frequencies = Counter()

for sentence in doc.sents:
    tags = []
    for token in sentence: 
        if not token.is_punct:
            tags.append(token.tag_)
    tag_frequencies.update(tags)
    
print(tag_frequencies)

Counter({'NN': 2008, 'NNP': 1851, 'IN': 1745, 'DT': 1378, 'JJ': 853, 'NNS': 774, 'VBD': 660, '_SP': 653, 'VBN': 499, 'RB': 451, 'CD': 357, 'CC': 346, 'PRP': 338, 'VB': 326, 'VBZ': 301, 'VBG': 295, 'VBP': 195, 'TO': 182, 'PRP$': 145, 'POS': 94, 'MD': 93, 'WDT': 74, 'WRB': 46, 'RP': 40, 'WP': 38, 'NNPS': 35, 'JJS': 27, 'JJR': 23, 'RBS': 18, 'RBR': 18, 'EX': 14, 'UH': 7, '$': 4, 'PDT': 3, 'XX': 2, 'FW': 2})


In [18]:
token_frequencies = Counter()

for sentence in doc.sents:
    tokens = []
    for token in sentence: 
        if not token.is_punct:
            tokens.append(token.text)
    token_frequencies.update(tokens)
    
# print(token_frequencies)

"""
Counter({'the': 723, '\n': 653, 'of': 352, 'to': 333, 'and': 286, 'in': 285, 'a': 279, 'The': 124,
'are': 50, 'an': 49, 'not': 49, 'be': 48, 'this': 47, 'been': 46, 'ants': 42, 'they': 37, 'which': 37, 
'their': 34, 'also': 34, 'had': 33, 'or': 31, 'who': 30, 'one': 30, 'about': 30, 'would': 30, 'year': 29,
'police': 27, 'other': 26, 'I': 26, 'but': 25, 'It': 25, 'our': 24, 'report': 24, 'US': 24, 'after': 24, 
'He': 24, 'time': 23, 'In': 23, 'We': 22, 'two': 22, 'more': 22, 'we': 22, 'before': 20, 'Russian': 20, 
'This': 20, 'can': 19, 'military': 19, 'troops': 19, 'Republican': 19, 'President': 19, 'do': 19, 'U.S.': 19, 
'government': 18, 'Minister': 18, 'Wikinews': 18, 'any': 18, 'presidential': 18, 'its': 17, 'least': 17, 
'many': 17, 'into': 16, 'some': 16, 'out': 15, 'people': 15, 'News': 15, 'last': 15, 'law': 15, 'over': 15, 
'car': 15, 'Wilson': 15, 'during': 15, 'former': 15, 'Obama': 15, 'where': 14, 'Police': 14, 'when': 14, 
'reported': 14, 'driver': 14, 'Pavlensky': 14, 'lorry': 14, 'case': 14, 'no': 14, 'dead': 14, 'very': 14, 
'if': 14, 'local': 14, 'Russia': 14})
"""

"\nCounter({'the': 723, '\n': 653, 'of': 352, 'to': 333, 'and': 286, 'in': 285, 'a': 279, 'The': 124,\n'are': 50, 'an': 49, 'not': 49, 'be': 48, 'this': 47, 'been': 46, 'ants': 42, 'they': 37, 'which': 37, \n'their': 34, 'also': 34, 'had': 33, 'or': 31, 'who': 30, 'one': 30, 'about': 30, 'would': 30, 'year': 29,\n'police': 27, 'other': 26, 'I': 26, 'but': 25, 'It': 25, 'our': 24, 'report': 24, 'US': 24, 'after': 24, \n'He': 24, 'time': 23, 'In': 23, 'We': 22, 'two': 22, 'more': 22, 'we': 22, 'before': 20, 'Russian': 20, \n'This': 20, 'can': 19, 'military': 19, 'troops': 19, 'Republican': 19, 'President': 19, 'do': 19, 'U.S.': 19, \n'government': 18, 'Minister': 18, 'Wikinews': 18, 'any': 18, 'presidential': 18, 'its': 17, 'least': 17, \n'many': 17, 'into': 16, 'some': 16, 'out': 15, 'people': 15, 'News': 15, 'last': 15, 'law': 15, 'over': 15, \n'car': 15, 'Wilson': 15, 'during': 15, 'former': 15, 'Obama': 15, 'where': 14, 'Police': 14, 'when': 14, \n'reported': 14, 'driver': 14, 'Pavle

In [19]:
# TODO : find the 3 most frequent ones

### 3. N-Grams

N-grams are a combination of N tokens that usually co-occur. For example, the word new occurs in a lot of contexts, but the word york frequently occurs with new. So we combine the two and get `new york` to give better information. Combining 2 tokens(unigrams) gives us a **bigram**. Higher order **n-grams** are formed using 2 (n-1)-grams. 2 bigrams give a **trigram**, 2 trigrams form a quadgram and so on.

Calculate the distribution of n-grams and provide the 3 most frequent.
 - `Token bigrams`
 - `Token trigrams`
 - `POS bigrams`
 - `POS trigrams`

In [20]:
#for chunk in doc.noun_chunks:
#    print(chunk.text, chunk.root.text, chunk.root.dep_,chunk.root.head.text)

In [21]:
def n_grams(text,tokens,n):
    tokens = [token.text for token in text]
    
    return [tokens[i:i+n] for i in range(len(tokens)-n+1)]

In [22]:
token_bigrams = n_grams(doc,tokens,2)
token_trigrams = n_grams(doc,tokens,4)
#print(token_bigrams)
print(token_trigrams) # ['children', 'are', 'thought', 'to']

[['children', 'are', 'thought', 'to'], ['are', 'thought', 'to', 'be'], ['thought', 'to', 'be', 'aged'], ['to', 'be', 'aged', 'three'], ['be', 'aged', 'three', ','], ['aged', 'three', ',', 'eight'], ['three', ',', 'eight', ','], [',', 'eight', ',', 'and'], ['eight', ',', 'and', 'ten'], [',', 'and', 'ten', 'years'], ['and', 'ten', 'years', ','], ['ten', 'years', ',', 'alongside'], ['years', ',', 'alongside', 'an'], [',', 'alongside', 'an', 'eighteen'], ['alongside', 'an', 'eighteen', '-'], ['an', 'eighteen', '-', 'month'], ['eighteen', '-', 'month', '-'], ['-', 'month', '-', 'old'], ['month', '-', 'old', 'baby'], ['-', 'old', 'baby', '.'], ['old', 'baby', '.', '\n'], ['baby', '.', '\n', 'We'], ['.', '\n', 'We', 'mixed'], ['\n', 'We', 'mixed', 'different'], ['We', 'mixed', 'different', 'concentrations'], ['mixed', 'different', 'concentrations', 'of'], ['different', 'concentrations', 'of', 'ROS'], ['concentrations', 'of', 'ROS', 'with'], ['of', 'ROS', 'with', 'the'], ['ROS', 'with', 'the',

In [23]:
def most_frequent(List):
    counter = 0
    item = List[0]
     
    for i in List:
        curr_frequency = List.count(i)
        if(curr_frequency > counter):
            counter = curr_frequency
            item = i
 
    return item

In [24]:
most_frequent(token_bigrams)

['.', '\n']

In [None]:
most_frequent(token_trigrams)

In [None]:
# TODO : find the POS bigrams and POS trigrams

### 4. Lemmatization  

Provide an example for a lemma that occurs in `more than two inflections` in the dataset.

In [None]:
def lemmatization(text):
    
    for token in text:
        if token.pos_ == 'VERB':
            print('{} -> {}'.format(token, token.lemma_))

In [None]:
#lemmatization(doc)

"""
thought -> think
aged -> age
mixed -> mix
plated -> plate
grow -> grow
feel -> feel
represented -> represent
suffering -> suffer
concerns -> concern
"""

# 5. Named Entity Recognition

Analyze the `named entities` in the *first five sentences*. Are they identified correctly? If not, explain your answer and propose a better decision

In [None]:
def ner(text):
    
    for ent in text.ents:
        print(ent.text, ent.label_)

In [None]:
#ner(doc)

"""
three , eight DATE
ten years DATE
eighteen-month-old DATE
ROS GPE
third ORDINAL
three CARDINAL
\ ORG
"""