## **Delivery n°3 - Mathias Lommel**

### *1 - Libraries Importation*

In [187]:
# All the necessary imports
import re
import string
import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer

import sklearn
import sklearn.feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Importation of SpaCy
import spacy
nlp = spacy.load('en_core_web_sm')

from spacy import displacy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### *2 - Definition of the functions*

In [241]:
# I couldn't download the language library on my own computer, then, I have done this third delivery on Google Colab.
# As a consequence, I have computed 2 different ways to read the input document :
##     one using Colab (the document being on "My Drive")
##     another one, which can be used if the document is directly on the computer.

def read_document_with_Drive():
  # Read the document on Drive
  from google.colab import drive
  drive.mount('/content/drive')

  # Reading of the .txt document via Colab
  path = '/content/drive/My Drive/Romeo_and_Juliet.txt'

  # Creation of the corpus
  original_corpus=[]
  with open(path, 'r', errors = 'ignore') as f:
    # We add each line to the corpus
    for line in f:
      original_corpus.append(line.strip())

  print("Document has been read successfully.")
  print("This corpus contains", len(original_corpus), "documents.")

  return original_corpus

def read_document():
  # Read the document directly on the computer
  with open("Romeo_and_Juliet.txt", "r") as file:
      # We create the corpus
      original_corpus = file.readlines()

  print("Document has been read successfully.")
  print("This corpus contains", len(original_corpus), "documents.")
  return original_corpus

In [242]:
# Function that cleans the corpus
def preprocess_corpus(original_corpus):
  cleaned_corpus = []

  k=0
  # We clean each document of the corpus
  for document in original_corpus:
    # Change to lower case
    document = document.lower()

    # Remove URLs (http and https)
    document = re.sub("http?:\/\/.*[\r\n]*", "", document)
    document = re.sub("https?:\/\/.*[\r\n]*", "", document)

    # Remove emails
    document= re.sub(r'\b[A-Za-z0-9._-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b','',document)

    # Remove mentions
    document = re.sub("@\S+", "", document)

    # Remove punctuations, commas and special characters
    punctuation = string.punctuation
    translation_table = str.maketrans('', '', punctuation)

    document = document.translate(translation_table)

    # Remove numbers
    document = re.sub(r'\d+', '', document)

    # Remove the \n
    document = re.sub('\\n', '', document)

    if (not document.isspace()) and document != '':
        # If the tweet is still interesting
        cleaned_corpus.append(document)
    else:
        # If not, we delete it from the original corpus
        original_corpus.pop(k)

    k += 1

  print("Pre-processing succesfully computed.")
  print("Length of the initial data : ",len(original_corpus))
  print("Length of the cleaned data : ",len(cleaned_corpus))

  return (original_corpus, cleaned_corpus)

In [244]:
# Function that computes the tokens of the corpus
def get_tokens (corpus):
  tokens = []
  # For each document of the corpus
  for document in corpus:
    doc_tokens = nlp(document)

    for token in doc_tokens:
      # We add the token if it's not already done
      if not str(token) in str(tokens):
        tokens.append(token)

  print("The corpus contains",len(tokens),"different tokens.")
  return tokens

In [245]:
# Function that removes the stop words
def remove_stopWords (tokens,stop_words=''):
  if stop_words == '':
    # 2 options :
    ##  - give the stop_words that we want to remove
    ##  - remove the default stop words
    stop_words = set(stopwords.words('english'))

  tokens_sw = [token for token in tokens if str(token) not in stop_words]

  print("Before removing stop words :",len(tokens), "words")
  print("After removing stop words :",len(tokens_sw), "words")

  return tokens_sw

In [247]:
# Function that determines the proper nouns of the corpus
def get_properNames (tokens):
  proper_names = []
  # We browse each token, and see if it's a PROPN
  for token in tokens:
    if str(token.pos_) == "PROPN":
      proper_names.append(token)

  return proper_names

In [248]:
# Function that determines the nouns of the corpus
def get_Nouns (tokens):
  nouns = []
  # We browse each token, and see if it's a NOUN
  for token in tokens:
    if str(token.pos_) == "NOUN":
      nouns.append(token)

  print("The corpus contains",len(nouns),"nouns.")
  return nouns

In [249]:
# To get the Document Term Matrix
def get_dtm(corpus):
    vec = CountVectorizer()
    X = vec.fit_transform(corpus)
    # Creation of the matrix
    term_matrix = pd.DataFrame(X.toarray(), columns = vec.get_feature_names_out())

    return term_matrix

In [250]:
# To get the most repeated nouns of the corpus
def get_mostRepeated_noun(corpus, tokens):
  # We get the document term matrix
  term_matrix = get_dtm(corpus)

  # We get all the nouns of the corpus
  nouns = get_Nouns(tokens)

  # We compute the occurences of each word of the corpus, and we sort it
  words_occurences = term_matrix.sum(axis = 1)
  sorted = words_occurences.sort_values(ascending=False)
  words = term_matrix.columns

  # We initialize the list (result of the function)
  mostRepeated = []
  nb_occ = 0

  for i in sorted.index:
    # If the word is a noun
    if words[i] in str(nouns):
      # If the noun is one of the most repeated
      if sorted[i] >= nb_occ:
        # We add it to the list
        mostRepeated.append([words[i], sorted[i]])
        # We change nb_occ (useful for the first iteration)
        nb_occ = sorted[i]
      else:
        # If not, we have seen all most repeated nouns : we return the list
        return mostRepeated

In [254]:
# To get the frequencies of each root word of the corpus
# 1st VERSION : considers all the tokens (tokens can be repeated)
def root_words(corpus):
  # We initialize a dictionary to get the frequencies of each root word
  token_root = {}
  s_stemmer = SnowballStemmer(language='english')

  # We browse all documents
  for document in corpus:
    doc_tokens = nlp(document)
    # For each token of the document
    for token in doc_tokens:
      root_word = s_stemmer.stem(str(token))

      # We add 1 to the frequency of the token's root
      if str(root_word) in token_root:
        token_root[str(root_word)] += 1
      else:
        token_root[str(root_word)] = 1

  return token_root

In [255]:
# To get the n more used root words of the corpus (1st VERSION)
def more_frequent_roots(corpus, n):
  # We get the frequencies of each root word
  token_root = root_words(corpus)
  # We determine the n more sed words
  root_df = pd.DataFrame(token_root.values(), index = token_root.keys())
  root_freq = root_df.nlargest(15,root_df.columns)

  return root_freq


In [256]:
# To get the frequencies of each root word of the corpus
# 2nd VERSION : the tokens are not repeated
def root_words2(tokens):
  # We initialize a dictionary to get the frequencies of each root word
  token_root = {}
  s_stemmer = SnowballStemmer(language='english')

  # We browse all documents
  for token in tokens:
    root_word = s_stemmer.stem(str(token))

    # We add 1 to the frequency of the token's root
    if str(root_word) in token_root:
      token_root[str(root_word)] += 1
    else:
      token_root[str(root_word)] = 1

  return token_root

In [257]:
# To get the n more used root words of the corpus (2nd VERSION)
def more_frequent_roots2(tokens, n):
  # We get the frequencies of each root word
  token_root = root_words2(tokens)
  # We determine the n more sed words
  root_df = pd.DataFrame(token_root.values(), index = token_root.keys())
  root_freq = root_df.nlargest(15,root_df.columns)

  return root_freq

### *3 - Test of our functions*

In [258]:
# We create the corpus
print("*********************** Pre-processing *************************")
original_corpus = read_document_with_Drive()
# OR
#original_corpus = read_document()

# We pre-process the corpus
Data = preprocess_corpus(original_corpus)
original_corpus = Data[0]
cleaned_corpus = Data[1]
print("****************************************************************\n")


*********************** Pre-processing *************************
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Document has been read successfully.
This corpus contains 364 documents.
Pre-processing succesfully computed.
Length of the initial data :  352
Length of the cleaned data :  340
****************************************************************



In [259]:
# Find the tokens of the corpus
print("************************ Tokenization **************************")
tokens = get_tokens(cleaned_corpus)

# Remove the stop words : we remove the default stop words list
tokens_sw = remove_stopWords(tokens)
print("We can see that the number of tokens has changed.")
print("****************************************************************\n")

************************ Tokenization **************************
The corpus contains 584 different tokens.
Before removing stop words : 584 words
After removing stop words : 522 words
We can see that the number of tokens has changed.
****************************************************************



In [260]:
print("************************ Knowledge base ************************")
for token in tokens_sw:
  print(token.text, '\t', token.pos_, '\t', token.lemma_)
print("****************************************************************\n")

************************ Knowledge base ************************
gregory 	 NOUN 	 gregory
shows 	 VERB 	 show
thee 	 PRON 	 thee
weak 	 ADJ 	 weak
slave 	 NOUN 	 slave
weakest 	 ADJ 	 weak
goes 	 VERB 	 go
wall 	 NOUN 	 wall
sampson 	 NOUN 	 sampson
true 	 ADJ 	 true
therefore 	 ADV 	 therefore
women 	 NOUN 	 woman
weaker 	 ADJ 	 weak
vessels 	 NOUN 	 vessel
ever 	 ADV 	 ever
thrust 	 VERB 	 thrust
push 	 VERB 	 push
montagues 	 VERB 	 montague
maids 	 NOUN 	 maid
quarrel 	 NOUN 	 quarrel
masters 	 NOUN 	 master
tis 	 VERB 	 tis
one 	 NOUN 	 one
tyrant 	 NOUN 	 tyrant
fought 	 VERB 	 fight
cruel 	 ADJ 	 cruel
cut 	 VERB 	 cut
heads 	 NOUN 	 head
ay 	 VERB 	 ay
maidenheads 	 NOUN 	 maidenhead
take 	 VERB 	 take
sense 	 NOUN 	 sense
thou 	 NOUN 	 thou
wilt 	 NOUN 	 wilt
must 	 AUX 	 must
feel 	 VERB 	 feel
shall 	 AUX 	 shall
able 	 ADJ 	 able
stand 	 VERB 	 stand
known 	 VERB 	 know
pretty 	 ADJ 	 pretty
piece 	 NOUN 	 piece
esh 	 NOUN 	 esh
well 	 INTJ 	 well
art 	 NOUN 	 art
hadst 	 P

In [262]:
print("************************* Proper names *************************")
proper_names = get_properNames(tokens_sw)
print(proper_names,"\n")
print("We can see that we don't have all the proper names of the document. For example, <<Gregory>> and <<Sampson>> do not appear in this list.")
print("Moreover, in this list, some common nouns have been classified as proper nouns (as <<hell>> or <<sun>> for example).\n")

# Try to find Gregory and Sampson in the list
searched_words = ["gregory", "sampson"]
for token in tokens_sw:
  if str(token.text) in searched_words:
    print(token.text, token.pos_, token.dep_)

print("We can see that <<Gregory>> and <<Sampson>> have been classified as Nouns and not proper names.")
print("We can try to change them to see if any change is observed : we can try to write them with upper case letters.\n")

# We modify our cleaned_corpus
cleaned_corpus_2 = []
for document in cleaned_corpus:
  document_test = document.replace("gregory","GREGORY")
  document_test = document_test.replace("sampson","SAMPSON")
  cleaned_corpus_2.append(document_test)

# Find the tokens of the corpus
tokens_2 = get_tokens(cleaned_corpus_2)

# Remove the stop words : we remove the default stop words list
tokens_sw_2 = remove_stopWords(tokens_2)

proper_names_2 = get_properNames(tokens_sw_2)
print(proper_names_2,"\n")

print("By doing this change, we can see that <<Gregory>> is now classified as a proper name.\n")

searched_words = ["GREGORY", "SAMPSON"]
for token in tokens_sw_2:
  if str(token.text) in searched_words:
    print(token.text, token.pos_, token.dep_)

print("Nevertheless, <<Sampson>> is still classified as a noun...")
print("****************************************************************\n")


************************* Proper names *************************
[hadst, romeo, juliet, william, shakespeare, john, abraham, hell, villain, speak, nephew, madam, worshippd, sun, goodmorrow, cousin, misshapen, hast, shell, dians, armd] 

We can see that we don't have all the proper names of the document. For example, <<Gregory>> and <<Sampson>> do not appear in this list.
Moreover, in this list, some common nouns have been classified as proper nouns (as <<hell>> or <<sun>> for example).

gregory NOUN ROOT
sampson NOUN ROOT
We can see that <<Gregory>> and <<Sampson>> have been classified as Nouns and not proper names.
We can try to change them to see if any change is observed : we can try to write them with upper case letters.

The corpus contains 587 different tokens.
Before removing stop words : 587 words
After removing stop words : 523 words
[GREGORY, hadst, romeo, juliet, william, shakespeare, john, abraham, hell, villain, speak, nephew, madam, worshippd, sun, goodmorrow, cousin, mis

In [268]:
print("************************* Text Mapping *************************")
document = cleaned_corpus_2[12]
print("Document selected : ", document)

displacy.render(nlp(document), style='dep', jupyter=True, options={'distance': 110})
print("****************************************************************\n")

************************* Text Mapping *************************
Document selected :  have fought with the men i will be cruel with the


****************************************************************



In [270]:
print("**************************** Nouns *****************************")
# Get the number of nouns in the corpus
most_repeated = get_mostRepeated_noun(cleaned_corpus_2, tokens_sw_2)
print("The most repeated noun in the corpus is <<", most_repeated[0][0],">>. This noun is repeated", most_repeated[0][1],"times.\n")
print("This word is quite often used in this corpus. In fact, this corpus corresponds to the 1st scene of the 1st act of Romeo and Juliet.")
print("And, in this scene, <<chamber>> is mentioned : ")
print("- when the 2 servants speak about the importance of the chamber, and their desire to defend their masters' honor.")
print("       --> Here, the chamber corresponds to a symbol of social status")
print("- during the argument between the servants")
print("       --> Here, chamber is used as its first meaning : a part of the house.\n")

print("Hence, the chamber is mentioned as a symbol of social status, is associated to intimity and secrets, and is a potential place of conflicts")
print("All those remarks could explain why this word is such used in the corpus.")
print("****************************************************************\n")

**************************** Nouns *****************************
The corpus contains 213 nouns.
The most repeated noun in the corpus is << chamber >>. This noun is repeated 12 times.

This word is quite often used in this corpus. In fact, this corpus corresponds to the 1st scene of the 1st act of Romeo and Juliet.
And, in this scene, <<chamber>> is mentioned : 
- when the 2 servants speak about the importance of the chamber, and their desire to defend their masters' honor.
       --> Here, the chamber corresponds to a symbol of social status
- during the argument between the servants
       --> Here, chamber is used as its first meaning : a part of the house.

Hence, the chamber is mentioned as a symbol of social status, is associated to intimity and secrets, and is a potential place of conflicts
All those remarks could explain why this word is such used in the corpus.
****************************************************************



In [278]:
print("************************** Root Words **************************")
print("Root words from corpus that have frequency higher than 15 :")
print("\n 1 - By considering all the tokens (which could be repeated)")
print(more_frequent_roots(cleaned_corpus_2,15))
print("\n 2 - By considering unique tokens (non repeated)")
print(more_frequent_roots2(tokens_sw_2,15))

print("\n")
print("We can see that the first list is not really interesting to find useful information (most of them are prepositions or linkers).")
print("We can just deduce that Romeo and Benvolio are part of the main characters of the book.")
print("Also, because <<I>> and <<You>> are repeated quite often, we can deduce that there are some dialogs.")
print("Nevertheless, this piece of information could have been obtained, knowing the title of ")
print("the book, and that it is a drama.")
print("\n")
print("When we consider only unique tokens, we get more useful information. Certainly, the frequencies")
print("are well smaller, but the words obtained give us more information : this book is about Love, Sadness, ")
print("conflict; with Romeo as main character.")
print("\n")
print("But, by combining those 2 groups of words, we can find complementary information.")
print("****************************************************************\n")

************************** Root Words **************************
Root words from corpus that have frequency higher than 15 :

 1 - By considering all the tokens (which could be repeated)
           0
the       58
and       53
i         44
of        39
to        37
romeo     32
in        28
you       27
benvolio  27
that      26
a         24
is        24
          24
me        23
by        23

 2 - By considering unique tokens (non repeated)
        0
love    4
sad     3
thrust  2
cut     2
romeo   2
hous    2
weapon  2
pass    2
blow    2
part    2
sever   2
ladi    2
give    2
brawl   2
glad    2


We can see that the first list is not really interesting to find useful information (most of them are prepositions or linkers).
We can just deduce that Romeo and Benvolio are part of the main characters of the book.
Also, because <<I>> and <<You>> are repeated quite often, we can deduce that there are some dialogs.
Nevertheless, this piece of information could have been obtained, knowing th