# Exercise 1

### Necessary imports

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from statistics import mean
from IPython.display import display, HTML
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
import numpy as np

tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')

### Dataset Loading

In [2]:
df = pd.read_csv('data/TLN-definitions-23.tsv', sep='\t')
df_door = df['door']
df_ladybug = df['ladybug']
df_pain = df['pain']
df_blurriness = df['blurriness']

### Pre-processing of dataset

In [3]:
# Stop words removal
df_door_nostop = df_door.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_ladybug_nostop = df_ladybug.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_pain_nostop = df_pain.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_blurriness_nostop = df_blurriness.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Stemming and tokenization (with puntuaction removal)
ps = PorterStemmer()
df_door_stem = df_door_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_ladybug_stem = df_ladybug_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_pain_stem = df_pain_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_blurriness_stem = df_blurriness_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)

# Lemmatization and tokenization (with puntuaction removal)
lemmatizer = WordNetLemmatizer()
df_door_lem = df_door_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_ladybug_lem = df_ladybug_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_pain_lem = df_pain_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_blurriness_lem = df_blurriness_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)

## First Method: Overlap of words in the definitions

### Function for calculating the overlap between the definitions of two words

In [4]:
def calculate_overlap(sentence_1: list, sentence_2: list) -> int:
    return len(set(sentence_1).intersection(set(sentence_2)))

In [5]:
def definitions_similarity(definitions: list) -> float:
    similarity_matrix = []
    for row in definitions:
        for row2 in definitions:
            if row == row2:
                continue
            overlap = calculate_overlap(row, row2)
            overlap = overlap / (len(row) if len(row) < len(row2) else len(row2))
            similarity_matrix.append(overlap)
    return round(mean(similarity_matrix), 2)

In [6]:
# Overlap between all definitions of door, ladybug, pain and blurriness with both stemming and lemmatization
door_similarity_stem = definitions_similarity(df_door_stem)
ladybug_similarity_stem = definitions_similarity(df_ladybug_stem)
pain_similarity_stem = definitions_similarity(df_pain_stem)
blurriness_similarity_stem = definitions_similarity(df_blurriness_stem)

door_similarity_lem = definitions_similarity(df_door_lem)
ladybug_similarity_lem = definitions_similarity(df_ladybug_lem)
pain_similarity_lem = definitions_similarity(df_pain_lem)
blurriness_similarity_lem = definitions_similarity(df_blurriness_lem)

# I create the dataframe for printing the table with the results of the similarity with stemming

data_stem =   [['pain\n {:.2f}'.format(pain_similarity_stem), 'door\n {:.2f}'.format(door_similarity_stem)], 
             ['bluriness\n {:.2f}'.format(blurriness_similarity_stem), 'ladybug\n {:.2f}'.format(ladybug_similarity_stem)]]
df_similarity_stem = pd.DataFrame(data_stem, columns=['Abstract', 'Concrete'], index=['Generic', 'Specific'])

print("Similarity between all definitions with Stemming")
display(HTML(df_similarity_stem.to_html().replace("\\n","<br>")))

# I create the dataframe for printing the table with the results of the similarity with lemmatization

data_lem =  [['pain\n {:.2f}'.format(pain_similarity_lem), 'door\n {:.2f}'.format(door_similarity_lem)],
                ['bluriness\n {:.2f}'.format(blurriness_similarity_lem), 'ladybug\n {:.2f}'.format(ladybug_similarity_lem)]]
df_similarity_lem = pd.DataFrame(data_lem, columns=['Abstract', 'Concrete'], index=['Generic', 'Specific'])
print("Similarity between all definitions with Lemmatization")
display(HTML(df_similarity_lem.to_html().replace("\\n","<br>")))

Similarity between all definitions with Stemming


Unnamed: 0,Abstract,Concrete
Generic,pain  0.22,door  0.20
Specific,bluriness  0.08,ladybug  0.56


Similarity between all definitions with Lemmatization


Unnamed: 0,Abstract,Concrete
Generic,pain  0.20,door  0.19
Specific,bluriness  0.08,ladybug  0.53


## Second Method: Similarity of definitions with CountVectorizer

### Pre-processing

In [7]:
# Lemmatization
tdidf_door_lem = df_door_nostop.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
tdidf_ladybug_lem = df_ladybug_nostop.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
tdidf_pain_lem = df_pain_nostop.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
tdidf_blurriness_lem = df_blurriness_nostop.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())

### Obtaining the CountVectorizer vectors

In [11]:
# Obtaining the CountVectorizer
vectorizer = CountVectorizer()
vectors_door = vectorizer.fit_transform(tdidf_door_lem)
vectors_ladybug = vectorizer.fit_transform(tdidf_ladybug_lem)
vectors_pain = vectorizer.fit_transform(tdidf_pain_lem)
vectors_blurriness = vectorizer.fit_transform(tdidf_blurriness_lem)

### Calculating the cosine similarity between the vectors of the same word

In [12]:
def vectors_similarity(vectors: list) -> float:
    similarity_matrix = []
    for row in vectors:
        for row2 in vectors:
            if np.array_equal(row, row2):
                continue
            similarity_matrix.append(distance.cosine(row, row2))
    return round(1 - mean(similarity_matrix), 2)

In [13]:
# Cosine similarity between all definitions of door, ladybug, pain and blurriness
door_similarity = vectors_similarity(vectors_door.toarray())
ladybug_similarity = vectors_similarity(vectors_ladybug.toarray())
pain_similarity = vectors_similarity(vectors_pain.toarray())
blurriness_similarity = vectors_similarity(vectors_blurriness.toarray())

# I create the dataframe for printing the table with the results of the similarity with CountVectorizer
data_cv = [['pain\n {:.2f}'.format(pain_similarity), 'door\n {:.2f}'.format(door_similarity)],
              ['bluriness\n {:.2f}'.format(blurriness_similarity), 'ladybug\n {:.2f}'.format(ladybug_similarity)]]
df_similarity_cv = pd.DataFrame(data_cv, columns=['Abstract', 'Concrete'], index=['Generic', 'Specific'])
print("Similarity between all definitions with CountVectorizer")
display(HTML(df_similarity_cv.to_html().replace("\\n","<br>")))

Similarity between all definitions with CountVectorizer


Unnamed: 0,Abstract,Concrete
Generic,pain  0.14,door  0.13
Specific,bluriness  0.05,ladybug  0.43
