# Exercise 1

### Necessary imports

In [98]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from statistics import mean
from IPython.display import display, HTML
from nltk import WordNetLemmatizer

tokenizer = RegexpTokenizer(r'\w+')
stop_words = stopwords.words('english')

### Dataset Loading

In [93]:
df = pd.read_csv('data/TLN-definitions-23.tsv', sep='\t')
df_door = df['door']
df_ladybug = df['ladybug']
df_pain = df['pain']
df_blurriness = df['blurriness']

### Pre-processing of dataset

In [101]:
# Stop words removal
df_door_nostop = df_door.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_ladybug_nostop = df_ladybug.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_pain_nostop = df_pain.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_blurriness_nostop = df_blurriness.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

# Stemming and tokenization (with puntuaction removal)
ps = PorterStemmer()
df_door_stem = df_door_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_ladybug_stem = df_ladybug_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_pain_stem = df_pain_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)
df_blurriness_stem = df_blurriness_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([ps.stem(word) for word in x.split()]).lower())
)

# Lemmatization and tokenization (with puntuaction removal)
lemmatizer = WordNetLemmatizer()
df_door_lem = df_door_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_ladybug_lem = df_ladybug_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_pain_lem = df_pain_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)
df_blurriness_lem = df_blurriness_nostop.apply(
    lambda x: tokenizer.tokenize(' '.join([lemmatizer.lemmatize(word) for word in x.split()]).lower())
)

### Function for calculating the overlap between the definitions of two words

In [95]:
def calculate_overlap(set1: list, set2: list) -> int:
    return len(set(set1).intersection(set(set2)))

In [96]:
def definitions_similarity(definitions: list) -> float:
    similarity_matrix = []
    for row in definitions:
        for row2 in definitions:
            if row == row2:
                continue
            overlap = calculate_overlap(row, row2)
            overlap = overlap / (len(row) if len(row) < len(row2) else len(row2))
            similarity_matrix.append(overlap)
    return round(mean(similarity_matrix), 2)

In [105]:
# Overlap between all definitions of door, ladybug, pain and blurriness with both stemming and lemmatization
door_similarity_stem = definitions_similarity(df_door_stem)
ladybug_similarity_stem = definitions_similarity(df_ladybug_stem)
pain_similarity_stem = definitions_similarity(df_pain_stem)
blurriness_similarity_stem = definitions_similarity(df_blurriness_stem)

door_similarity_lem = definitions_similarity(df_door_lem)
ladybug_similarity_lem = definitions_similarity(df_ladybug_lem)
pain_similarity_lem = definitions_similarity(df_pain_lem)
blurriness_similarity_lem = definitions_similarity(df_blurriness_lem)

# I create the dataframe for printing the table with the results of the similarity with stemming

data_stem =   [['pain\n {:.2f}'.format(pain_similarity_stem), 'door\n {:.2f}'.format(door_similarity_stem)], 
             ['bluriness\n {:.2f}'.format(blurriness_similarity_stem), 'ladybug\n {:.2f}'.format(ladybug_similarity_stem)]]
df_similarity_stem = pd.DataFrame(data_stem, columns=['Abstract', 'Concrete'], index=['Generic', 'Specific'])

print("Similarity between all definitions with Stemming")
display(HTML(df_similarity_stem.to_html().replace("\\n","<br>")))

# I create the dataframe for printing the table with the results of the similarity with lemmatization

data_lem =  [['pain\n {:.2f}'.format(pain_similarity_lem), 'door\n {:.2f}'.format(door_similarity_lem)],
                ['bluriness\n {:.2f}'.format(blurriness_similarity_lem), 'ladybug\n {:.2f}'.format(ladybug_similarity_lem)]]
df_similarity_lem = pd.DataFrame(data_lem, columns=['Abstract', 'Concrete'], index=['Generic', 'Specific'])
print("Similarity between all definitions with Lemmatization")
display(HTML(df_similarity_lem.to_html().replace("\\n","<br>")))

Similarity between all definitions with Stemming


Unnamed: 0,Abstract,Concrete
Generic,pain  0.22,door  0.20
Specific,bluriness  0.08,ladybug  0.56


Similarity between all definitions with Lemmatization


Unnamed: 0,Abstract,Concrete
Generic,pain  0.20,door  0.19
Specific,bluriness  0.08,ladybug  0.53
