# Evaluating ML models over text similarity predictions and ranking them using TOPSIS


In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train = train.drop(columns = ['Unnamed: 0', 'ticker_x', 'ticker_y'])
train['same_security'] = train['same_security'].astype(int)
train.head()

Unnamed: 0,description_x,description_y,same_security
0,first trust dow jones internet,first trust dj internet idx,1
1,schwab intl large company index etf,schwab strategic tr fundamental intl large co ...,1
2,vanguard small cap index adm,vanguard small-cap index fund inst,0
3,duke energy corp new com new isin #us4 sedol #...,duke energy corp new com new isin #us26441c204...,1
4,visa inc class a,visa inc.,1


In [4]:
test = test.drop('test_id', axis = 1)
test.head()

Unnamed: 0,description_x,description_y,same_security
0,semtech corp,semtech corporation,
1,vanguard mid cap index,vanguard midcap index - a,
2,spdr gold trust gold shares,spdr gold trust spdr gold shares,
3,vanguard total bond index adm,vanguard total bond market index,
4,oakmark international fund class i,oakmark international cl i,


In [5]:
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk

In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Lexical Similarity - Jaccard Similarity

In [7]:
def jaccard_similarity(text1, text2):
  words1 = set(word_tokenize(text1))
  words2 = set(word_tokenize(text2))
  intersection = len(words1 & words2)
  union = len(words1 | words2)
  return intersection / union if union != 0 else 0

In [8]:
trainRows = train.shape[0]
testRows = test.shape[0]

lexical_train = []
for i in range(trainRows):
  lexical_train.append(jaccard_similarity(train.iloc[i]['description_x'], train.iloc[i]['description_y']))

lexical_test = []
for i in range(testRows):
  lexical_test.append(jaccard_similarity(test.iloc[i]['description_x'], test.iloc[i]['description_y']))

train['lexical'] = lexical_train
test['lexical'] = lexical_test

## Semantic Similarity - Word2Vec & Cosine Similarity

In [9]:
model = api.load('word2vec-google-news-300')

In [10]:
def text_vector(text, model):
  words = word_tokenize(text.lower())
  word_vectors = [model[word] for word in words if word in model]

  if not word_vectors:
    return np.zeros(300)

  return np.mean(word_vectors, axis = 0)

def semantic_similarity(text1, text2, model):
  vec1 = text_vector(text1, model)
  vec2 = text_vector(text2, model)
  return cosine_similarity([vec1], [vec2])[0][0]

In [11]:
semantic_train = []
for i in range(trainRows):
  semantic_train.append(semantic_similarity(train.iloc[i]['description_x'], train.iloc[i]['description_y'], model))

semantic_test = []
for i in range(testRows):
  semantic_test.append(semantic_similarity(test.iloc[i]['description_x'], test.iloc[i]['description_y'], model))

train['semantic'] = semantic_train
test['semantic'] = semantic_test

## Handling imbalanced classification

In [12]:
X_train = train[['lexical', 'semantic']]
y_train = train[['same_security']]
X_test = test[['lexical', 'semantic']]

In [13]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(sampling_strategy = 'minority', random_state = 42)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.25, random_state = 42)

In [15]:
X_train.head()

Unnamed: 0,lexical,semantic
2402,0.111111,0.354368
1936,0.5,0.736145
811,0.333333,0.095054
1179,0.4,0.860229
1090,0.090909,0.553365


In [16]:
y_train.head()

Unnamed: 0,same_security
2402,0
1936,1
811,1
1179,1
1090,0


In [17]:
X_val.head()

Unnamed: 0,lexical,semantic
1429,0.333333,0.60553
346,0.666667,0.954962
1391,0.222222,0.288611
393,0.5,0.861415
194,0.2,0.429784


In [18]:
y_val.head()

Unnamed: 0,same_security
1429,1
346,1
1391,1
393,1
194,0


## Creating a data set for TOPSIS

In [31]:
accuracies = []
precisions = []
recalls = []
roc = []
log_losses = []

In [20]:
# Machine learning models used
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [21]:
# Evaluation metrics used
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, log_loss

In [22]:
X_train = X_train.values
y_train = y_train.values.flatten()
X_val = X_val.values
y_val = y_val.values.flatten()

In [32]:
models = [LogisticRegression(), GaussianNB(), KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier()]

for model in models:
  model.fit(X_train, y_train)
  accuracies.append(accuracy_score(y_val, model.predict(X_val)))
  precisions.append(precision_score(y_val, model.predict(X_val)))
  recalls.append(recall_score(y_val, model.predict(X_val)))
  roc.append(roc_auc_score(y_val, model.predict(X_val)))
  log_losses.append(log_loss(y_val, model.predict(X_val)))

In [34]:
dataset = pd.DataFrame({'Model': ['Logistic Regression', 'Naive Bayes', 'KNN', 'Decision Tree', 'Random Forest'],
                        'Accuracy': accuracies,
                        'Precision': precisions,
                        'Recall': recalls,
                        'ROC': roc,
                        'Log Loss': log_losses})
dataset

Unnamed: 0,Model,Accuracy,Precision,Recall,ROC,Log Loss
0,Logistic Regression,0.607187,0.627027,0.564477,0.607996,14.158412
1,Naive Bayes,0.612144,0.635359,0.559611,0.613139,13.979757
2,KNN,0.757125,0.824773,0.664234,0.758884,8.754097
3,Decision Tree,0.890954,0.962751,0.817518,0.892345,3.930411
4,Random Forest,0.876084,0.978462,0.773723,0.878023,4.466376


## Applying TOPSIS on the created data set

In [35]:
# TOPSIS using Topsis_Mandar_102203163 package
# For more details - https://pypi.org/project/Topsis-Mandar-102203163/

!pip install Topsis_Mandar_102203163

Collecting Topsis_Mandar_102203163
  Downloading Topsis_Mandar_102203163-1.0-py3-none-any.whl.metadata (2.2 kB)
Downloading Topsis_Mandar_102203163-1.0-py3-none-any.whl (3.5 kB)
Installing collected packages: Topsis_Mandar_102203163
Successfully installed Topsis_Mandar_102203163-1.0


In [37]:
import topsis as tp
dataset.to_csv('models.csv', index = False)
tp.topsis('models.csv', "1,1,1,1,1", "+,+,+,+,-", 'models-result.csv')

In [40]:
# Since decision Tree classification ranks first, we will use it to predict using test data
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [42]:
y_test = list(map(bool, model.predict(X_test.values)))
y_test

[True,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 False,
 True,
 True,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 True,
 False,
 True,
 False,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 False,
 True,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 Tru