## Finding the most similar sentence

In [1]:
import numpy as np
import pandas as pd

In [2]:
# read csv from dataset directory

df = pd.read_csv('dataset/sentences.csv')

In [3]:
df

Unnamed: 0,Sentences
0,Machine learning is simply how computers �thin...
1,It is a subset of artificial intelligence that...
2,The main objective of Machine Learning algorit...
3,Machine Learning is programming computers to o...
4,It allows a computer to extract information fr...
5,"Essentially, it�s about machines making sense ..."
6,Deep Learning�is the field where�the machines ...
7,Deep learning is a subset of machine learning ...
8,"Machine learning is essentially how, without b..."


<strong>About the dataset</strong>

* Index 0-5 answers the question what is machine learning.
* Index 6 and 7 answers the question what is deep learning.
* Index 8 is just a paraphrased sentence from index 0.


In [4]:
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
def to_corpus(df):
    corpus = []
    lemmatizer = WordNetLemmatizer()

    for row in df:
        # Remove tabs and new lines
        removed_tabs_newline = re.sub('[\n|\t]',' ',row)

        # Remove special characters and digits
        removed_spchar_digits = re.sub('[^a-zA-Z]',' ',removed_tabs_newline)

        # Convert sentences into lower case
        lower_case = removed_spchar_digits.lower()

        # Tokenize the sentences by words / split by words
        tokenized_sentences = lower_case.split()

        # Remove stopwords
        filtered_words = [word for word in tokenized_sentences if word not in stopwords.words('english')]

        # Lemmetize words
        lemmetized_words = [lemmatizer.lemmatize(word) for word in filtered_words]

        # Build corpus
        sentence = ' '.join(lemmetized_words)
        corpus.append(sentence)
        
    return corpus

In [6]:
corpus = to_corpus(df['Sentences'])

# convert sentences to vectors using tfidf
tfidf = TfidfVectorizer()
vectors = tfidf.fit_transform(corpus).toarray()
feature_names = tfidf.get_feature_names()

In [7]:
pd.DataFrame(vectors,columns=feature_names)

Unnamed: 0,algorithm,allows,analyze,artificial,automatically,brain,computer,data,decision,deep,...,programming,sense,simply,subset,task,teach,think,using,way,without
0,0.0,0.0,0.0,0.0,0.0,0.0,0.26765,0.0,0.0,0.0,...,0.0,0.0,0.412495,0.0,0.412495,0.0,0.3484,0.0,0.0,0.302924
1,0.20621,0.0,0.280799,0.237167,0.280799,0.0,0.0,0.162579,0.280799,0.0,...,0.0,0.0,0.0,0.237167,0.0,0.0,0.0,0.0,0.0,0.20621
2,0.332728,0.0,0.0,0.0,0.0,0.0,0.293984,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.453081,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.229986,0.205221,0.0,0.0,...,0.354449,0.0,0.0,0.0,0.0,0.0,0.0,0.354449,0.0,0.0
4,0.0,0.473004,0.0,0.0,0.0,0.0,0.306911,0.273862,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.242737,0.0,0.0,...,0.0,0.419247,0.0,0.0,0.0,0.0,0.0,0.0,0.419247,0.0
6,0.0,0.0,0.0,0.0,0.0,0.387178,0.0,0.0,0.0,0.387178,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.246563,0.0,0.0,0.283578,0.0,0.283578,0.0,0.194393,0.0,0.283578,...,0.0,0.0,0.0,0.283578,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.349846,0.0,0.0,0.304181


In [8]:
# convert sparse matrix to scipy csr matrix
# this saves memory and speeds up computation

from scipy.sparse import csr_matrix
csr_rating_matrix =  csr_matrix(vectors)

In [9]:
print(csr_rating_matrix)

  (0, 6)	0.2676496751937805
  (0, 12)	0.41249527372472655
  (0, 25)	0.21446074638866056
  (0, 26)	0.19335234289339728
  (0, 40)	0.34840007530749767
  (0, 43)	0.41249527372472655
  (0, 45)	0.41249527372472655
  (0, 47)	0.34840007530749767
  (0, 50)	0.3029238083090619
  (1, 0)	0.20621028336493716
  (1, 2)	0.2807992140211338
  (1, 3)	0.23716748661836806
  (1, 4)	0.2807992140211338
  (1, 7)	0.1625785559621715
  (1, 8)	0.2807992140211338
  (1, 16)	0.1821980110191022
  (1, 18)	0.2807992140211338
  (1, 21)	0.2807992140211338
  (1, 22)	0.2807992140211338
  (1, 23)	0.2807992140211338
  (1, 24)	0.1821980110191022
  (1, 28)	0.2807992140211338
  (1, 31)	0.2807992140211338
  (1, 44)	0.23716748661836806
  (1, 50)	0.20621028336493716
  :	:
  (6, 17)	0.45840699013050296
  (6, 24)	0.29743972799277435
  (6, 25)	0.23833074344210098
  (6, 26)	0.21487292385219428
  (7, 0)	0.24656318221835213
  (7, 3)	0.2835783418029886
  (7, 5)	0.2835783418029886
  (7, 7)	0.1943932449166763
  (7, 9)	0.2835783418029886
  (7

In [10]:
from sklearn.neighbors import NearestNeighbors

# find nearest neighbors using cosine similarity
nearest = NearestNeighbors(metric='cosine')
# fit the csr matrix to the algorithm
nearest.fit(csr_rating_matrix)

NearestNeighbors(metric='cosine')

In [11]:
def find_nearest(dataset,vectors,threshold=None):
    sentence_index = []
    most_similar_indices = []
    percentages = []
    for index in range(len(vectors)):
        # find two nearest neighbors - the first nearest neighbor is itself
        distances, indices = nearest.kneighbors(vectors[index].reshape(1,-1),n_neighbors=2)
        similarity_percentage = 1 - distances[0][1]
        if threshold == None:    
            sentence_index.append(index)
            most_similar_indices.append(indices[0][1]) # index of most similar sentence
            percentages.append(f'{round(similarity_percentage*100,2)}%')
        else:
            if similarity_percentage >= threshold: # if threshold is specified
                sentence_index.append(index)
                most_similar_indices.append(indices[0][1])
                percentages.append(f'{round(similarity_percentage*100,2)}%')
    data = {
            'Sentence':dataset.iloc[sentence_index]['Sentences'],
            'Most similar':most_similar_indices,
            'Percentage':percentages
            }
    return pd.DataFrame(data)

In [12]:
find_nearest(df,vectors)

Unnamed: 0,Sentence,Most similar,Percentage
0,Machine learning is simply how computers �thin...,8,45.72%
1,It is a subset of artificial intelligence that...,7,29.63%
2,The main objective of Machine Learning algorit...,7,26.17%
3,Machine Learning is programming computers to o...,4,24.64%
4,It allows a computer to extract information fr...,3,24.64%
5,"Essentially, it�s about machines making sense ...",8,20.02%
6,Deep Learning�is the field where�the machines ...,7,46.62%
7,Deep learning is a subset of machine learning ...,6,46.62%
8,"Machine learning is essentially how, without b...",0,45.72%


In [13]:
# passing threshold to filter sentences with greater than 30% similarity

find_nearest(df,vectors,0.3)

Unnamed: 0,Sentence,Most similar,Percentage
0,Machine learning is simply how computers �thin...,8,45.72%
6,Deep Learning�is the field where�the machines ...,7,46.62%
7,Deep learning is a subset of machine learning ...,6,46.62%
8,"Machine learning is essentially how, without b...",0,45.72%


In [14]:
# load another data in dataset directory

df = pd.read_csv('dataset/description.csv')

In [15]:
df

Unnamed: 0,Sentences
0,Cancel ASN WMS Cancel ASN
1,MAXPREDO Validation is corect
2,Move to QC
3,Cancel ASN WMS Cancel ASN
4,MAXPREDO Validation is right
5,Verify files are sent every hours for this int...
6,MAXPREDO Validation are correct
7,Move to QC
8,Verify files are not sent


In [16]:
corpus = to_corpus(df['Sentences'])

tfidf = TfidfVectorizer()
vectors = tfidf.fit_transform(corpus).toarray()
feature_names = tfidf.get_feature_names()

In [17]:
pd.DataFrame(vectors,columns=feature_names)

Unnamed: 0,asn,cancel,corect,correct,every,file,hour,interface,maxpredo,move,optimum,qc,right,sent,validation,verify,wms
0,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
1,0.0,0.0,0.693609,0.0,0.0,0.0,0.0,0.0,0.509365,0.0,0.0,0.0,0.0,0.0,0.509365,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
3,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.509365,0.0,0.0,0.0,0.693609,0.0,0.509365,0.0,0.0
5,0.0,0.0,0.0,0.0,0.403563,0.340856,0.403563,0.403563,0.0,0.0,0.403563,0.0,0.0,0.340856,0.0,0.340856,0.0
6,0.0,0.0,0.0,0.693609,0.0,0.0,0.0,0.0,0.509365,0.0,0.0,0.0,0.0,0.0,0.509365,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0


In [18]:
csr_rating_matrix =  csr_matrix(vectors)

In [19]:
nearest = NearestNeighbors(metric='cosine')
# fit the csr matrix to the algorithm
nearest.fit(csr_rating_matrix)

NearestNeighbors(metric='cosine')

In [20]:
find_nearest(df,vectors)

Unnamed: 0,Sentence,Most similar,Percentage
0,Cancel ASN WMS Cancel ASN,3,100.0%
1,MAXPREDO Validation is corect,4,51.89%
2,Move to QC,7,100.0%
3,Cancel ASN WMS Cancel ASN,3,100.0%
4,MAXPREDO Validation is right,1,51.89%
5,Verify files are sent every hours for this int...,8,59.04%
6,MAXPREDO Validation are correct,1,51.89%
7,Move to QC,7,100.0%
8,Verify files are not sent,5,59.04%
