In [None]:
pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


Load Libraries

In [None]:
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_tags, strip_multiple_whitespaces, remove_stopwords, stem_text
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
import gensim
import numpy as np
from fuzzywuzzy import fuzz



Load Data

In [None]:
# Absolute location of the training CSV.
file_path = '/content/drive/MyDrive/train.csv'
# Parse the whole file into a DataFrame using pandas' default options.
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


Data Analysis

In [None]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(404290, 6)

In [None]:
# Per-column dtypes and non-null counts; a quick way to spot missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            404290 non-null  int64 
 1   qid1          404290 non-null  int64 
 2   qid2          404290 non-null  int64 
 3   question1     404289 non-null  object
 4   question2     404288 non-null  object
 5   is_duplicate  404290 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [None]:
# Number of missing entries in each column.
data.isna().sum()

id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [None]:
# dropna() returns a new DataFrame and leaves `data` untouched, so the result
# must be assigned back for the rows with a missing question to be removed.
data = data.dropna()
# Show the cleaned frame, as the bare dropna() call did before.
data

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


Text Preprocessing

In [None]:
# Filters applied, in this order, to every question: remove tags, collapse
# repeated whitespace, drop stopwords, then stem. No punctuation filter is
# listed, so punctuation stays attached to the tokens.
custom_filters = [strip_tags, strip_multiple_whitespaces, remove_stopwords, stem_text]

def get_tokenized_questions(X):
    """Yield the token list of every question in ``X``.

    All ``question1`` entries are processed first, followed by all
    ``question2`` entries, so the generator produces ``2 * len(X)`` token
    lists in total.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``question1`` and ``question2`` columns.

    Yields
    ------
    list of str
        Tokens returned by ``preprocess_string`` with ``custom_filters``.
        A missing question yields whatever the filters return for ``''``.
    """
    questions = pd.concat([X['question1'], X['question2']])
    # Treat missing questions as empty text *before* casting to str; a plain
    # string cast would turn them into a literal placeholder word that then
    # ends up in the corpus as a token.
    questions = questions.fillna('').astype(str)
    for question in questions:
        yield preprocess_string(question, custom_filters)


In [None]:
# Run the generator to completion so the tokenised corpus is a list that can
# be passed to the embedding model more than once.
tokenized_questions = list(get_tokenized_questions(data))
# Peek at the first ten token lists to check the preprocessing.
print(tokenized_questions[:10])

[['what', 'step', 'step', 'guid', 'invest', 'share', 'market', 'india?'], ['what', 'stori', 'kohinoor', '(koh-i-noor)', 'diamond?'], ['how', 'i', 'increas', 'speed', 'internet', 'connect', 'vpn?'], ['why', 'i', 'mental', 'lonely?', 'how', 'i', 'solv', 'it?'], ['which', 'dissolv', 'water', 'quikli', 'sugar,', 'salt,', 'methan', 'carbon', 'di', 'oxide?'], ['astrology:', 'i', 'capricorn', 'sun', 'cap', 'moon', 'cap', 'rising...what', 'me?'], ['should', 'i', 'bui', 'tiago?'], ['how', 'i', 'good', 'geologist?'], ['when', 'us', 'シ', 'instead', 'し?'], ['motorola', '(company):', 'can', 'i', 'hack', 'charter', 'motorolla', 'dcx3400?']]


Embedding

In [None]:
# Train an initial 300-dimensional Word2Vec model on the tokenised questions.
model_w2v = gensim.models.Word2Vec(sentences=tokenized_questions, vector_size=300)

# Pre-trained FastText vectors, loaded from a word2vec-format file.
fasttext_path = '/content/drive/MyDrive/Google Techmakeres/wiki-news-300d-1M.vec'
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_path)

# Overwrite the vector of every corpus word that FastText also knows.
# Walking the corpus vocabulary and probing FastText visits exactly the same
# shared words as walking the whole FastText vocabulary, without scanning the
# pre-trained entries that cannot match.
# NOTE(review): the corpus tokens are stemmed and keep punctuation, so the
# overlap with FastText's vocabulary may be small; confirm this seeding step
# has the intended effect.
for word in model_w2v.wv.index_to_key:
    if word in fasttext_model:
        model_w2v.wv[word] = fasttext_model[word]

# Continue training from the seeded vectors. As the last expression of the
# cell, the value returned by train() is displayed.
# NOTE(review): none of the feature cells visible in this notebook use
# model_w2v; confirm whether the embeddings are meant to feed the classifier.
model_w2v.train(tokenized_questions, total_examples=model_w2v.corpus_count, epochs=10)



(43032659, 53071390)

In [None]:
# Replace missing question text with an empty string in both question columns.
data = data.fillna({'question1': '', 'question2': ''})

Train-Test Split

In [None]:
# Inputs are the two question columns; the target is the duplicate flag.
X = data.loc[:, ['question1', 'question2']]
y = data.loc[:, 'is_duplicate']

# Hold out 30% of the pairs for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

Feature Extraction

In [None]:
def compute_ratio(row):
    """Return ``fuzz.ratio`` for the two questions of ``row``, compared as strings."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.ratio(first, second)


def compute_partial_ratio(row):
    """Return ``fuzz.partial_ratio`` for the two questions of ``row``, compared as strings."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.partial_ratio(first, second)


def compute_token_sort_ratio(row):
    """Return ``fuzz.token_sort_ratio`` for the two questions of ``row``, compared as strings."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_sort_ratio(first, second)


def compute_token_set_ratio(row):
    """Return ``fuzz.token_set_ratio`` for the two questions of ``row``, compared as strings."""
    first, second = str(row['question1']), str(row['question2'])
    return fuzz.token_set_ratio(first, second)

def compute_fuzzy_metrics(X, method):
    """Score every row of ``X`` with ``method``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose rows are handed to ``method`` one at a time.
    method : callable
        Function that takes a single row and returns its score.

    Returns
    -------
    The result of the row-wise ``DataFrame.apply``: one score per row,
    indexed like ``X``, when ``method`` returns scalars.
    """
    row_scores = X.apply(method, axis='columns')
    return row_scores

# Score the training pairs with each of the four fuzzy metrics, in the order
# ratio, partial ratio, token-sort ratio, token-set ratio.
(ratio,
 partial_ratio,
 token_sort_ratio,
 token_set_ratio) = (
    compute_fuzzy_metrics(X_train, scorer)
    for scorer in (compute_ratio, compute_partial_ratio,
                   compute_token_sort_ratio, compute_token_set_ratio)
)


Training the Model

In [None]:
# Score the held-out pairs with the same four fuzzy metrics used for training.
(ratio_test,
 partial_ratio_test,
 token_sort_ratio_test,
 token_set_ratio_test) = (
    compute_fuzzy_metrics(X_test, scorer)
    for scorer in (compute_ratio, compute_partial_ratio,
                   compute_token_sort_ratio, compute_token_set_ratio)
)

# One column per metric; train and test frames share the same column layout.
train_features = pd.DataFrame(dict(
    ratio=ratio,
    partial_ratio=partial_ratio,
    token_sort_ratio=token_sort_ratio,
    token_set_ratio=token_set_ratio,
))

test_features = pd.DataFrame(dict(
    ratio=ratio_test,
    partial_ratio=partial_ratio_test,
    token_sort_ratio=token_sort_ratio_test,
    token_set_ratio=token_set_ratio_test,
))

# Fit a logistic regression with default settings (fit() returns the estimator).
lr = LogisticRegression().fit(train_features, y_train)

# Predicted duplicate labels for the held-out pairs.
y_pred = lr.predict(test_features)

Evaluating the Model

In [None]:
# Score the held-out predictions first, then report everything in one place.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Accuracy
print(f'Accuracy: {accuracy}')

# Confusion matrix
print('Confusion Matrix:')
print(conf_matrix)

# Precision, recall and F1 score
print('Precision: %f' % precision)
print('Recall: %f' % recall)
print('F1 score: %f' % f1)

Accuracy: 0.6584877192114571
Confusion Matrix:
[[59255 17095]
 [24326 20611]]
Precision: 0.546624
Recall: 0.458664
F1 score: 0.498796
