In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk

In [2]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
import matplotlib.pyplot as plt
df = pd.read_csv("questions.csv")


In [5]:
df

Unnamed: 0,question
0,What is the step by step guide to invest in sh...
1,What is the story of Kohinoor (Koh-i-Noor) Dia...
2,How can I increase the speed of my internet co...
3,Why am I mentally very lonely? How can I solve...
4,"Which one dissolve in water quikly sugar, salt..."
...,...
4994,Who is currently winning the presidential elec...
4995,"What has a better ROI, marketing on radio sta..."
4996,Which mobile is good for 50k?
4997,Is the character Jane in the movie Predestinat...


In [6]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to C:\Users\Harish
[nltk_data]     Gorla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Data cleaning and preprocessing

import nltk
nltk.download('stopwords')
nltk.download('punkt')
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords]
    # Stemming
    stemmer = SnowballStemmer('english')
    tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens)



[nltk_data] Downloading package stopwords to C:\Users\Harish
[nltk_data]     Gorla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Harish
[nltk_data]     Gorla\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
#df['cleaned_text'] = df['question'].apply(clean_text)
df['cleaned_text'] = df['question'].apply(clean_text)

In [9]:
from sklearn.model_selection import train_test_split
# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [10]:
# Feature extraction
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_df['cleaned_text'])
tfidf_matrix_test = tfidf_vectorizer.transform(test_df['cleaned_text'])


In [11]:
# Similarity calculation
similarity_matrix_train = cosine_similarity(tfidf_matrix_train)
similarity_matrix_train[100]

array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
       0.06832973])

In [12]:
# Clustering
num_clusters = 5
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix_train)
clusters_train = km.labels_.tolist()


In [13]:
# Duplicate detection
import seaborn as sns

train_questions = train_df['question'].tolist()
clustered_questions_train = {i: [] for i in range(num_clusters)}
for i, cluster in enumerate(clusters_train):
    clustered_questions_train[cluster].append(train_questions[i])
    
duplicates_train = []

In [14]:

for cluster in clustered_questions_train:
    if len(clustered_questions_train[cluster]) > 0.1:
        group = clustered_questions_train[cluster]
        pairwise_similarity = similarity_matrix_train[np.ix_([train_questions.index(q) for q in group], [train_questions.index(q) for q in group])]
        indices = np.argmin(pairwise_similarity, axis=1)
        duplicates_train.extend([(group[i], group[j]) for i, j in zip(range(len(group)), indices) if i < indices[i]])

In [15]:
# Print the duplicates in the training set
print("Duplicate questions in the training set:")
for pair in duplicates_train:
    print(pair)

Duplicate questions in the training set:
('I have a job in Timor Leste Australian base company and I can get a PR after my 8 month job. My question, is it possible to go Australia with PR?', 'What is it like to work at Goldman Sachs in London?')
("How beautiful will be the world if money won't exists (I mean you get everything you want for free)?", 'What is it like to work at Goldman Sachs in London?')
('What are some special cares for someone with a nose that gets stuffy during the night?', 'What is it like to work at Goldman Sachs in London?')
('How does it feel to be unemployed after getting an engineering degree?', 'What is it like to work at Goldman Sachs in London?')
('How do you get deleted Instagram chats?', 'What is it like to work at Goldman Sachs in London?')
('How can I get free gems in Clash of Clans?', 'What is it like to work at Goldman Sachs in London?')
('Is there any way to get rid of gynecomastia?', 'What is it like to work at Goldman Sachs in London?')
("My family d