In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
DATA_FPATH = r"C:\Users\HP\OneDrive\Desktop\quora.csv"

In [4]:
raw_data = pd.read_csv(DATA_FPATH, index_col=0)
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 404290 entries, 0 to 404289
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   qid1          404290 non-null  int64 
 1   qid2          404290 non-null  int64 
 2   question1     404289 non-null  object
 3   question2     404288 non-null  object
 4   is_duplicate  404290 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 18.5+ MB


In [5]:
raw_data.sample(3)

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
104585,14772,26185,Why do people ask questions on Quora instead o...,Why do people ask such questions here on Quora...,1
166971,233637,85579,Why should I not do a job in TCS?,How do I get a placement in TCS?,0
215050,215053,320818,How do I get started with web development?,How can I get started with web design?,0


In [6]:
raw_data.isnull().sum()

qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64

In [7]:
raw_data.dropna(inplace=True)

In [8]:
raw_data["is_duplicate"].value_counts(normalize=True)

0    0.630799
1    0.369201
Name: is_duplicate, dtype: float64

## Train-Test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train, test = train_test_split(raw_data, test_size=0.3, stratify=raw_data["is_duplicate"])

## Data preprocessing

In [11]:
qid1 = raw_data.loc[:, ["qid1", "question1"]].drop_duplicates()\
               .rename(columns = {"qid1": "qid", "question1": "question"})
qid2 = raw_data.loc[:, ["qid2", "question2"]].drop_duplicates()\
               .rename(columns = {"qid2": "qid", "question2": "question"})
questions = pd.concat([qid1, qid2], ignore_index=True).drop_duplicates().set_index("qid")

In [12]:
questions.shape

(537929, 1)

In [13]:
questions.sample(5)

Unnamed: 0_level_0,question
qid,Unnamed: 1_level_1
139494,Which has been the most crucial time in humans...
468209,"How should I choose between mathematics, physi..."
402074,"Does marijuana increase risk of lung cancer, s..."
287476,How do I find local partners for sex?
447076,Which dog should I get as a pet in India?


In [14]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [15]:
## initialise the inbuilt Stemmer
stemmer = PorterStemmer()

In [16]:
## We can also use Lemmatizer instead of Stemmer
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [18]:
stemming_df = questions["question"].progress_apply(lambda x: preprocess(x, 'stem'))
stemming_df.columns = ['clean_text_stem', 'text_length_stem']

100%|█████████████████████████████████████████████████████████████████████████| 537929/537929 [37:21<00:00, 239.99it/s]


In [23]:
nltk.download("wordnet")
nltk.download("omw-1.4") 

[nltk_data] Error loading wordnet: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>
[nltk_data] Error loading omw-1.4: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [22]:
lemmatizing_df = questions["question"].progress_apply(lambda x: preprocess(x, 'lemma'))
lemmatizing_df.columns = ['clean_text_lemma', 'text_length_lemma']

  0%|                                                                            | 1/537929 [00:00<99:19:56,  1.50it/s]


BadZipFile: File is not a zip file

In [21]:
preprocessed_questions = pd.concat([questions, stemming_df, lemmatizing_df], axis=1)

In [23]:
with open("preprocessed_questions.pkl", "wb") as f:
    pickle.dump(preprocessed_questions, f)