We connect to drive here. My data is Saved in My Drive/Course Work/questions.csv


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


isnull is used to check for any missing value

In [1]:
import pandas as pd
data = pd.read_csv('/content/questions.csv')
print(data.head())

# Check for missing values
print(data.isnull().sum())


   id  qid1  qid2                                          question1  \
0   0     1     2  What is the step by step guide to invest in sh...   
1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   
2   2     5     6  How can I increase the speed of my internet co...   
3   3     7     8  Why am I mentally very lonely? How can I solve...   
4   4     9    10  Which one dissolve in water quikly sugar, salt...   

                                           question2  is_duplicate  
0  What is the step by step guide to invest in sh...             0  
1  What would happen if the Indian government sto...             0  
2  How can Internet speed be increased by hacking...             0  
3  Find the remainder when [math]23^{24}[/math] i...             0  
4            Which fish would survive in salt water?             0  
id              0
qid1            0
qid2            0
question1       1
question2       2
is_duplicate    0
dtype: int64


#Data Preprocessing
1. lower the text
2. remove special characters and numbers
3. tokenize: splitting text into individual words or tokens.
4. Stemming: It cuts off prefixes and/or endings of words based on common ones. It's faster but less accurate. For example, "running" → "run".
Lemmatization: This involves a more sophisticated analysis to accurately convert words to their base form. For example, "better" → "good".


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    if not isinstance(text, str):
        return ""


    text = text.lower()

    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(w) for w in filtered_tokens]

    # Reconstruct the text from tokens
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
data['question1'] = data['question1'].apply(preprocess_text)
data['question2'] = data['question2'].apply(preprocess_text)


#Vectorizer
1. we combine the data (q1 and q2)
2.CountVectorizer:feature extraction method from the scikit-learn library.It converts a collection of text documents into a matrix of token counts. Essentially, it counts the number of times each word appears in the document.CountVectorizer automates uses BoW. When you fit a CountVectorizer to a corpus of text documents, it performs these steps:

It tokenizes the text by separating the words.
It builds a vocabulary of unique words. In this vocabulary, each word is assigned a unique index.
It transforms each document into a numerical vector. Each element in this vector corresponds to a word in the vocabulary and the value is the count of that word in the document.


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Combine questions
data['combined_questions'] = data['question1'] + ' ' + data['question2']

# TF-IDF Vectorizer
# vectorizer = TfidfVectorizer()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['combined_questions'])

y = data['is_duplicate']


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.79      0.78     50998
           1       0.63      0.61      0.62     29873

    accuracy                           0.73     80871
   macro avg       0.71      0.70      0.70     80871
weighted avg       0.72      0.73      0.73     80871



In [8]:
print(X_train.shape)

(323480, 90293)


In [9]:
from scipy.sparse import hstack

In [10]:

data['question1'] = data['question1'].apply(preprocess_text)
data['question2'] = data['question2'].apply(preprocess_text)

In [11]:
# vectorizer1 = CountVectorizer()
# vectorizer2 = CountVectorizer()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer1 = TfidfVectorizer()
vectorizer2 = TfidfVectorizer()
# Fit and transform the questions
X1 = vectorizer1.fit_transform(data['question1'])
X2 = vectorizer2.fit_transform(data['question2'])

In [12]:
# Calculate the absolute difference in lengths of questions
data['length_diff'] = abs(data['question1'].str.split().str.len() - data['question2'].str.split().str.len())


In [13]:
# Convert length difference to a sparse matrix and concatenate with BoW features
from scipy.sparse import csr_matrix
length_diff_sparse = csr_matrix(data['length_diff']).transpose()

X = hstack([X1, X2, length_diff_sparse])


In [14]:
y = data['is_duplicate']

In [15]:
# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.78      0.79      0.78     50998
           1       0.63      0.61      0.62     29873

    accuracy                           0.73     80871
   macro avg       0.71      0.70      0.70     80871
weighted avg       0.72      0.73      0.73     80871



Example of vectorizer

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Train
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.77      0.85      0.81     50998
           1       0.68      0.56      0.62     29873

    accuracy                           0.74     80871
   macro avg       0.72      0.70      0.71     80871
weighted avg       0.74      0.74      0.74     80871



In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# List of text documents
documents = ["Cat cat sat on the mat", "The cat is named Fluffy", "Fluffy is not a cat but a dog"]

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the documents
X = vectorizer.fit_transform(documents)

# Convert the result to an array
print(X.toarray())

# Get the feature names
print(vectorizer.get_feature_names_out())


[[0 2 0 0 0 1 0 0 1 1 1]
 [0 1 0 1 1 0 1 0 0 0 1]
 [1 1 1 1 1 0 0 1 0 0 0]]
['but' 'cat' 'dog' 'fluffy' 'is' 'mat' 'named' 'not' 'on' 'sat' 'the']
