# Text Data Cleaning and EDA Overview
This notebook documents the process of cleaning and exploring the Quora question pairs dataset. The goal is to identify and address common text issues to prepare the data for downstream NLP tasks. Each step below highlights a specific problem, the rationale for addressing it and the solution applied.

In [None]:
import pandas as pd
import re
import contractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
from tqdm.notebook import tqdm
import unicodedata

In [None]:
# Pandas display settings: allow up to 1000 characters per cell and 100 rows per frame
pd.options.display.max_colwidth = 1000
pd.options.display.max_rows = 100

# Fetch the NLTK 'stopwords' resource (read later via stopwords.words('english'))
nltk.download('stopwords')

# Register tqdm with pandas; the cleaning cells below call `progress_apply`
tqdm.pandas()

In [None]:
# Read the raw question pairs and cast both question columns to str in one step.
# Note: after this cast a missing value is represented by the literal string 'nan'.
df = pd.read_csv('quora.csv').astype({'question1': str, 'question2': str})

Now we will scan the dataframe to see what needs cleaning and what doesn't.

## Step 1: Detecting and Handling HTML Tags
Text data scraped from the web often contains HTML tags, which can interfere with NLP models. Here, we identify questions containing HTML and replace tags with descriptive tokens to preserve information while removing markup.

In [None]:
# Problem: HTML tags
# Detection regex: '<', an optional '/', a letter, then anything up to the next '>'
html_pattern = re.compile(r'</?[a-zA-Z][^>]*>')

# Rows whose question text contains at least one tag-like match, per column
q1_html = df[df['question1'].astype(str).apply(lambda text: html_pattern.search(text) is not None)]
q2_html = df[df['question2'].astype(str).apply(lambda text: html_pattern.search(text) is not None)]

# Display summary
print(f"Total questions with HTML tags in question1: {len(q1_html)}")
print(f"Total questions with HTML tags in question2: {len(q2_html)}")

# Side-by-side view: each column is re-indexed on its own, so a displayed row
# is NOT an original question pair
html_examples = pd.concat(
    [
        q1_html['question1'].reset_index(drop=True),
        q2_html['question2'].reset_index(drop=True),
    ],
    axis=1,
)

# Show top 25 rows
html_examples[['question1', 'question2']].head(25)


In [None]:
# Solution: encode tags : replace them with tokens.
def tag_to_token(text):
    """Replace every HTML-like tag in ``text`` with a ``<name>tag`` token.

    A tag is '<', an optional '/', a tag name, any attributes, and '>'.
    Opening and closing tags produce the same token, padded with one space
    on each side (e.g. ``<b>`` and ``</b>`` both become `` btag ``).

    The tag name is a letter followed by letters or digits, so heading tags
    keep their level: ``<h1>`` becomes ``h1tag`` rather than ``htag``.

    Args:
        text: The string to process.

    Returns:
        The string with each tag replaced by its token. Surrounding
        whitespace is not collapsed here.
    """
    return re.sub(r'</?([a-zA-Z][a-zA-Z0-9]*)[^>]*>', r' \1tag ', text)

# Tokenized copies of both question columns
for source_col, tokenized_col in (('question1', 'q1_html'), ('question2', 'q2_html')):
    df[tokenized_col] = df[source_col].astype(str).apply(tag_to_token)

# Check results: rows that still match the tag pattern after tokenization
q1_html = df[df['q1_html'].astype(str).apply(lambda text: html_pattern.search(text) is not None)]
q2_html = df[df['q2_html'].astype(str).apply(lambda text: html_pattern.search(text) is not None)]
print(f"Total questions with HTML tags in question1: {len(q1_html)}")
print(f"Total questions with HTML tags in question2: {len(q2_html)}")
html_examples = pd.concat(
    [
        q1_html['q1_html'].reset_index(drop=True),
        q2_html['q2_html'].reset_index(drop=True),
    ],
    axis=1,
)
html_examples[['q1_html', 'q2_html']].head(25)

## Step 2: Expanding Contractions and Removing Possessives
Contractions and possessives can introduce inconsistencies in text analysis. This step expands contractions (e.g., "can't" to "cannot") and removes possessive forms.

In [None]:
# Problem: contractions, possessives, apostrophes in general

# Filter rows that contain common contractions.
# The pattern has two branches, both closed by a trailing word boundary:
#   1. whole contractions and apostrophe suffixes ('re, 've, 'll, 'd, 's),
#      which must start at a word boundary;
#   2. the suffix n't with NO leading boundary. Inside "don't" or "isn't" the
#      'n' follows another letter, so a leading \b would never match there
#      and negative contractions would go uncounted.
contractions_pattern = r"(?:\b(?:I'm|you're|he's|she's|it's|we're|they're|I've|you've|they've|I'd|you'd|we'd|I'll|you'll|won't|can't|'re|'ve|'ll|'d|'s)|n't)\b"
contractions_q1 = df[df['question1'].str.contains(contractions_pattern, case=False, na=False)]
contractions_q2 = df[df['question2'].str.contains(contractions_pattern, case=False, na=False)]

# Display summary
print(f"Total questions with contractions in question1: {len(contractions_q1)}")
print(f"Total questions with contractions in question2: {len(contractions_q2)}")

# Concatenate for side-by-side view (not preserving pairs)
contraction_examples = pd.concat([contractions_q1['question1'].reset_index(drop=True), contractions_q2['question2'].reset_index(drop=True)], axis=1)

# Show top 25 rows
contraction_examples[['question1', 'question2']].head(25)

In [None]:
# Solution: Expand contractions

def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

# Expanded copies of both question columns
for source_col, expanded_col in (('question1', 'q1_expanded'), ('question2', 'q2_expanded')):
    df[expanded_col] = df[source_col].apply(expand_contractions)

# Optionally remove possessives completely
def remove_possesives(text):
    """Delete every ``'s`` suffix (any letter case) from ``text``.

    Only an apostrophe-s that directly follows a word character and ends at a
    word boundary is removed; a bare trailing apostrophe (``students'``) is
    left as it is.
    """
    possessive_suffix = re.compile(r"(?i)\b's\b")
    return possessive_suffix.sub("", text)

# Strip possessives from the already-expanded text, in place on both columns
for expanded_col in ('q1_expanded', 'q2_expanded'):
    df[expanded_col] = df[expanded_col].apply(remove_possesives)

# Check results: rows that still match the contraction pattern
contractions_q1 = df[df['q1_expanded'].str.contains(contractions_pattern, case=False, na=False)]
contractions_q2 = df[df['q2_expanded'].str.contains(contractions_pattern, case=False, na=False)]
print(f"Total questions with contractions in question1: {len(contractions_q1)}")
print(f"Total questions with contractions in question2: {len(contractions_q2)}")
contraction_examples = pd.concat(
    [
        contractions_q1['q1_expanded'].reset_index(drop=True),
        contractions_q2['q2_expanded'].reset_index(drop=True),
    ],
    axis=1,
)
contraction_examples[['q1_expanded', 'q2_expanded']].head(25)


## Step 3: Normalizing Unicode and Removing Non-ASCII Characters
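Text data may contain accented or non-standard Unicode characters, which can cause issues for tokenization and modeling. This step normalizes text to a standard Unicode form, strips diacritics (e.g., "é" becomes "e"), and replaces curly quotes and en/em dashes with their ASCII equivalents. Other non-ASCII characters are left in place, so the follow-up check may still report some.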

In [None]:
# Problem: Non-ASCII (e.g., accented or unusual symbols)

# Any single character outside the 0x00-0x7F (basic ASCII) range
non_ascii_pattern = re.compile(r'[^\x00-\x7F]')

# Rows containing at least one such character, per column
q1_unicode = df[df['question1'].astype(str).apply(lambda text: non_ascii_pattern.search(text) is not None)]
q2_unicode = df[df['question2'].astype(str).apply(lambda text: non_ascii_pattern.search(text) is not None)]

# Display summary
print(f"Total questions with Non-ASCII in question1: {len(q1_unicode)}")
print(f"Total questions with Non-ASCII in question2: {len(q2_unicode)}")

# Side-by-side view: each column is re-indexed on its own, so a displayed row
# is NOT an original question pair
unicode_examples = pd.concat(
    [
        q1_unicode['question1'].reset_index(drop=True),
        q2_unicode['question2'].reset_index(drop=True),
    ],
    axis=1,
)

# Show top 25 rows
unicode_examples[['question1', 'question2']].head(25)


In [None]:
# Solution: Normalize characters to follow Unicode

def normalize_unicode(text, remove_accents=True):
    """Normalize ``text`` to a canonical Unicode form with ASCII punctuation.

    Steps, in order:
      1. NFKD-decompose the text.
      2. If ``remove_accents`` is true, drop every combining character.
      3. Re-compose with NFC.
      4. Map curly quotes and en/em dashes to their ASCII counterparts.
      5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The string to normalize.
        remove_accents: Whether to strip combining marks (default True).

    Returns:
        The normalized string.
    """
    decomposed = unicodedata.normalize('NFKD', text)

    if remove_accents:
        # A combining class of 0 means "not a combining mark": keep those only
        decomposed = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    recomposed = unicodedata.normalize('NFC', decomposed)

    # Typographic punctuation -> plain ASCII, one character at a time
    ascii_punctuation = str.maketrans({
        '“': '"',
        '”': '"',
        "‘": "'",
        "’": "'",
        "–": "-",
        "—": "-",
    })
    recomposed = recomposed.translate(ascii_punctuation)

    single_spaced = re.sub(r'\s+', ' ', recomposed)
    return single_spaced.strip()

# Normalized copies of both question columns
for source_col, normalized_col in (('question1', 'q1_unicode'), ('question2', 'q2_unicode')):
    df[normalized_col] = df[source_col].apply(normalize_unicode)

# Check results: rows that still contain a non-ASCII character
q1_unicode = df[df['q1_unicode'].astype(str).apply(lambda text: non_ascii_pattern.search(text) is not None)]
q2_unicode = df[df['q2_unicode'].astype(str).apply(lambda text: non_ascii_pattern.search(text) is not None)]
print(f"Total questions with Non-ASCII in question1: {len(q1_unicode)}")
print(f"Total questions with Non-ASCII in question2: {len(q2_unicode)}")
unicode_examples = pd.concat(
    [
        q1_unicode['q1_unicode'].reset_index(drop=True),
        q2_unicode['q2_unicode'].reset_index(drop=True),
    ],
    axis=1,
)
unicode_examples[['q1_unicode', 'q2_unicode']].head(25)


## Step 4: Handling Currency Symbols
Currency symbols may not be handled well by standard tokenizers. Here we replace common currency symbols with their corresponding acronyms (e.g., $ to USD).

In [None]:
# Problem: Currency symbols

# Character class of the currency symbols to look for (extend as needed)
currency_symbols = r'[$€£¥₹₩₽₺฿₫₪₴₦]'

# Rows containing at least one of those symbols, per column
q1_has_currency = df['question1'].astype(str).str.contains(currency_symbols, regex=True)
q2_has_currency = df['question2'].astype(str).str.contains(currency_symbols, regex=True)
q1_currency = df[q1_has_currency]
q2_currency = df[q2_has_currency]

# Display summary
print(f"Total questions with currency symbols in question1: {len(q1_currency)}")
print(f"Total questions with currency symbols in question2: {len(q2_currency)}")

# Side-by-side view: each column is re-indexed on its own, so a displayed row
# is NOT an original question pair
currency_examples = pd.concat(
    [
        q1_currency['question1'].reset_index(drop=True),
        q2_currency['question2'].reset_index(drop=True),
    ],
    axis=1,
)

# Show top 25 rows
currency_examples[['question1', 'question2']].head(25)

In [None]:
# Solution: replace with acronyms

def convert_currency_symbols(text):
    """Replace currency symbols in ``text`` with ISO 4217 currency codes.

    Each symbol is replaced by its code padded with spaces, so ``$5`` becomes
    ``USD 5``. Whitespace runs are then collapsed to a single space and the
    result is trimmed at both ends.

    The map covers every symbol in the notebook's currency detection
    character class, so the post-conversion check has nothing left to find.

    Args:
        text: The string to process.

    Returns:
        The string with known currency symbols converted and spacing cleaned.
    """
    currency_map = {
        '$': 'USD',
        '€': 'EUR',
        '£': 'GBP',
        '¥': 'JPY',
        '₹': 'INR',
        '₩': 'KRW',
        '₽': 'RUB',
        '₺': 'TRY',
        '฿': 'THB',
        '₫': 'VND',
        '₪': 'ILS',
        '₴': 'UAH',
        '₦': 'NGN',
    }
    # Replace each currency symbol with its code; the padding keeps the code
    # from fusing with an adjacent number or word
    for symbol, code in currency_map.items():
        text = text.replace(symbol, f' {code} ')
    return re.sub(r'\s+', ' ', text).strip()  # Clean up spacing

# Converted copies of both question columns
for source_col, converted_col in (('question1', 'q1_currency'), ('question2', 'q2_currency')):
    df[converted_col] = df[source_col].apply(convert_currency_symbols)

# Check results: rows that still contain one of the detected currency symbols
q1_currency = df[df['q1_currency'].astype(str).str.contains(currency_symbols, regex=True)]
q2_currency = df[df['q2_currency'].astype(str).str.contains(currency_symbols, regex=True)]
print(f"Total questions with currency symbols in question1: {len(q1_currency)}")
print(f"Total questions with currency symbols in question2: {len(q2_currency)}")
currency_examples = pd.concat(
    [
        q1_currency['q1_currency'].reset_index(drop=True),
        q2_currency['q2_currency'].reset_index(drop=True),
    ],
    axis=1,
)
currency_examples[['q1_currency', 'q2_currency']].head(25)

## Step 5: Comprehensive Cleaning Functions
After addressing individual issues, we define comprehensive cleaning functions that combine all previous steps. Multiple cleaning strategies are created (squeaky, light, transformer) to suit different modeling needs, from aggressive cleaning to minimal.

In [None]:
# The usual NLP preprocessing steps

# Shared resources for the helpers below: a Porter stemmer instance and the
# English stopword list held as a set for fast membership tests
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def strip_whitespace(text):
  """Collapse each whitespace run in ``text`` to one space and trim both ends."""
  single_spaced = re.sub(r'\s+', ' ', text)
  return single_spaced.strip()

def remove_punctuation(text):
  """Replace every character that is neither a word character nor whitespace with a space.

  Underscores count as word characters and are kept. Each replaced character
  yields its own space; runs of spaces are not collapsed here.
  """
  not_word_or_space = re.compile(r'[^\w\s]')
  return not_word_or_space.sub(' ', text)

def remove_stopwords(text):
  """Drop whitespace-separated tokens of ``text`` that are in ``stop_words``.

  Membership is an exact, case-sensitive match against the module-level
  ``stop_words`` set. Surviving tokens are re-joined with single spaces.
  """
  kept = (token for token in text.split() if token not in stop_words)
  return ' '.join(kept)

def stem_words(text):
  """Stem each whitespace-separated token of ``text`` with the module-level ``stemmer``.

  The stemmed tokens are re-joined with single spaces.
  """
  return ' '.join(map(stemmer.stem, text.split()))


In [None]:
# Final total and complete cleaning

def squeaky_cleaning(text):
  """Most aggressive cleaning strategy: every step, including stemming.

  Steps run in this order: HTML tags to tokens, Unicode normalization,
  contraction expansion, possessive removal, currency symbols to codes,
  lowercasing, punctuation removal, stopword removal, stemming, and a final
  whitespace cleanup.
  """
  pipeline = (
    tag_to_token,
    normalize_unicode,
    expand_contractions,
    remove_possesives,
    convert_currency_symbols,
    str.lower,
    remove_punctuation,
    remove_stopwords,
    stem_words,
    strip_whitespace,
  )
  for step in pipeline:
    text = step(text)
  return text

def light_cleaning(text):
  """Intermediate cleaning strategy: same steps as the squeaky one, minus stemming.

  Steps run in this order: HTML tags to tokens, Unicode normalization,
  contraction expansion, possessive removal, currency symbols to codes,
  lowercasing, punctuation removal, stopword removal, and a final whitespace
  cleanup.
  """
  pipeline = (
    tag_to_token,
    normalize_unicode,
    expand_contractions,
    remove_possesives,
    convert_currency_symbols,
    str.lower,
    remove_punctuation,
    remove_stopwords,
    strip_whitespace,
  )
  for step in pipeline:
    text = step(text)
  return text

def transformer_cleaning(text):
  """Minimal cleaning strategy: case, punctuation and stopwords are kept.

  Steps run in this order: HTML tags to tokens, Unicode normalization,
  contraction expansion, currency symbols to codes, and a final whitespace
  cleanup.
  """
  pipeline = (
    tag_to_token,
    normalize_unicode,
    expand_contractions,
    convert_currency_symbols,
    strip_whitespace,
  )
  for step in pipeline:
    text = step(text)
  return text

# One cleaned column per (strategy, question column) pair, created in the order
# squeaky -> light -> transformer, question1 before question2
for suffix, cleaner in (
  ('squeaky', squeaky_cleaning),
  ('light', light_cleaning),
  ('transformer', transformer_cleaning),
):
  for question_col in ('question1', 'question2'):
    df[f'{question_col}_{suffix}'] = df[question_col].astype(str).progress_apply(cleaner)


Save the cleaned csv to use in the main notebook.

In [None]:
# Keep the raw questions, the three cleaned variants of each, and the label
df = df[
  ['question1', 'question2']
  + [f'question{n}_{variant}' for variant in ('squeaky', 'light', 'transformer') for n in (1, 2)]
  + ['is_duplicate']
]

# Write the result without the index column
df.to_csv('quora_cleaned.csv', index=False)

## Step 6: Inspecting Missing and Invalid Data
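After cleaning, it is useful to check for rows with missing, empty, or invalid questions. Because the question columns were cast to strings when the data was loaded, missing values appear as the literal string "nan".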

In [None]:
def missingness(df, col):
  """Print how many entries of ``df[col]`` are 'nan' strings and how many are blank.

  Values are cast to str before checking. Nothing is returned; both counts
  are written with ``print``.

  Args:
    df: DataFrame holding the column to inspect.
    col: Name of the column to inspect.
  """
  as_text = df[col].astype(str)

  # 'Fake' NaNs: entries whose text is literally "nan", in any letter case
  nan_string_count = (as_text.str.lower() == 'nan').sum()
  print(f"'nan' strings in {col}: {nan_string_count}")

  # Entries that are empty once surrounding whitespace is stripped
  blank_count = as_text.str.strip().eq('').sum()
  print(f"Empty strings in {col}: {blank_count}")

# Report on the raw questions first, then on each cleaned variant
for col in ['question1', 'question2'] + [
  f'question{n}_{variant}' for variant in ('squeaky', 'light', 'transformer') for n in (1, 2)
]:
  missingness(df, col)

In [None]:
# A question is valid when it is not the literal string 'nan' (any letter case)
mask_q1_valid = df['question1'].str.lower().ne('nan')
mask_q2_valid = df['question2'].str.lower().ne('nan')

# Keep only rows where both questions are valid
both_valid = mask_q1_valid & mask_q2_valid
df = df[both_valid]

In [None]:
# Flag rows where the squeaky-cleaned text of a question is blank
empty_q1 = df['question1_squeaky'].astype(str).str.strip().eq('')
empty_q2 = df['question2_squeaky'].astype(str).str.strip().eq('')

# A row qualifies when either of its questions is blank
empty_rows = df[empty_q1 | empty_q2]

# Show the count, then the raw text next to the cleaned text
print(f"Total rows with empty cleaned questions: {len(empty_rows)}")
empty_rows[['question1', 'question1_squeaky', 'question2', 'question2_squeaky']].head(25)


Another thing to keep in mind is questions that are too short even before cleaning, indicating possible bad quality of the raw dataset.

In [None]:
# Helper: flag texts with at most one whitespace-separated word
def is_too_short(text):
    """Return True when ``text`` is not a string or has one word or fewer.

    Words are counted by splitting on whitespace, so leading and trailing
    whitespace does not add to the count.
    """
    return not isinstance(text, str) or len(text.split()) <= 1

# Apply to both columns
short_q1 = df['question1'].apply(is_too_short)
short_q2 = df['question2'].apply(is_too_short)

# Combine masks
short_rows = df[short_q1 | short_q2]

# Display them
print(f"Total rows with very short questions: {len(short_rows)}")
short_rows[['question1', 'question2', 'is_duplicate']]


Please note that the cases identified above were intentionally kept rather than discarded, because we decided it is worth exploring how our models will handle them.

# Next Steps
The cleaned data is now ready for feature engineering and model development for duplicate question detection.