## Importing Libraries

In [1]:
# Mount Google Drive (Colab runtime only); every path used below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
import os
sys.path.append('/content/drive/MyDrive/quora_duplicate_questions/src')

import joblib
import numpy as np
import pandas as pd
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from utils import evaluate_model
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Load data

In [4]:
# Folder on the mounted Drive that holds the processed data and saved artifacts.
DATA_PATH = '/content/drive/MyDrive/quora_duplicate_questions/data/processed'
# Read the zip-compressed CSV files for the training and validation splits.
df_train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv.zip'), compression='zip')
df_val = pd.read_csv(os.path.join(DATA_PATH, 'val.csv.zip'), compression='zip')

In [5]:
# Load the feature metadata stored next to the processed data.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
feature_info = joblib.load(os.path.join(DATA_PATH, 'feature_info.joblib'))

In [6]:
# numerical_features is used further down to select columns of df_train / df_val.
numerical_features = feature_info['numerical_features']
# text_features is read here but not referenced in the visible part of this notebook.
text_features = feature_info['text_features']

In [7]:
# Target column ('is_duplicate') for each split.
train_target = df_train['is_duplicate']
val_target = df_val['is_duplicate']

## Prepare text features

In [8]:
def prepare_vectorizer(
        df,
        text_col1, text_col2,
        **vectorizer_params):
    """
    Fit a TfidfVectorizer on the text of both question columns.

    Args:
        df (pd.DataFrame): DataFrame containing the text data.
        text_col1 (str): Name of the first text column.
        text_col2 (str): Name of the second text column.
        **vectorizer_params: Keyword arguments forwarded to TfidfVectorizer;
            they override the default ``max_features=10000``.
    Returns:
        TfidfVectorizer: Vectorizer fitted on the values of both columns.
    """
    # Caller-supplied settings take precedence over the default vocabulary cap.
    params = {'max_features': 10000, **vectorizer_params}

    # Stack both columns into one corpus; missing questions become empty strings.
    corpus = pd.concat([df[text_col1], df[text_col2]]).fillna('')

    # fit() returns the fitted vectorizer itself.
    return TfidfVectorizer(**params).fit(corpus)

In [9]:
def transform(df, text_col1, text_col2, vectorizer):
    """Vectorize both text columns of ``df`` with an already fitted vectorizer.

    Missing values are replaced by empty strings before transforming.
    Returns a pair ``(q1_vec, q2_vec)`` in the order of the column arguments.
    """
    return tuple(
        vectorizer.transform(df[column].fillna(''))
        for column in (text_col1, text_col2)
    )


def combine_features(df, numerical_features, q1_vec, q2_vec):
    """Stack the numerical columns of ``df`` and both question matrices side by side.

    The selected numerical columns are converted to a CSR matrix and placed
    first, followed by ``q1_vec`` and then ``q2_vec``. The horizontally
    stacked sparse matrix is returned.
    """
    numeric_block = csr_matrix(df[numerical_features])
    blocks = [numeric_block, q1_vec, q2_vec]
    return hstack(blocks)

## TF-IDF Vectorizer

In [None]:
# Fit the TF-IDF vocabulary on the training questions only.
vectorizer = prepare_vectorizer(
    df_train, 'question1_clean', 'question2_clean', max_features=10000
)

# Vectorize each question column of the training split with the fitted vocabulary.
q1_vec, q2_vec = transform(df_train, 'question1_clean', 'question2_clean', vectorizer)

# Final training matrix: numerical features followed by both TF-IDF blocks.
train_inputs = combine_features(df_train, numerical_features, q1_vec, q2_vec)

In [None]:
# Vectorize the validation split with the vectorizer fitted on the training data.
q1_vec_val, q2_vec_val = transform(
    df_val, 'question1_clean', 'question2_clean', vectorizer
)

# Validation matrix, assembled in the same column order as the training matrix.
val_inputs = combine_features(df_val, numerical_features, q1_vec_val, q2_vec_val)

### Logistic Regression

In [None]:
# Class-weighted logistic regression on the combined numerical + TF-IDF features.
model_lr = LogisticRegression(
    class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=7
)
model_lr.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_lr
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.452011036119724
Log loss on Validation set: 0.46430953975827727


In [None]:
# Start the experiment log with the logistic-regression scores.
results = {
    'model': 'Logistic Regression',
    'log_loss_train': log_loss_train,
    'log_loss_val': log_loss_val,
}
experiment_results = [results]

### Random Forest

In [None]:
# Random forest with 10 trees and no depth or leaf limit.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.11558998693332646
Log loss on Validation set: 0.6956181623169458


In [None]:
# Random forest with tree depth capped at 10.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=10,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.6293307756923079
Log loss on Validation set: 0.6287038490071538


In [None]:
# Random forest with tree depth capped at 20.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.5904949825153363
Log loss on Validation set: 0.5928795746460779


In [None]:
# Random forest with at most 1000 leaf nodes per tree.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    max_leaf_nodes=1000,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.4880623920738313
Log loss on Validation set: 0.49830950170711463


In [None]:
# Random forest with at most 1500 leaf nodes per tree.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    max_leaf_nodes=1500,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.47754872585050623
Log loss on Validation set: 0.4940950501718493


In [None]:
# Random forest with at most 1750 leaf nodes per tree; the scores printed here
# are the ones stored as the 'Random Forest' entry in the next cell.
# random_state is pinned to 7, the seed used by the other models in this
# notebook, so the reported log loss is reproducible across runs.
model_rf = RandomForestClassifier(
    n_estimators=10,
    max_leaf_nodes=1750,
    class_weight='balanced',
    random_state=7,
    n_jobs=-1
).fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_rf
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.46324403501189154
Log loss on Validation set: 0.48273726606001555


In [None]:
# Log the scores of the most recently evaluated random forest.
results = {
    'model': 'Random Forest',
    'log_loss_train': log_loss_train,
    'log_loss_val': log_loss_val,
}
experiment_results.append(results)

In [None]:
# Tabulate the results collected so far (one row per logged model).
pd.DataFrame(experiment_results)

Unnamed: 0,model,log_loss_train,log_loss_val
0,Logistic Regression,0.452011,0.46431
1,Random Forest,0.463244,0.482737


In [None]:
# Persist the results list under DATA_PATH; later cells reload it from this file.
joblib.dump(experiment_results, os.path.join(DATA_PATH, 'experiment_results.joblib'))

['/content/drive/MyDrive/quora_duplicate_questions/data/processed/experiment_results.joblib']

### XGBoost

In [None]:
# Ratio of negative (0) to positive (1) training labels, rounded to 4 decimals;
# passed to XGBClassifier as scale_pos_weight below.
# Vectorized .sum() counts the boolean masks without a Python-level loop over every label.
scale_pos_weight = np.round((train_target == 0).sum() / (train_target == 1).sum(), 4)
scale_pos_weight

np.float64(1.7086)

In [None]:
# XGBoost with 10 boosting rounds and the library's default tree depth.
model_xgb = XGBClassifier(
    tree_method='hist',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    n_estimators=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=7,
)
model_xgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_xgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.5578148240183674
Log loss on Validation set: 0.5566084858130264


In [None]:
# XGBoost with 10 boosting rounds and max_depth=8.
model_xgb = XGBClassifier(
    tree_method='hist',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    max_depth=8,
    n_estimators=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=7,
)
model_xgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_xgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.547853552248769
Log loss on Validation set: 0.5476358169741841


In [None]:
# XGBoost with 10 boosting rounds and max_depth=10; the scores printed here
# are the ones stored as the 'XGBClassifier' entry in the next cell.
model_xgb = XGBClassifier(
    tree_method='hist',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    max_depth=10,
    n_estimators=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=7,
)
model_xgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_xgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.5380278210416257
Log loss on Validation set: 0.5396793278246858


In [None]:
# Log the scores of the most recently evaluated XGBoost model.
results = {
    'model': 'XGBClassifier',
    'log_loss_train': log_loss_train,
    'log_loss_val': log_loss_val,
}
experiment_results.append(results)

In [None]:
# Overwrite the persisted results file with the list that now includes the XGBoost entry.
joblib.dump(experiment_results, os.path.join(DATA_PATH, 'experiment_results.joblib'))

['/content/drive/MyDrive/quora_duplicate_questions/data/processed/experiment_results.joblib']

### LightGBM

In [None]:
# LightGBM with 10 boosting rounds and the library's default num_leaves.
model_lgb = LGBMClassifier(
    n_estimators=10,
    learning_rate=0.1,
    is_unbalance=True,
    force_row_wise=True,
    verbose=0,
    n_jobs=-1,
    random_state=7,
)
model_lgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_lgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.5308825289028586
Log loss on Validation set: 0.5291303549941574


In [None]:
# LightGBM with 10 boosting rounds and num_leaves=150.
model_lgb = LGBMClassifier(
    num_leaves=150,
    n_estimators=10,
    learning_rate=0.1,
    is_unbalance=True,
    force_row_wise=True,
    verbose=0,
    n_jobs=-1,
    random_state=7,
)
model_lgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_lgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.5082868855534716
Log loss on Validation set: 0.5089609529088308


In [None]:
# LightGBM with 10 boosting rounds and num_leaves=500.
model_lgb = LGBMClassifier(
    num_leaves=500,
    n_estimators=10,
    learning_rate=0.1,
    is_unbalance=True,
    force_row_wise=True,
    verbose=0,
    n_jobs=-1,
    random_state=7,
)
model_lgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_lgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.4855083525869623
Log loss on Validation set: 0.4935436375046539


In [None]:
# LightGBM with 10 boosting rounds and num_leaves=750; the scores printed here
# are the ones stored as the 'LGBMClassifier' entry further down.
model_lgb = LGBMClassifier(
    num_leaves=750,
    n_estimators=10,
    learning_rate=0.1,
    is_unbalance=True,
    force_row_wise=True,
    verbose=0,
    n_jobs=-1,
    random_state=7,
)
model_lgb.fit(train_inputs, train_target)

# evaluate_model comes from utils; its two return values are reported as train / validation log loss.
log_loss_train, log_loss_val = evaluate_model(
    train_inputs, val_inputs, train_target, val_target, model_lgb
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.4757919936569115
Log loss on Validation set: 0.4888107138178132


In [None]:
# Reload the persisted results; this replaces whatever list is currently in memory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts you trust.
experiment_results = joblib.load(os.path.join(DATA_PATH, 'experiment_results.joblib'))
experiment_results

[{'model': 'Logistic Regression',
  'log_loss_train': 0.452011036119724,
  'log_loss_val': 0.46430953975827727},
 {'model': 'Random Forest',
  'log_loss_train': 0.46324403501189154,
  'log_loss_val': 0.48273726606001555},
 {'model': 'XGBClassifier',
  'log_loss_train': 0.5380278210416257,
  'log_loss_val': 0.5396793278246858}]

In [None]:
# Log the scores of the most recently evaluated LightGBM model.
results = {
    'model': 'LGBMClassifier',
    'log_loss_train': log_loss_train,
    'log_loss_val': log_loss_val,
}
experiment_results.append(results)

In [None]:
# Overwrite the persisted results file with the list that now includes the LightGBM entry.
joblib.dump(experiment_results, os.path.join(DATA_PATH, 'experiment_results.joblib'))

['/content/drive/MyDrive/quora_duplicate_questions/data/processed/experiment_results.joblib']

In [None]:
# Side-by-side comparison of every model logged so far.
pd.DataFrame(experiment_results)

Unnamed: 0,model,log_loss_train,log_loss_val
0,Logistic Regression,0.452011,0.46431
1,Random Forest,0.463244,0.482737
2,XGBClassifier,0.538028,0.539679
3,LGBMClassifier,0.475792,0.488811


Logistic Regression gives the best result. Tuning the hyperparameters of LGBMClassifier may give better performance.

## TF-IDF for matching words

In [10]:
def get_match_words(q1, q2):
    """
    Find the words shared by two questions.

    Args:
        q1: first question (text whose words are separated by whitespace)
        q2: second question (text whose words are separated by whitespace)
    Return:
        (match_words, count_match_words): the distinct shared words joined by
        single spaces in sorted order ('' when there are none), and the number
        of shared words. If either argument is not a string (e.g. NaN or None
        for a missing question), ('', 0) is returned.
    """
    # Missing questions arrive as non-string values; treat the pair as having
    # no overlap instead of hiding every possible error behind a bare except.
    if not isinstance(q1, str) or not isinstance(q2, str):
        return '', 0

    # Sorting makes the joined string deterministic; set iteration order for
    # strings can differ between interpreter runs.
    match_words_list = sorted(set(q1.split()) & set(q2.split()))

    # ' '.join([]) is '', which covers the no-overlap case.
    return ' '.join(match_words_list), len(match_words_list)

In [11]:
def train_evaluate_model(X_train, X_val, y_train, y_val, features, vectorizer, model):
    """
    Vectorize one text column, train the model on it and evaluate it.

    Args:
        X_train: training data (indexable by ``features``)
        X_val: validation data (indexable by ``features``)
        y_train: training labels
        y_val: validation labels
        features: key of the text column passed to the vectorizer
        vectorizer: unfitted vectorizer; fitted in place on the training column
        model: estimator; fitted in place on the vectorized training data
    Return:
        The two values returned by ``evaluate_model`` (imported from utils),
        used by the callers as log loss on the training and validation data.
    """
    # Fit on the training text and vectorize it in one pass instead of fit + transform.
    train_matrix = vectorizer.fit_transform(X_train[features])
    # The validation text only goes through the vocabulary fitted on training data.
    val_matrix = vectorizer.transform(X_val[features])

    # Train the model
    model.fit(train_matrix, y_train)

    # Score the fitted model on both splits.
    log_loss_train, log_loss_val = evaluate_model(
        train_matrix, val_matrix, y_train, y_val, model
    )

    return log_loss_train, log_loss_val

In [12]:
# One-column frames holding the words shared by each question pair
# (first element of get_match_words' return value), indexed like the source split.
X_train = df_train.apply(
    lambda row: get_match_words(row['question1_clean'], row['question2_clean'])[0],
    axis=1,
).to_frame('match_words')
X_val = df_val.apply(
    lambda row: get_match_words(row['question1_clean'], row['question2_clean'])[0],
    axis=1,
).to_frame('match_words')

# Labels for both splits.
y_train = df_train['is_duplicate']
y_val = df_val['is_duplicate']

In [13]:
# TF-IDF restricted to the shared words, followed by a class-weighted logistic regression.
vectorizer = TfidfVectorizer(
    analyzer='word', lowercase=True, stop_words='english', max_features=10000
)
model = LogisticRegression(
    class_weight='balanced', max_iter=1000, n_jobs=-1, random_state=7
)

log_loss_train, log_loss_val = train_evaluate_model(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    features='match_words',
    vectorizer=vectorizer,
    model=model,
)
print('Log loss on Train set:', log_loss_train)
print('Log loss on Validation set:', log_loss_val)

Log loss on Train set: 0.517737791188619
Log loss on Validation set: 0.5385623392721174


TF-IDF only on matching words performs worse than TF-IDF on all words.

In [15]:
# Reload the persisted results so this section also works when the earlier
# model sections were not run in the current session.
experiment_results = joblib.load(os.path.join(DATA_PATH, 'experiment_results.joblib'))
results = {}
# Label corrected from the misspelled 'Logistic Regressin ...'.
results['model'] = 'Logistic Regression with TF-IDF on matching words'
results['log_loss_train'] = np.round(log_loss_train, 5)
results['log_loss_val'] = np.round(log_loss_val, 5)

# Drop entries left by earlier runs of this cell (including the old misspelled
# label) so re-running does not pile up duplicate rows in the persisted file.
experiment_results = [
    entry for entry in experiment_results
    if entry.get('model') not in (
        results['model'],
        'Logistic Regressin with TF-IDF on matching words',
    )
]
experiment_results.append(results)

joblib.dump(experiment_results, os.path.join(DATA_PATH, 'experiment_results.joblib'))

['/content/drive/MyDrive/quora_duplicate_questions/data/processed/experiment_results.joblib']