# Train Models Notebook

In this notebook, we train the models and store the fitted results on disk.

In [1]:
import os
import joblib
import pandas as pd
from tqdm import tqdm
import multiprocessing
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from utils import TextPreprocessor, FeatureGenerator, remove_nan_questions, get_param_grid

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\35796\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\35796\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# notebook-wide configuration: random seed, output folder for fitted models, dataset location
SEED = 120
MODELS_DIR = "models"
_path_folder_quora = "Datasets/QuoraQuestionPairs"

In [3]:
# make sure the folder that receives the fitted models is available,
# creating it only when it is missing
if os.path.exists(MODELS_DIR):
    print(f"Folder '{MODELS_DIR}' already exists.")
else:
    os.makedirs(MODELS_DIR)
    print(f"Folder '{MODELS_DIR}' created successfully.")

Folder 'models' already exists.


We first load the data and then split it into features and labels.

In [4]:
quora_train_csv = os.path.join(_path_folder_quora, "quora_train_data.csv")
train_df = pd.read_csv(quora_train_csv)

# features are the two question columns, the label is the duplicate flag;
# remove_nan_questions comes from utils (presumably drops pairs with a missing question - confirm there)
x_train, y_train = remove_nan_questions(
    train_df.loc[:, ["question1", "question2"]],
    train_df.loc[:, "is_duplicate"],
)

# hold out 5% of the data as test set, then 5% of the remainder as validation set;
# both splits use the same seed so they are reproducible
split_options = {"test_size": 0.05, "random_state": SEED}
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, **split_options)
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, **split_options)

## Models

### Simple Approach

Here we do some basic preprocessing, and we generate features with CountVectorizer and Horizontal stacking. We then use logistic regression as our model.

In [5]:
# baseline pipeline: lower-cased text, 'cv' features aggregated with 'stack',
# and a seeded liblinear logistic regression as classifier
simple_steps = [
    ("preprocessor", TextPreprocessor(to_lower=True)),
    ("generator", FeatureGenerator(exts=("cv",), aggs=("stack",), extra_features=())),
    ("classifier", LogisticRegression(max_iter=1000, solver="liblinear", random_state=SEED)),
]
pipe = Pipeline(simple_steps, verbose=True)

pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.6s
[Pipeline] ......... (step 2 of 3) Processing generator, total=  20.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  55.8s


In [7]:
# saving the model
# NOTE(review): the ".pk1" suffix looks like a typo for ".pkl"; it is kept unchanged
# because other code may load the model under this exact file name
simple_model_path = os.path.join(MODELS_DIR, "simple_approach.pk1")
joblib.dump(pipe, simple_model_path)

['models\\simple_approach.pk1']

### Improved approach

In the enhanced approach, we'll delineate two primary routes:

- Manual Compilation: Clearly define the preprocessing, feature generation, and classifier steps for fitting.
- Grid Search: Experiment with various parameter combinations.

In [8]:
# selects the route used for the improved approach:
#   False -> fit one manually specified pipeline
#   True  -> run the grid search over all candidate models
GRID_SEARCH: bool = False  # set to True to run the grid search

In [9]:
# defining the models to be included in the grid search
if GRID_SEARCH:
    # candidate classifiers for the grid search, in the order they will be fitted
    candidate_estimators = [
        AdaBoostClassifier(),
        RandomForestClassifier(),
        LogisticRegression(max_iter=1000, random_state=SEED),
        BernoulliNB(),
        GaussianNB(),
        KNeighborsClassifier(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        SVC(),
        GradientBoostingClassifier(),
    ]
    # every estimator is registered under its class name, which is also the
    # name later handed to get_param_grid
    models = {type(estimator).__name__: estimator for estimator in candidate_estimators}

We start with the manual compilation:

In [10]:
if not GRID_SEARCH:
    # manual route: only a single, seeded logistic regression is needed
    log_reg = LogisticRegression(max_iter=1000, random_state=SEED)
    models = {"LogisticRegression": log_reg}

In [11]:
# using count vectorizer and tf-idf with stack and absolute, logistic regression model
# manual route: full text cleaning, 'cv_2w' and 'tf_idf_2w' features aggregated
# with 'stack' and 'absolute', logistic regression as classifier
if not GRID_SEARCH:
    text_cleaner = TextPreprocessor(
        remove_stop_words=True,
        remove_punctuation=True,
        to_lower=True,
        apply_stemming=True,
        british=False,
    )
    feature_builder = FeatureGenerator(
        exts=("cv_2w", "tf_idf_2w"),
        aggs=("stack", "absolute"),
    )
    improved_steps = [
        ("preprocessor", text_cleaner),
        ("generator", feature_builder),
        ("classifier", models["LogisticRegression"]),
    ]
    pipe = Pipeline(improved_steps, verbose=True)

    # NOTE(review): the recorded output of this cell shows the solver stopping at
    # the iteration limit (max_iter); consider a higher max_iter or feature scaling
    pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 9.2min
[Pipeline] ......... (step 2 of 3) Processing generator, total= 2.4min
[Pipeline] ........ (step 3 of 3) Processing classifier, total=13.3min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# saving the model
if not GRID_SEARCH:
    # persist the manually compiled pipeline; the path is built with os.path.join,
    # like the other cells that write into MODELS_DIR, instead of a hard-coded "/"
    joblib.dump(pipe, os.path.join(MODELS_DIR, "improved_approach.joblib"))

Here is the Grid search:

In [13]:
# Grid search route: for every candidate model, tune a full
# preprocessor -> generator -> classifier pipeline and keep the best result.
if GRID_SEARCH:
    fitted_models = {}  # model name -> best pipeline found for that model
    scores = {}         # model name -> best mean cross-validated ROC AUC

    # leave one core free, but never go below one worker: on a single-core
    # machine cpu_count() - 1 is 0, which is rejected as an n_jobs value
    n_workers = max(1, multiprocessing.cpu_count() - 1)

    for name, model in tqdm(models.items()):
        # define pipeline given a model
        pipe = Pipeline([('preprocessor', TextPreprocessor()),
                         ('generator', FeatureGenerator()),
                         ('classifier', model)], verbose=True)
        # get grid of parameters to search
        grid = get_param_grid(name, SEED)
        grid_search = GridSearchCV(
            pipe,
            param_grid=grid,
            scoring="roc_auc",
            cv=2,
            verbose=10,
            n_jobs=n_workers,
            # fail loudly on a broken parameter combination instead of scoring it as NaN
            error_score="raise",
        )

        # fit grid search with pipeline and grid
        grid_search.fit(x_train, y_train)

        # keep the best pipeline and its score for the model comparison
        fitted_models[name] = grid_search.best_estimator_
        scores[name] = grid_search.best_score_

        # persist the complete search object for this model
        joblib.dump(grid_search, os.path.join(MODELS_DIR, f"fitted_{name}.pk1"))

Finally, we save the best model so that it can be loaded later when reproducing the results.

In [14]:
# Select the grid-search winner and persist it.
if GRID_SEARCH:
    # pick the entry with the highest CV score; a bare max(scores) would compare
    # the dictionary keys, i.e. return the alphabetically last model name
    best_model_name = max(scores, key=scores.get)
    best_model = fitted_models[best_model_name]
    print(f"Best model found in the grid search is {best_model_name}, with a CV score of {scores[best_model_name]:.4f}")
    fitted_pipe = best_model
    joblib.dump(fitted_pipe, f'{MODELS_DIR}/improved_solution.joblib')