# Data Pre- Processing for NLP

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
pd.set_option("display.max_columns",None)

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import time # Import the time module

import warnings
warnings.filterwarnings('default')
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)


In [2]:
data = pd.read_csv('EDA_filtered_Rating_Amazon_data.csv')

In [3]:
# Step1 defining features and target

# Define the target variable (y) and features (X)
# The `Rating_Sentiment` column appears to be the target variable based on the data structure.
# Features will be all numerical columns from 'Review_str_len' to the end.
#X = data.loc[:, 'Review_str_len':'years']
# Drop non-numeric columns except target/label
X = data.drop(['Rating_Sentiment'], axis=1)
y = data['Rating_Sentiment']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (1800000, 15)
Target shape: (1800000,)


In [4]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Original classes: {le.classes_}")
print(f"Encoded labels: {np.unique(y_encoded)}\n")

Original classes: ['Negative' 'Neutral' 'Positive']
Encoded labels: [0 1 2]



In [5]:
#Step 2: Split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y)

print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Validation target shape: {y_val.shape}")

Training features shape: (1260000, 15)
Validation features shape: (540000, 15)
Training target shape: (1260000,)
Validation target shape: (540000,)


In [6]:
text_features = 'WordNet_Lemmatizer'
numerical_features = ['Review_str_len', 'Title_str_len', 'Review_wtoken_cnt', 'lexical_diversity', 'review_removed_cnt']

In [19]:
# --- 2. Define the shared preprocessors and pre-compute data ---

# Preprocessor for all models except Naive Bayes
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)

# Preprocessor specifically for Naive Bayes to handle non-negative inputs
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', MinMaxScaler(), numerical_features) # Using MinMaxScaler to prevent negative values
    ],
    remainder='drop'
)

# --- 2.1 Pre-compute the transformed data ONCE ---

print("Pre-computing data transformations...")

# Data for Implementation 1: TF-IDF + StandardScaler
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Data for Implementation 2 & 3: TF-IDF + StandardScaler + PCA
# We must first fit the preprocessor, then the PCA on the preprocessed data
pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=83)),
])
X_train_pca = pipeline_PCA.fit_transform(X_train, y_train)
X_val_pca = pipeline_PCA.transform(X_val)

# Data for Naive Bayes models (pre-computed separately to handle non-negativity)
X_train_nb = preprocessor_nb.fit_transform(X_train)
X_val_nb = preprocessor_nb.transform(X_val)

Pre-computing data transformations...


In [20]:
# --- 3. Create a dictionary of models to experiment with ---
models = {
    'LogisticRegression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    #'KNN': KNeighborsClassifier(),
    #'NaiveBayes': MultinomialNB()
}

# --- 4. Define parameter grids for each model ---
param_grids = {
   'LogisticRegression': {
        'model__C': [0.1, 1.0, 10.0],
        'model__solver': ['liblinear', 'lbfgs']
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2]
    },
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20, None]
    },
    'KNN': {
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance']
    },
    'NaiveBayes': {
        'model__alpha': [0.1, 0.5, 1.0]
    }
}

# Implementation 1: Pipeline(TF-IDF + StandardScaler -> Model)

In [17]:
# Implementation is split to have LogisticRegression, XGBoost, RandomForst and KNN

In [None]:

# --- 1. Basic Implementations ---

print("\n--- Implementation 1: Pipeline(TF-IDF + StandardScaler -> Model) ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    try:
        if model_name == 'NaiveBayes':
            model_instance.fit(X_train_nb, y_train)
            score = model_instance.score(X_val_nb, y_val)
        else:
            model_instance.fit(X_train_transformed, y_train)
            score = model_instance.score(X_val_transformed, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")







In [43]:

# --- 5. Implementations ---

print("\n--- Implementation 1: Pipeline(TF-IDF + StandardScaler -> Model) ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    try:
        if model_name == 'NaiveBayes':
            model_instance.fit(X_train_nb, y_train)
            score = model_instance.score(X_val_nb, y_val)
        else:
            model_instance.fit(X_train_transformed, y_train)
            score = model_instance.score(X_val_transformed, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")








--- Implementation 1: Pipeline(TF-IDF + StandardScaler -> Model) ---

Training and evaluating the  'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'RandomForest': RandomForestClassifier(random_state=42), KNN model...
Accuracy for  'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'RandomForest': RandomForestClassifier(random_state=42), KNN: 0.4560

Training and evaluating the NaiveBayes model...
Accuracy for NaiveBayes: 0.6347


# Implementation 2: Pipeline(TF-IDF + StandardScaler -> PCA -> Model)

In [23]:
print("\n--- Implementation 2: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    try:
        if model_name == 'NaiveBayes':
            # Naive Bayes and PCA are incompatible because PCA produces negative values
            print(f"Skipping {model_name} as it is incompatible with PCA.")
            continue
        
        model_instance.fit(X_train_pca, y_train)
        score = model_instance.score(X_val_pca, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")


--- Implementation 2: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) ---

Training and evaluating the LogisticRegression model...
Accuracy for LogisticRegression: 0.6112

Training and evaluating the XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy for XGBoost: 0.6220

Training and evaluating the RandomForest model...
Accuracy for RandomForest: 0.5974

Training and evaluating the KNN model...
Accuracy for KNN: 0.4680

Training and evaluating the NaiveBayes model...
Skipping NaiveBayes as it is incompatible with PCA.


# Implementation 3: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) with GridSearchCV

In [14]:
print("\n--- Implementation 3: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) with GridSearchCV ---")
best_models = {}

# The PCA pipeline is defined outside the loop
simplified_pca_pipeline = Pipeline([
    ('pca', PCA(n_components=83)),
])

for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_pca, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_pca, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 3: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) with GridSearchCV ---

Performing GridSearchCV for the LogisticRegression model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarr

Best parameters for LogisticRegression: {'model__C': 1.0, 'model__solver': 'lbfgs'}
Validation Accuracy for the best LogisticRegression model: 0.6112

Performing GridSearchCV for the XGBoost model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__n_estimators': 200}
Validation Accuracy for the best XGBoost model: 0.6267

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for RandomForest: {'model__max_depth': None, 'model__n_estimators': 200}
Validation Accuracy for the best RandomForest model: 0.6018

Performing GridSearchCV for the KNN model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for KNN: {'model__n_neighbors': 7, 'model__weights': 'distance'}
Validation Accuracy for the best KNN model: 0.4844

Performing GridSearchCV for the NaiveBayes model...
Skipping GridSearchCV for NaiveBayes as it is incompatible with PCA.


# Implementation 4: Pipeline(TF-IDF + StandardScaler -> Model) with GridSearchCV

In [None]:
print("\n--- Implementation 4: Pipeline(TF-IDF + StandardScaler -> Model) with GridSearchCV ---")
best_models = {}


for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_nb, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_nb, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 4: Pipeline(TF-IDF + StandardScaler -> Model) with GridSearchCV ---

Performing GridSearchCV for the LogisticRegression model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for LogisticRegression: {'model__C': 0.1, 'model__solver': 'lbfgs'}
Validation Accuracy for the best LogisticRegression model: 0.6613

Performing GridSearchCV for the XGBoost model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__n_estimators': 200}
Validation Accuracy for the best XGBoost model: 0.6654

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




In [16]:
# Implementation is split to have RandomForst and KNN

In [9]:
print("\n--- Implementation 4: Pipeline(TF-IDF + StandardScaler -> Model) with GridSearchCV ---")
best_models = {}


for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_nb, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_nb, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 5: Pipeline(TF-IDF Bigram and Trigram + StandardScaler -> Model) with GridSearchCV ---

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for RandomForest: {'model__max_depth': None, 'model__n_estimators': 200}
Validation Accuracy for the best RandomForest model: 0.6535

Performing GridSearchCV for the KNN model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for KNN: {'model__n_neighbors': 7, 'model__weights': 'distance'}
Validation Accuracy for the best KNN model: 0.4952


# Implmenetation with Bigram and Trigram

In [7]:
# --- 2. Define the shared preprocessors and pre-compute data ---

# Preprocessor for all models except Naive Bayes
preprocessor = ColumnTransformer(
    transformers=[
        # Updated TfidfVectorizer to use bigrams and trigrams
        ('text', TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(2, 3)), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)

# Preprocessor specifically for Naive Bayes to handle non-negative inputs
preprocessor_nb = ColumnTransformer(
    transformers=[
        # Updated TfidfVectorizer to use bigrams and trigrams
        ('text', TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(2, 3)), text_features),
        ('num', MinMaxScaler(), numerical_features) # Using MinMaxScaler to prevent negative values
    ],
    remainder='drop'
)

# --- 2.1 Pre-compute the transformed data ONCE ---

print("Pre-computing data transformations...")

# Data for Implementation 1: TF-IDF + StandardScaler
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Data for Implementation 2 & 3: TF-IDF + StandardScaler + PCA
# We must first fit the preprocessor, then the PCA on the preprocessed data
pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=83)),
])
X_train_pca = pipeline_PCA.fit_transform(X_train, y_train)
X_val_pca = pipeline_PCA.transform(X_val)

# Data for Naive Bayes models (pre-computed separately to handle non-negativity)
X_train_nb = preprocessor_nb.fit_transform(X_train)
X_val_nb = preprocessor_nb.transform(X_val)

Pre-computing data transformations...


In [8]:
# --- 3. Create a dictionary of models to experiment with ---
models = {
    'RandomForest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier()
}

# --- 4. Define parameter grids for each model ---
param_grids = {
    'LogisticRegression': {
        'model__C': [0.1, 1.0, 10.0],
        'model__solver': ['liblinear', 'lbfgs']
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2]
    },
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20, None]
    },
    'KNN': {
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance']
    },
    'NaiveBayes': {
        'model__alpha': [0.1, 0.5, 1.0]
    }
}

In [None]:
print("\n--- Implementation 5: Pipeline(TF-IDF Bigram and Trigram + StandardScaler -> Model) with GridSearchCV ---")
best_models = {}


for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_nb, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_nb, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 5: Pipeline(TF-IDF Bigram and Trigram + StandardScaler -> Model) with GridSearchCV ---

Performing GridSearchCV for the LogisticRegression model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best parameters for LogisticRegression: {'model__C': 0.1, 'model__solver': 'lbfgs'}
Validation Accuracy for the best LogisticRegression model: 0.5181

Performing GridSearchCV for the XGBoost model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__n_estimators': 200}
Validation Accuracy for the best XGBoost model: 0.5135

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




In [15]:
# Implementation is split to have RandomForst and KNN as it took longer time in local

In [9]:
print("\n--- Implementation 5: Pipeline(TF-IDF Bigram and Trigram + StandardScaler -> Model) with GridSearchCV ---")
best_models = {}


for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_nb, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_nb, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 5: Pipeline(TF-IDF Bigram and Trigram + StandardScaler -> Model) with GridSearchCV ---

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for RandomForest: {'model__max_depth': None, 'model__n_estimators': 200}
Validation Accuracy for the best RandomForest model: 0.4926

Performing GridSearchCV for the KNN model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for KNN: {'model__n_neighbors': 7, 'model__weights': 'uniform'}
Validation Accuracy for the best KNN model: 0.4730


In [10]:

print("\n--- All Implementations Complete ---")


--- All Implementations Complete ---
