# Data Pre- Processing for NLP

In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
pd.set_option("display.max_columns",None)

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

import time # Import the time module

import warnings
warnings.filterwarnings('default')
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)


In [2]:
data = pd.read_csv('EDA_filtered_Rating_Amazon_data.csv')

In [3]:
# Step1 defining features and target

# Define the target variable (y) and features (X)
# The `Rating_Sentiment` column appears to be the target variable based on the data structure.
# Features will be all numerical columns from 'Review_str_len' to the end.
#X = data.loc[:, 'Review_str_len':'years']
# Drop non-numeric columns except target/label
X = data.drop(['Rating_Sentiment'], axis=1)
y = data['Rating_Sentiment']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

Features shape: (1800000, 15)
Target shape: (1800000,)


In [4]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)
print(f"Original classes: {le.classes_}")
print(f"Encoded labels: {np.unique(y_encoded)}\n")

Original classes: ['Negative' 'Neutral' 'Positive']
Encoded labels: [0 1 2]



In [6]:
#Step 2: Split the data into training and validation sets

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y)

print(f"Training features shape: {X_train.shape}")
print(f"Validation features shape: {X_val.shape}")
print(f"Training target shape: {y_train.shape}")
print(f"Validation target shape: {y_val.shape}")

Training features shape: (1260000, 15)
Validation features shape: (540000, 15)
Training target shape: (1260000,)
Validation target shape: (540000,)


In [10]:
text_features = 'WordNet_Lemmatizer'
numerical_features = ['Review_str_len', 'Title_str_len', 'Review_wtoken_cnt', 'lexical_diversity', 'review_removed_cnt']

In [11]:
# --- 2. Define the shared preprocessors and pre-compute data ---

# Preprocessor for all models except Naive Bayes
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)

# Preprocessor specifically for Naive Bayes to handle non-negative inputs
preprocessor_nb = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', MinMaxScaler(), numerical_features) # Using MinMaxScaler to prevent negative values
    ],
    remainder='drop'
)

# --- 2.1 Pre-compute the transformed data ONCE ---

print("Pre-computing data transformations...")

# Data for Implementation 1: TF-IDF + StandardScaler
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Data for Implementation 2 & 3: TF-IDF + StandardScaler + PCA
# We must first fit the preprocessor, then the PCA on the preprocessed data
pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=83)),
])
X_train_pca = pipeline_PCA.fit_transform(X_train, y_train)
X_val_pca = pipeline_PCA.transform(X_val)

# Data for Naive Bayes models (pre-computed separately to handle non-negativity)
X_train_nb = preprocessor_nb.fit_transform(X_train)
X_val_nb = preprocessor_nb.transform(X_val)

Pre-computing data transformations...


In [14]:
# --- 3. Create a dictionary of models to experiment with ---
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'RandomForest': RandomForestClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
    'NaiveBayes': MultinomialNB()
}

# --- 4. Define parameter grids for each model ---
param_grids = {
    'LogisticRegression': {
        'model__C': [0.1, 1.0, 10.0],
        'model__solver': ['liblinear', 'lbfgs']
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2]
    },
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20, None]
    },
    'KNN': {
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance']
    },
    'NaiveBayes': {
        'model__alpha': [0.1, 0.5, 1.0]
    }
}

In [None]:

# --- 5. Implementations ---

print("\n--- Implementation 1: Pipeline(TF-IDF + StandardScaler -> Model) ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    try:
        if model_name == 'NaiveBayes':
            model_instance.fit(X_train_nb, y_train)
            score = model_instance.score(X_val_nb, y_val)
        else:
            model_instance.fit(X_train_transformed, y_train)
            score = model_instance.score(X_val_transformed, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")







In [None]:
print("\n--- Implementation 2: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    try:
        if model_name == 'NaiveBayes':
            # Naive Bayes and PCA are incompatible because PCA produces negative values
            print(f"Skipping {model_name} as it is incompatible with PCA.")
            continue
        
        model_instance.fit(X_train_pca, y_train)
        score = model_instance.score(X_val_pca, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")

In [None]:
print("\n--- Implementation 3: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) with GridSearchCV ---")
best_models = {}

# The PCA pipeline is defined outside the loop
simplified_pca_pipeline = Pipeline([
    ('pca', PCA(n_components=83)),
])

for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for GridSearchCV
    # The preprocessor is no longer in the pipeline! It's pre-computed.
    full_pipeline_gs = Pipeline([
        ('model', model_instance)
    ])

    try:
        # Check for NaiveBayes and handle its incompatibility with PCA
        if model_name == 'NaiveBayes':
            print(f"Skipping GridSearchCV for {model_name} as it is incompatible with PCA.")
            continue

        grid_search = GridSearchCV(
            full_pipeline_gs, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=4,  # Use a specific number of jobs to prevent memory overload
            verbose=1
        )
        
        # Fit GridSearchCV on the pre-computed PCA data
        grid_search.fit(X_train_pca, y_train)
        
        best_models[model_name] = grid_search.best_estimator_
        
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        best_score = grid_search.score(X_val_pca, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")


--- Implementation 3: Pipeline(TF-IDF + StandardScaler -> PCA -> Model) with GridSearchCV ---

Performing GridSearchCV for the LogisticRegression model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_pred

Best parameters for LogisticRegression: {'model__C': 10.0, 'model__solver': 'lbfgs'}
Validation Accuracy for the best LogisticRegression model: 0.6111

Performing GridSearchCV for the XGBoost model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best parameters for XGBoost: {'model__learning_rate': 0.2, 'model__n_estimators': 200}
Validation Accuracy for the best XGBoost model: 0.6267

Performing GridSearchCV for the RandomForest model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




Best parameters for RandomForest: {'model__max_depth': None, 'model__n_estimators': 200}
Validation Accuracy for the best RandomForest model: 0.6028

Performing GridSearchCV for the KNN model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




In [None]:

print("\n--- All Implementations Complete ---")

# Modelling

## Traditional Machine Learning Model

### Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression 
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

import matplotlib.pyplot as plt

#Model intialization
#lr_model = LogisticRegression()

# Training the model
#lr_model.fit(X_train,y_train)
#print("Number of iterations performed:", lr_model.n_iter_)

# Make predictions and evaluate
#y_pred = lr_model.predict(X_val)

# Applying Tfidf to word column  -  text_features, applying StandardScaler for other numerical features.

In [2]:
# --- 3. Define the shared preprocessing and PCA steps ---
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)
"""
final_n_components = 83

pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=final_n_components)),
]) """

# --- 4. Create a dictionary of models to experiment with ---
models = {
    'LogisticRegression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': ComplementNB(),
    'KNN': KNeighborsClassifier()
}

# --- 5. Loop through the models, create a pipeline, fit, and score ---
print("--- Starting Model Experimentation ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    
    # Create the full pipeline for the current model
    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model_instance)
    ])
    
    try:
        # Fit the pipeline on the training data
        full_pipeline.fit(X_train, y_train)
        
        # Evaluate the model on the validation data
        score = full_pipeline.score(X_val, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")


NameError: name 'ColumnTransformer' is not defined

# Applying PCA to evaluate potential improvements in efficiency.

In [None]:


# --- 3. Define the shared preprocessing and PCA steps ---
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=500, stop_words='english'), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)

final_n_components = 83

pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=final_n_components)),
])

# --- 4. Create a dictionary of models to experiment with ---
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42),
    'RandomForest': RandomForestClassifier(random_state=42),
    # Note: Naive Bayes works best with non-negative data. PCA can produce negative values, so this may not work.
    #'NaiveBayes': ComplementNB(), 
    # KNN is very slow on high-dimensional data, so it is commented out for faster execution.
    # 'KNN': KNeighborsClassifier(n_neighbors=5)
}

# --- 5. Loop through the models, create a pipeline, fit, and score ---
print("--- Starting Model Experimentation ---")
for model_name, model_instance in models.items():
    print(f"\nTraining and evaluating the {model_name} model...")
    
    # Create the full pipeline for the current model
    full_pipeline = Pipeline([
        ('preprocessing_and_pca', pipeline_PCA),
        ('model', model_instance)
    ])
    
    try:
        # Fit the pipeline on the training data
        full_pipeline.fit(X_train, y_train)
        
        # Evaluate the model on the validation data
        score = full_pipeline.score(X_val, y_val)
        
        print(f"Accuracy for {model_name}: {score:.4f}")
    
    except Exception as e:
        print(f"An error occurred while training {model_name}: {e}")


# Applying GridSearchCV to evaluate potential improvements in efficiency.

In [None]:
# --- 2. Define the shared preprocessing and PCA steps ---
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(stop_words='english'), text_features),
        ('num', StandardScaler(), numerical_features)
    ],
    remainder='drop'
)

# Note: The number of components for PCA can also be a hyperparameter to tune.
# We'll include it in the param_grids for a more thorough search.
pipeline_PCA = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=83)),
])

# --- 3. Create a dictionary of models to experiment with ---
models = {
    'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'RandomForest': RandomForestClassifier(random_state=42),
    # Note: ComplementNB works best with non-negative data. PCA can produce negative values, which may cause an error.
    # We will try it, but be aware of the potential issue. 
    'KNN': KNeighborsClassifier(),
    'NaiveBayes': MultinomialNB()
}

# --- 4. Define parameter grids for each model ---
# Use the __ (double underscore) to specify pipeline step and parameter
param_grids = {
    'LogisticRegression': {
        'model__C': [0.1, 1.0, 10.0],
        'model__solver': ['liblinear', 'lbfgs']
    },
    'XGBoost': {
        'model__n_estimators': [100, 200],
        'model__learning_rate': [0.05, 0.1, 0.2]
    },
    'RandomForest': {
        'model__n_estimators': [100, 200],
        'model__max_depth': [10, 20, None]
    },
    'KNN': {
        'model__n_neighbors': [3, 5, 7],
        'model__weights': ['uniform', 'distance']
    },
    'NaiveBayes': {
        'model__alpha': [0.1, 0.5, 1.0]
    }
}

# --- 5. Loop through the models, apply GridSearchCV, and evaluate ---
print("--- Starting Model Experimentation with GridSearchCV ---")
best_models = {}

for model_name, model_instance in models.items():
    print(f"\nPerforming GridSearchCV for the {model_name} model...")
    
    param_grid = param_grids.get(model_name, {})
    
    if not param_grid:
        print(f"No parameter grid defined for {model_name}. Skipping...")
        continue

    # Create the full pipeline for the current model
    full_pipeline = Pipeline([
        ('preprocessing_and_pca', pipeline_PCA),
        ('model', model_instance)
    ])
    
    try:
        # Create the GridSearchCV object
        grid_search = GridSearchCV(
            full_pipeline, 
            param_grid, 
            cv=3, 
            scoring='accuracy', 
            n_jobs=-1, 
            verbose=1
        )
        
        # Fit GridSearchCV on the training data
        grid_search.fit(X_train, y_train)
        
        # Store the best estimator found
        best_models[model_name] = grid_search.best_estimator_
        
        # Print the results
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        
        # Evaluate the best model on the validation data
        best_score = grid_search.score(X_val, y_val)
        print(f"Validation Accuracy for the best {model_name} model: {best_score:.4f}")
        
    except Exception as e:
        print(f"An error occurred while running GridSearchCV for {model_name}: {e}")

print("\n--- GridSearchCV Complete ---")

--- Starting Model Experimentation with GridSearchCV ---

Performing GridSearchCV for the LogisticRegression model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x 

Best parameters for LogisticRegression: {'model__C': 1.0, 'model__solver': 'lbfgs'}
Validation Accuracy for the best LogisticRegression model: 0.6303

Performing GridSearchCV for the XGBoost model...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matvec=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
  matmat=lambda x: X @ x - offset @ x,
  matmat=lambda x: X @ x - offset @ x,
  matmat

In [None]:
# --- 6. Example of using GridSearchCV to find the best hyperparameters ---
print("\n--- Starting GridSearchCV for RandomForestClassifier ---")

# Define the pipeline for GridSearch
rf_pipeline = Pipeline([
    ('preprocessing_and_pca', pipeline_PCA),
    ('model', RandomForestClassifier(random_state=42))
])

# Define the parameter grid to search over.
# Note the naming convention: 'step_name__parameter_name'
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_leaf': [1, 2]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=3,  # 3-fold cross-validation
    n_jobs=-1, # Use all available CPU cores
    verbose=2 # Verbosity level
)

try:
    # Fit the grid search to the training data
    grid_search.fit(X_train, y_train)
    
    # Print the best parameters and the best score
    print("\nBest parameters found:")
    print(grid_search.best_params_)
    print(f"\nBest cross-validation accuracy: {grid_search.best_score_:.4f}")
    
    # Evaluate the best estimator on the validation set
    best_rf_score = grid_search.best_estimator_.score(X_val, y_val)
    print(f"Validation accuracy of the best model: {best_rf_score:.4f}")

except Exception as e:
    print(f"An error occurred during Grid Search: {e}")

In [None]:
model_logistic_reg = Pipeline([
    ('pipeline_PCA',pipeline_PCA),
('logistic_regression',LogisticRegression(random_state=42)),
])
#Model intialization
lr_model = LogisticRegression()

# --- 4. Fit the entire pipeline on the training data ONLY ---
print("\nFitting the complete pipeline on the training data...")
lr_model.fit(X_train, y_train)

# --- 5. Evaluate the model on the unseen test data ---
score = lr_model.score(X_val, y_val)
print(f"Model accuracy on the test set: {score:.4f}")

In [None]:
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

# KNN (K-Nearest Neighbor)

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

#Model intialization
knn_model = KNeighborsClassifier()

# Training the model
knn_model.fit(X_train,y_train)

# Make predictions and evaluate
y_pred = lr_model.predict(X_val)

In [None]:

print("KNN Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# 1. Generate the confusion matrix
cm = confusion_matrix(y_val, y_pred)

# 2. Get the unique labels from the test set for display
class_labels = y_val.unique()

# 3. Create a ConfusionMatrixDisplay object
# The 'display_labels' argument provides the labels for the axes.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)

# 4. Plot the confusion matrix
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()


### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

#Model intialization
nb_model = MultinomialNB()

# Training the model
nb_model.fit(X_train,y_train)
print("Number of iterations performed:", lr_model.n_iter_)

# Make predictions and evaluate
y_pred = nb_model.predict(X_val)

In [None]:


print("Multinomial Naive Bayes Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

#Model intialization
rf_model = RandomForestClassifier()

# Training the model
rf_model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred_rf = rf_model.predict(X_val)

In [None]:
print("Random Forest Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_rf):.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred_rf))


# XGBoost

In [None]:
!pip install xgboost

In [None]:
# can you find any feature which is indicating data leakage, what are the most important feature which contributed to accuracy, find the feature importance tells which made XGboost better. correlation analysos
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# 1. Initialize LabelEncoder
le = LabelEncoder()

# 2. Fit the encoder on the training labels and transform both training and test labels
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val) # Note: Corrected y_test to y_val as per your variable name

# 3. Initialize the XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                              eval_metric='mlogloss')

# 4. Train the model with the NUMERICAL labels
# This is the crucial fix: use y_train_encoded
xgb_model.fit(X_train, y_train_encoded)

# 5. Make predictions
# Make sure you are passing the correct validation data
y_pred_encoded = xgb_model.predict(X_val)

# 6. Decode the predictions back to their original labels for readability
y_pred = le.inverse_transform(y_pred_encoded)

# 7. Evaluate the model using the original labels
print("XGBoost Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Gradient Boost Classifier

In [None]:
"""from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions
y_pred = gb_model.predict(X_test)

# Evaluate the model
print("Gradient Boosting Classifier Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred)) """

### Support Vector Machines (SVMs)

In [None]:
from sklearn.svm import SVC

#Model intialization
#svc_model = SVC()

# Training the model
#svc_model.fit(X_train,y_train)

# Make predictions and evaluate
#y_pred = svc_model.predict(X_val)

In [None]:
""" print("SVM Performance:")
print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_val, y_pred)) """

# Kfold validation and Hyperparameter Tuning and their performance using pipeline

In [None]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import Pipeline

import multiprocessing as mp
mp.set_start_method("spawn", force=True)

In [None]:
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

# Define models and their hyperparameter grids
models_and_params = {
    'Logistic Regression': (
        LogisticRegression(max_iter=1000, random_state=42),
        {'model__C': [0.1, 1, 10, 100], 'model__penalty': ['l1', 'l2']}
    ),
    'Multinomial Naive Bayes': (
        MultinomialNB(),
        {'model__alpha': [0.1, 0.5, 1.0, 2.0]}
    ),
    'Random Forest Classifier': (
        RandomForestClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 10, 20]}
    ),
    'Gradient Boosting Classifier': (
        GradientBoostingClassifier(random_state=42),
        {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}
    ),
    'XGBoost Classifier': (
        xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False, random_state=42),
        {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}
    ),
    'Support Vector Machine (SVC)': (
        SVC(random_state=42),
        {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']}
    )
}

# 4. Set up K-Fold Cross-Validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# 5. Loop through models, perform tuning, and evaluate
for name, (model, params) in models_and_params.items():
    print(f"\n{'='*50}\nStarting hyperparameter tuning for: {name}\n{'='*50}")

    # Use the appropriate target variable for XGBoost
    if name == 'XGBoost Classifier':
        y_train_target = y_train_encoded
        y_val_target = y_val_encoded
        # You will need to uncomment this line for XGBoost prediction later
        y_val_original = y_val
    else:
        y_train_target = y_train
        y_val_target = y_val
    
    # Create a pipeline including the TfidfVectorizer.
    # This prevents data leakage during cross-validation.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
        ('model', model)
    ])

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=cv_strategy,
        scoring='accuracy',
        n_jobs=-1,  # Use all available cores
        verbose=1
    )

    # Fit the grid search on the raw text data (X_train)
    grid_search.fit(X_train, y_train_target)

    # Print results
    print(f"Best parameters found for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

    # Evaluate on the validation set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    
    # If using XGBoost, decode the predictions back to original labels
    if name == 'XGBoost Classifier':
        y_pred = le.inverse_transform(y_pred)
        y_val_true = y_val_original
    else:
        y_val_true = y_val

    print("\nValidation Set Performance:")
    print(f"Accuracy: {accuracy_score(y_val_true, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_val_true, y_pred, zero_division=0))


In [None]:
# Optimization technique using GridSearchCV
#Define models and their hyperparameter grids
models_and_params = {
    'Logistic Regression': (
        LogisticRegression(max_iter=1000, random_state=42),
        {'model__C': [0.1, 1, 10, 100], 'model__penalty': ['l1', 'l2']}
    ),
    'Multinomial Naive Bayes': (
        MultinomialNB(),
        {'model__alpha': [0.1, 0.5, 1.0, 2.0]}
    ),
    'Random Forest Classifier': (
        RandomForestClassifier(random_state=42),
        {'model__n_estimators': [50, 100, 200], 'model__max_depth': [None, 10, 20]}
    ),
   # 'Gradient Boosting Classifier': (
   #     GradientBoostingClassifier(random_state=42),
   #     {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}
   # ),
    'XGBoost Classifier': (
        xgb.XGBClassifier(objective='multi:softprob', eval_metric='mlogloss', use_label_encoder=False, random_state=42),
        {'model__n_estimators': [50, 100], 'model__learning_rate': [0.05, 0.1]}
    )#,
    #'Support Vector Machine (SVC)': (
    #    SVC(random_state=42),
    #    {'model__C': [0.1, 1, 10], 'model__kernel': ['linear', 'rbf']}
    #)
}

# 4. Set up K-Fold Cross-Validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# 5. Loop through models, perform tuning, and evaluate
for name, (model, params) in models_and_params.items():
    print(f"\n{'='*50}\nStarting hyperparameter tuning for: {name}\n{'='*50}")

    # Use the appropriate target variable for XGBoost
    if name == 'XGBoost Classifier':
        y_train_target = y_train_encoded
        y_test_target = y_val_encoded
    else:
        y_train_target = y_train
        y_test_target = y_val

    # Create a pipeline
    pipeline = Pipeline([
        #('tfidf', TfidfVectorizer(max_features=5000, stop_words='english')),
        ('model', model)
    ])

    # Initialize GridSearchCV
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        cv=cv_strategy,
        scoring='accuracy',
        n_jobs=-1,  # Use all available cores
        verbose=1
    )

    # Fit the grid search
    grid_search.fit(X_train, y_train_target)

    # Print results
    print(f"Best parameters found for {name}: {grid_search.best_params_}")
    print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

    # Evaluate on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    
    # If using XGBoost, decode the predictions back to original labels
    if name == 'XGBoost Classifier':
        y_pred = le.inverse_transform(y_pred)
    
    print("\nTest Set Performance:")
    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print("Classification Report:")
    print(classification_report(y_val, y_pred))


In [None]:
!pip install hyperopt

In [None]:
from sklearn.model_selection import KFold, cross_val_score
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

# Define a single objective function for all models
def objective(params):
    """Objective function for Hyperopt.
    This function takes hyperparameters and returns a loss (negative accuracy)
    using K-Fold cross-validation.
    """
    model_name = params.pop('model_type')
    
    if model_name == 'LogisticRegression':
        model = LogisticRegression(**params, solver='liblinear', random_state=42)
    elif model_name == 'RandomForestClassifier':
        model = RandomForestClassifier(**params, random_state=42)
    elif model_name == 'GradientBoostingClassifier':
        model = GradientBoostingClassifier(**params, random_state=42)
    elif model_name == 'MultinomialNB':
        model = MultinomialNB(**params)
    elif model_name == 'SVC':
        model = SVC(**params, random_state=42)
    elif model_name == 'XGBClassifier':
        model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=42)
    else:
        raise ValueError("Invalid model type specified.")

    # Perform K-Fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # We use negative accuracy as the loss to minimize
    score = cross_val_score(model, X_train, y_train_target, cv=kf, scoring='accuracy', n_jobs=-1).mean()
    loss = 1 - score
    
    return {'loss': loss, 'status': STATUS_OK, 'model_name': model_name, 'params': params}

# 4. Define the search space for each model's hyperparameters
space = hp.choice('classifier_type', [
    {
        'model_type': 'LogisticRegression',
        'C': hp.loguniform('C_logreg', np.log(0.001), np.log(100.0)),
        'penalty': hp.choice('penalty_logreg', ['l1', 'l2'])
    },
    {
        'model_type': 'MultinomialNB',
        'alpha': hp.uniform('alpha_mnb', 0.0, 1.0)
    },
    {
        'model_type': 'RandomForestClassifier',
        'n_estimators': scope.int(hp.quniform('n_estimators_rf', 10, 200, 10)),
        'max_depth': scope.int(hp.quniform('max_depth_rf', 3, 20, 1)),
        'min_samples_split': hp.uniform('min_samples_split_rf', 0.1, 1.0),
        'min_samples_leaf': hp.uniform('min_samples_leaf_rf', 0.1, 0.5)
    },
    #{
    #    'model_type': 'GradientBoostingClassifier',
    #    'n_estimators': scope.int(hp.quniform('n_estimators_gb', 50, 250, 10)),
    #    'learning_rate': hp.loguniform('learning_rate_gb', np.log(0.01), np.log(0.5)),
    #    'max_depth': scope.int(hp.quniform('max_depth_gb', 2, 10, 1))
    #},
    {
        'model_type': 'XGBClassifier',
        'n_estimators': scope.int(hp.quniform('n_estimators_xgb', 50, 250, 10)),
        'learning_rate': hp.loguniform('learning_rate_xgb', np.log(0.01), np.log(0.5)),
        'max_depth': scope.int(hp.quniform('max_depth_xgb', 2, 10, 1))
    }#,
    #{
     #   'model_type': 'SVC',
     #   'C': hp.loguniform('C_svc', np.log(0.01), np.log(100)),
     #   'kernel': hp.choice('kernel_svc', ['linear', 'rbf'])
    #}
])

# 5. Run the optimization
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,  # Number of different hyperparameter combinations to test
    trials=trials
)

# 6. Extract and print the best results
best_result = sorted(trials.results, key=lambda x: x['loss'])[0]

print("\n--- Hyperopt Optimization Results ---")
print(f"Best Model: {best_result['model_name']}")
print(f"Best Hyperparameters: {best_result['params']}")
print(f"Best Validation Accuracy: {1 - best_result['loss']:.4f}")