In [6]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values  # Convert to NumPy array
y = data['label'].values  # Convert to NumPy array

# Apply PCA to understand variance capture
pca = PCA().fit(X)
explained_variances = np.cumsum(pca.explained_variance_ratio_)

# Range of PCA components to test
components_range = np.linspace(0.7, 0.95, 7)  # Testing from 70% to 100% variance
avg_test_f1_scores = []

# Iterate through different PCA components
for n_components in components_range:
    # Apply PCA
    pca = PCA(n_components=n_components)  # Keep specified variance
    X_pca = pca.fit_transform(X)
    
    # Initialize variables to track performance
    fold_test_f1_scores = []

    # Define the k-fold cross-validator
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Perform cross-validation
    for train_index, val_index in kf.split(X_pca):
        X_train_fold, X_val_fold = X_pca[train_index], X_pca[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # Initialize the LightGBM classifier
        lgbm = lgb.LGBMClassifier(
            n_estimators=100,
            learning_rate=0.1,
            boosting_type='gbdt',
            num_leaves=31,
            random_state=42
        )

        # Train the classifier
        lgbm.fit(X_train_fold, y_train_fold)

        # Predict on the validation set
        y_val_pred = lgbm.predict(X_val_fold)

        # Calculate the F1 score on the validation set
        fold_test_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))

    # Store the average F1 score for the folds
    avg_test_f1_scores.append(np.mean(fold_test_f1_scores))

# Plot the results
plt.figure(figsize=(12, 6))

# Plot F1 Score vs. PCA Components
plt.subplot(1, 2, 1)
plt.plot(components_range, avg_test_f1_scores, marker='o')
plt.xlabel('PCA n_components')
plt.ylabel('Average Macro F1 Score')
plt.title('PCA Components vs. Macro F1 Score')
plt.grid(True)

# Plot Explained Variance
plt.subplot(1, 2, 2)
plt.plot(np.arange(1, len(explained_variances)+1), explained_variances, marker='o')
plt.xlabel('Number of PCA Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance vs. Number of PCA Components')
plt.grid(True)

plt.tight_layout()
plt.show()


[LightGBM] [Info] Number of positive: 5267, number of negative: 8480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307020
[LightGBM] [Info] Number of data points in the train set: 13747, number of used features: 1204
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383138 -> initscore=-0.476250
[LightGBM] [Info] Start training from score -0.476250
[LightGBM] [Info] Number of positive: 5272, number of negative: 8475
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.053764 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 307020
[LightGBM] [Info] Number of data points in the train set: 13747, number of used features: 1204
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383502 -> initscore=-0.474711
[LightGBM] [Info] Start training from score -0.474711
[LightGB

InvalidParameterError: The 'n_components' parameter of PCA must be an int in the range [0, inf), a float in the range (0.0, 1.0), a str among {'mle'} or None. Got 1.0 instead.

In [4]:
# Print the shapes of the original and transformed data to verify dimensionality
print("Original shape:", X.shape)
print("PCA-transformed shape:", X_pca.shape)

Original shape: (17184, 5000)
PCA-transformed shape: (17184, 3440)


In [5]:

# List of random seeds and boosting types
random_seeds = [42, 52, 62]
boosting_types = ['gbdt', 'dart']

# Fixed parameters
n_estimators = 100
num_leaves = 63

# Initialize variables to track the best configuration
best_f1_score = 0
best_params = {}

# Store results for all configurations
results = []

# Iterate over each boosting type
for boosting_type in boosting_types:
    train_f1_scores = []
    test_f1_scores = []

    for seed in random_seeds:
        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=seed)

        # Define the k-fold cross-validator
        kf = KFold(n_splits=5, shuffle=True, random_state=seed)

        # Prepare arrays to collect F1 scores
        fold_train_f1_scores = []
        fold_test_f1_scores = []

        # Perform cross-validation
        for train_index, val_index in kf.split(X_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            # Initialize the LightGBM classifier
            lgbm = lgb.LGBMClassifier(
                n_estimators=n_estimators,
                learning_rate=0.1,
                boosting_type=boosting_type,
                num_leaves=num_leaves,
                random_state=seed
            )

            # Train the classifier
            lgbm.fit(X_train_fold, y_train_fold)

            # Predict on the validation set
            y_val_pred = lgbm.predict(X_val_fold)
            y_train_pred = lgbm.predict(X_train_fold)

            # Calculate the F1 score on the validation and training sets
            fold_train_f1_scores.append(f1_score(y_train_fold, y_train_pred, average='macro'))
            fold_test_f1_scores.append(f1_score(y_val_fold, y_val_pred, average='macro'))

        # Store the average F1 scores for the folds
        train_f1_scores.append(np.mean(fold_train_f1_scores))
        test_f1_scores.append(np.mean(fold_test_f1_scores))

    # Calculate and store the average F1 scores for this boosting type
    average_train_f1_score = np.mean(train_f1_scores)
    average_test_f1_score = np.mean(test_f1_scores)
    results.append((boosting_type, average_train_f1_score, average_test_f1_score))

    # Update best parameters if current average F1 score is better
    if average_test_f1_score > best_f1_score:
        best_f1_score = average_test_f1_score
        best_params = {
            'boosting_type': boosting_type
        }

# Print all results at the end
print("\nAll Results:")
for boosting_type, train_f1_score, test_f1_score in results:
    print(f"Boosting Type: {boosting_type}, Average Train F1 Score (Macro): {train_f1_score:.2f}, Average Test F1 Score (Macro): {test_f1_score:.2f}")

# Print the best configuration
print(f"\nBest F1 Score (Macro): {best_f1_score:.2f}")
print(f"Best Parameters: {best_params}")

[LightGBM] [Info] Number of positive: 4203, number of negative: 6794
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.246773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 877200
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 3440
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382195 -> initscore=-0.480241
[LightGBM] [Info] Start training from score -0.480241
[LightGBM] [Info] Number of positive: 4205, number of negative: 6792
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.221765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 877200
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 3440
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.382377 -> initscore=-0.479471
[LightGBM] [Info] Start training from score -0.479471
[LightGB

KeyboardInterrupt: 

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Load your dataset
data = pd.read_csv('data/train_tfidf_features.csv')

# Exclude the first column and use the second column as the label
X = data.drop(['label', 'id'], axis=1).values  # Convert to NumPy array
y = data['label'].values  # Convert to NumPy array

# Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% of the variance
X_pca = pca.fit_transform(X)

# Print the shapes of the original and transformed data to verify dimensionality
print("Original shape:", X.shape)
print("PCA-transformed shape:", X_pca.shape)

# List of random seeds and colsample_bytree values to test
random_seeds = [42, 52, 62]
colsample_bytree_values = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Fixed parameters
n_estimators = 100
num_leaves = 63
boosting_type = 'gbdt'

# Initialize variables to track the best configuration
best_f1_score = 0
best_params = {}

# Store results for all configurations
results = []

# Iterate over each colsample_bytree value
for colsample_bytree in colsample_bytree_values:
    test_f1_scores = []

    for seed in random_seeds:
        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=seed)

        # Initialize the LightGBM classifier
        lgbm = lgb.LGBMClassifier(
            n_estimators=n_estimators,
            learning_rate=0.1,
            boosting_type=boosting_type,
            num_leaves=num_leaves,
            colsample_bytree=colsample_bytree,
            random_state=seed
        )

        # Train the classifier
        lgbm.fit(X_train, y_train)

        # Predict on the test set
        y_test_pred = lgbm.predict(X_test)

        # Calculate the F1 score on the test set
        test_f1_score = f1_score(y_test, y_test_pred, average='macro')
        test_f1_scores.append(test_f1_score)

    # Calculate and store the average F1 score for this colsample_bytree value
    average_f1_score = np.mean(test_f1_scores)
    results.append((colsample_bytree, average_f1_score))

    # Update best parameters if current average F1 score is better
    if average_f1_score > best_f1_score:
        best_f1_score = average_f1_score
        best_params = {
            'colsample_bytree': colsample_bytree
        }

# Print all results at the end
print("\nAll Results:")
for colsample_bytree, f1_score in results:
    print(f"colsample_bytree: {colsample_bytree}, Average F1 Score (Macro): {f1_score:.2f}")

# Print the best configuration
print(f"\nBest F1 Score (Macro): {best_f1_score:.2f}")
print(f"Best Parameters: {best_params}")

# Plot the results
colsample_bytree_values, average_f1_scores = zip(*results)
plt.plot(colsample_bytree_values, average_f1_scores, marker='o')
plt.xlabel('colsample_bytree')
plt.ylabel('Average F1 Score (Macro)')
plt.title('Effect of colsample_bytree on F1 Score')
plt.grid(True)
plt.show()

Original shape: (17184, 5000)
PCA-transformed shape: (17184, 3440)
[LightGBM] [Info] Number of positive: 5267, number of negative: 8480
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.162524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 877200
[LightGBM] [Info] Number of data points in the train set: 13747, number of used features: 3440
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.383138 -> initscore=-0.476250
[LightGBM] [Info] Start training from score -0.476250
[LightGBM] [Info] Number of positive: 5221, number of negative: 8526
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.151526 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 877200
[LightGBM] [Info] Number of data points in the train set: 13747, number of used features: 3440
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.379792 -> initscore=-0.49

KeyboardInterrupt: 