In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


# Task 3 Exploration Of Other Machine Learning Models

In [2]:
# Function to create submission CSV
def create_submission_csv(y_pred, test_id, model_name):
    """
    Write predictions to ``<model_name>.csv`` in the current working directory.

    The file has two columns, ``id`` and ``label``, and no index column.

    Parameters:
    y_pred (array-like): Predicted labels, one per test row.
    test_id (array-like): IDs corresponding to the test set, same length as y_pred.
    model_name (str): Name of the model; used as the file name stem.

    Returns:
    str: The name of the created CSV file.

    Raises:
    ValueError: If y_pred and test_id differ in length.
    """
    n_predictions, n_ids = len(y_pred), len(test_id)
    if n_predictions != n_ids:
        raise ValueError("Length of y_pred and test_id must be the same.")

    out_name = f'{model_name}.csv'
    pd.DataFrame({'id': test_id, 'label': y_pred}).to_csv(out_name, index=False)
    print(f"Submission file created: {out_name}")
    return out_name


In [3]:
def cross_validate_model(model, X, y, cv=5):
    """
    Print stratified k-fold cross-validation metrics and a confusion matrix.

    Folds come from StratifiedKFold(shuffle=True, random_state=42), so the
    same splits are used for every metric and for the out-of-fold predictions.

    Parameters:
    model: Estimator passed to scikit-learn's cross-validation utilities.
    X: Feature matrix.
    y: Target labels.
    cv (int): Number of folds (default 5).

    Returns:
    None. Per-fold scores, their means and the confusion matrix are printed.
    """
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # A single multi-metric pass fits the model once per fold, instead of
    # once per metric per fold as separate cross_val_score calls would.
    cv_results = cross_validate(
        model, X, y, cv=skf, scoring=['f1', 'accuracy', 'recall', 'precision']
    )
    f1_scores = cv_results['test_f1']
    accuracy_scores = cv_results['test_accuracy']
    recall_scores = cv_results['test_recall']
    precision_scores = cv_results['test_precision']

    # Out-of-fold predictions on the same splits, used for the confusion matrix
    y_pred_cv = cross_val_predict(model, X, y, cv=skf)

    print(f"Cross-Validation F1 Scores: {f1_scores}")
    print(f"Cross-Validation Accuracy Scores: {accuracy_scores}")
    print(f"Cross-Validation Recall Scores: {recall_scores}")
    print(f"Cross-Validation Precision Scores: {precision_scores}")

    print(f"Mean F1 Score: {np.mean(f1_scores)}")
    print(f"Mean Accuracy: {np.mean(accuracy_scores)}")
    print(f"Mean Recall: {np.mean(recall_scores)}")
    print(f"Mean Precision: {np.mean(precision_scores)}")

    # Confusion Matrix for Cross-Validation
    cm = confusion_matrix(y, y_pred_cv)
    print(f"Cross-Validation Confusion Matrix:\n{cm}")

In [5]:
# Read the raw tables and their precomputed TF-IDF feature tables
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
train_tfidf = pd.read_csv('train_tfidf_features.csv')
test_tfidf = pd.read_csv('test_tfidf_features.csv')

# Row identifiers (train: first column by position; test: the "id" column)
train_ids = train_data.iloc[:, 0]  # Assuming the ID is the first column
test_ids = test_data["id"]

# Drop the first two columns of each TF-IDF table
train_tfidf_terms = train_tfidf.iloc[:, 2:]
test_tfidf_terms = test_tfidf.iloc[:, 2:]

# Keep only the columns present in both tables, in the training column order,
# so train and test feature matrices line up column for column
common_columns = [
    name for name in train_tfidf_terms.columns if name in test_tfidf_terms.columns
]
train_features = train_tfidf_terms[common_columns]
test_features = test_tfidf_terms[common_columns]

# Target labels
train_label = train_data["label"]

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_label,
    test_size=0.2,
    random_state=42,
)


## Perform Grid Search to get the best-performing parameters

## Model Hyperparameter Tuning

- **n_estimators**: 
  - Controls the number of boosting rounds.
  - A higher number can lead to better performance but increases the risk of overfitting.
  - Tested different values to find a balance between performance and overfitting.

- **learning_rate**:
  - Controls how much each tree contributes to the overall model.
  - Lower learning rates can require more boosting rounds.
  - Tested a range of values to optimize model convergence.

- **max_depth**:
  - Adjusted the depth of the trees to control the model's complexity.
  - Experimented with different depths to balance model performance and complexity.

- **subsample**:
  - Set to 0.5 to randomly sample half of the data for each tree.
  - Helps to prevent overfitting and improve generalization.

- **gamma**:
  - Adds regularization by only splitting nodes when the split results in a significant reduction in loss.
  - Tuned to control the model's complexity and make it more conservative.



In [None]:
# Initialize the base models
# Define the initial XGBClassifier with given parameters
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.25,
    random_state=42,
    use_label_encoder=False,
    max_depth=7,
    subsample=0.5
)

# Define the parameter grid for GridSearchCV.
# Every value must be a list of candidates; a parameter held fixed during the
# search (subsample) is given as a one-element list rather than a bare scalar.
param_grid = {
    'n_estimators': [500, 700],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.5],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5]
}

# Set up GridSearchCV for xgb_model (the search is only configured here; it is
# not fitted in this cell)
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',  # Optimize for F1 score
    cv=5,  # 5-fold cross-validation
    verbose=3,
    n_jobs=-1  # Use all available cores
)

In [None]:
# Logistic-regression hyperparameters reported as best by GridSearchCV
best_params = dict(
    C=1,
    class_weight='balanced',
    max_iter=1000,  # raised iteration cap to allow for convergence
    penalty='l1',
    solver='liblinear',
    random_state=42,
)

# Same configuration, but with the iteration cap set to 100
simple_best_params = {**best_params, 'max_iter': 100}

In [None]:
# Initialize other models for exploration
knn_model = KNeighborsClassifier(n_neighbors=10)
rf_model = RandomForestClassifier(random_state=42, n_estimators=1024)
# probability=True so the SVC exposes predict_proba (needed for soft voting)
svc_model = SVC(probability=True, random_state=42, kernel='linear')
# Class priors sum to 1 and match the priors used for the final voting model
nb_model = MultinomialNB(class_prior=[0.67, 0.33])
bb_model = BernoulliNB()

## Tried different methods for dimensionality reduction and feature selection

In [None]:
# Truncated SVD to 2500 components (note: the variable is named `pca` but
# holds a TruncatedSVD). The projection is fitted on the training split only
# and then reused for the validation and test features.
# random_state is fixed so the randomized solver gives repeatable components.
pca = TruncatedSVD(n_components=2500, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(test_features)

In [None]:
# Truncated SVD to 600 components. This rebinds `pca` and the X_*_pca
# variables, replacing any values assigned to those names earlier.
# random_state is fixed so the randomized solver gives repeatable components.
pca = TruncatedSVD(n_components=600, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(test_features)


In [None]:
# Applying PCA to reduce dimensions with the determined number of components.
# The 400-component PCA defined here (pca2) is the estimator that must be
# fitted and applied; fitting a differently configured reducer under another
# name would leave pca2 unused.
# random_state is fixed for repeatability when a randomized solver is selected.
pca2 = PCA(n_components=400, random_state=42)
X_train_pca2 = pca2.fit_transform(X_train)
X_val_pca2 = pca2.transform(X_val)
X_test_pca2 = pca2.transform(test_features)

In [None]:
# PCA to 1000 components, fitted on the training split only and then applied
# to the validation and test features.
# random_state is fixed for repeatability when a randomized solver is selected.
pca1 = PCA(n_components=1000, random_state=42)
X_train_pca1 = pca1.fit_transform(X_train)
X_val_pca1 = pca1.transform(X_val)
X_test_pca1 = pca1.transform(test_features)

The total explained variance for each of the settings tried above is below 50%. This shows that reducing the number of features further is not the right direction.

### Final Voting Classifier

In [5]:
# Hyperparameters for the logistic-regression member of the final ensemble
best_params = dict(
    C=0.9,
    class_weight={0: 1, 1: 2.9},  # errors on class 1 weighted 2.9x relative to class 0
    max_iter=100,
    penalty='l1',
    solver='liblinear',
    random_state=42,
)

# Base estimators used by the voting ensemble
logreg_model = LogisticRegression(**best_params)
nb_model = MultinomialNB(class_prior=[0.67, 0.33])

In [6]:
# Final ensemble: logistic regression + multinomial naive Bayes.
# 'soft' voting averages the members' predicted probabilities.
base_estimators = [('logreg', logreg_model), ('nb', nb_model)]
voting_model = VotingClassifier(estimators=base_estimators, voting='soft')

In [7]:
# Fit the ensemble on the training split
voting_model.fit(X_train, y_train)

# Print the stratified cross-validation report for the training split
cross_validate_model(voting_model, X_train, y_train)

# Score the held-out validation split with the fitted ensemble
y_pred_val = voting_model.predict(X_val)
val_f1_score, val_accuracy, val_recall, val_precision = (
    metric(y_val, y_pred_val)
    for metric in (f1_score, accuracy_score, recall_score, precision_score)
)
print(f"Validation F1 Score: {val_f1_score}")
print(f"Validation Recall: {val_recall}")
print(f"Validation Precision: {val_precision}")

# Predict the test set and write the submission file; the call is left as the
# cell's last expression so the returned file name is displayed
y_pred_test = voting_model.predict(test_features)
create_submission_csv(y_pred_test, test_ids, "voting_predictions")


Cross-Validation F1 Scores: [0.64140218 0.63835878 0.64555766 0.6491897  0.62618596]
Cross-Validation Accuracy Scores: [0.72472727 0.72436364 0.72717352 0.73226628 0.71335031]
Cross-Validation Recall Scores: [0.64231499 0.63472486 0.64862298 0.64672365 0.62678063]
Cross-Validation Precision Scores: [0.64049196 0.64203455 0.64252117 0.65167464 0.62559242]
Mean F1 Score: 0.6401388552754737
Mean Accuracy: 0.7243762029167631
Mean Recall: 0.6398334207315864
Mean Precision: 0.640462946407381
Cross-Validation Confusion Matrix:
[[6588 1892]
 [1897 3370]]
Validation F1 Score: 0.627227910504361
Validation Recall: 0.6440809968847352
Validation Precision: 0.6112342941611234
Submission file created: voting_predictions.csv


'voting_predictions.csv'

# Model Evaluation Metrics

## Cross-Validation Scores

- **F1 Scores**: [0.6414, 0.6384, 0.6456, 0.6492, 0.6262]
- **Accuracy Scores**: [0.7247, 0.7244, 0.7272, 0.7323, 0.7134]
- **Recall Scores**: [0.6423, 0.6347, 0.6486, 0.6467, 0.6268]
- **Precision Scores**: [0.6405, 0.6420, 0.6425, 0.6517, 0.6256]

## Mean Cross-Validation Scores

- **Mean F1 Score**: 0.6401
- **Mean Accuracy**: 0.7244
- **Mean Recall**: 0.6398
- **Mean Precision**: 0.6405

## Cross-Validation Confusion Matrix

| True Negative | False Positive | False Negative | True Positive |
|:-------------:|:--------------:|:--------------:|:-------------:|
| 6588          | 1892           | 1897           | 3370          |

## Validation Metrics

- **Validation F1 Score**: 0.6272
- **Validation Recall**: 0.6441
- **Validation Precision**: 0.6112

## Submission

- Submission file created: **`voting_predictions.csv`**


In [8]:
# Refit the VotingClassifier on the full training data (train_features as
# loaded, with no dimensionality reduction applied) and predict the test set
# with the refitted model
y_pred_test = voting_model.fit(train_features, train_label).predict(test_features)

# Write the test-set predictions to a CSV file
create_submission_csv(y_pred_test, test_ids, "voting_full_predictions")

Submission file created: voting_full_predictions.csv


'voting_full_predictions.csv'

## Final F1 Score on Kaggle

- **Public Scoreboard**: 0.71755
- **Private Scoreboard**: 0.70612
