In [None]:
#Random Forest
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset
dataset = pd.read_csv('/content/ReducedTweets.csv')


def _fill_missing_with_mode(values):
    """Fill the NaNs of one class group with that group's most frequent value.

    A group that consists only of NaNs has no mode; it is returned unchanged
    instead of raising IndexError on ``mode().iloc[0]``.
    """
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# NaN values: impute every incomplete column with its per-class mode.
# NOTE(review): the imputation groups by the 'Class' column (presumably the
# target) and runs before the train/test split below, so held-out rows are
# filled using label information - confirm this is acceptable for the reported
# evaluation.
cols_with_nan = dataset.columns[dataset.isna().any()].tolist()
for col in cols_with_nan:
    dataset[col] = dataset.groupby('Class')[col].transform(_fill_missing_with_mode)

# Columns 1..15 are used as features, column 16 as the target.
feature_columns = slice(1, 16)
X = dataset.iloc[:, feature_columns].values
y = dataset.iloc[:, 16].values

# Define feature_names from the same slice as X, so feature_names[i] is the
# name of column i of X.
feature_names = dataset.columns[feature_columns]

# Split the data into training (80%), testing (10%), and validation (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Hyperparameter grid explored by the search
rf_parameters = {
    'max_depth': range(1, 5),
    'n_estimators': [20, 50, 100],
}

# Per-fold metric accumulators, filled in by the evaluation loop
rf_accuracy_scores, rf_precision_scores = [], []
rf_recall_scores, rf_f1_scores = [], []

# One list of per-feature ablation accuracies is appended here per fold
feature_importance_scores = []

# Random Forest: 10-fold stratified grid search on the training split
rf_classifier = RandomForestClassifier(random_state=42)
rf_search_folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
rf_grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=rf_parameters,
    cv=rf_search_folds,
)
rf_grid_search.fit(X_train, y_train)

# Fresh, unfitted forest configured with the winning hyperparameters
rf_best_params = rf_grid_search.best_params_
rf_best_model = RandomForestClassifier(**rf_best_params, random_state=42)

# 10-fold stratified cross-validation over the held-out test split.
# (The hyperparameters are already fixed by the grid search, so this is a plain
# CV loop rather than a nested one.)  Inside each fold the tuned forest is
# refit once per feature with that feature removed (leave-one-feature-out).
for train_index, val_index in StratifiedKFold(n_splits=10, shuffle=True, random_state=42).split(X_test, y_test):
    X_train_cv, X_val_cv = X_test[train_index], X_test[val_index]
    y_train_cv, y_val_cv = y_test[train_index], y_test[val_index]

    # Ablation Study: one score per removed feature, for every metric.
    ablation_scores = []
    ablation_precisions = []
    ablation_recalls = []
    ablation_f1s = []
    for feature_index in range(X_train_cv.shape[1]):
        ablated_X_train = np.delete(X_train_cv, feature_index, axis=1)
        ablated_X_val = np.delete(X_val_cv, feature_index, axis=1)
        rf_best_model.fit(ablated_X_train, y_train_cv)
        rf_predictions = rf_best_model.predict(ablated_X_val)
        ablation_scores.append(accuracy_score(y_val_cv, rf_predictions))
        ablation_precisions.append(precision_score(y_val_cv, rf_predictions))
        ablation_recalls.append(recall_score(y_val_cv, rf_predictions))
        ablation_f1s.append(f1_score(y_val_cv, rf_predictions))

    # Store feature importance scores (accuracy with each feature removed)
    feature_importance_scores.append(ablation_scores)

    # Evaluate performance metrics.  Every metric is averaged over the same set
    # of ablated models; previously precision/recall/F1 were taken only from
    # the predictions of the last ablation, which made them inconsistent with
    # the accuracy figure.
    rf_accuracy_scores.append(np.mean(ablation_scores))
    rf_precision_scores.append(np.mean(ablation_precisions))
    rf_recall_scores.append(np.mean(ablation_recalls))
    rf_f1_scores.append(np.mean(ablation_f1s))

# Mean and standard deviation of every per-fold metric
rf_average_accuracy, rf_std_accuracy = np.mean(rf_accuracy_scores), np.std(rf_accuracy_scores)
rf_average_precision, rf_std_precision = np.mean(rf_precision_scores), np.std(rf_precision_scores)
rf_average_recall, rf_std_recall = np.mean(rf_recall_scores), np.std(rf_recall_scores)
rf_average_f1, rf_std_f1 = np.mean(rf_f1_scores), np.std(rf_f1_scores)

# Report each statistic rounded to four decimals, in mean/std pairs
rf_report_rows = (
    ("Average Accuracy:", rf_average_accuracy),
    ("Standard Deviation Accuracy:", rf_std_accuracy),
    ("Average Precision:", rf_average_precision),
    ("Standard Deviation Precision:", rf_std_precision),
    ("Average Recall:", rf_average_recall),
    ("Standard Deviation Recall:", rf_std_recall),
    ("Average F1-score:", rf_average_f1),
    ("Standard Deviation F1-score:", rf_std_f1),
)
for rf_report_label, rf_report_value in rf_report_rows:
    print(rf_report_label, round(rf_report_value, 4))


Average Accuracy: 0.8367
Standard Deviation Accuracy: 0.0271
Average Precision: 0.8672
Standard Deviation Precision: 0.0327
Average Recall: 0.7933
Standard Deviation Recall: 0.0431
Average F1-score: 0.8278
Standard Deviation F1-score: 0.0283


In [None]:
# Select the best-performing features.
# feature_importance_scores[fold][i] is the accuracy obtained with feature i
# REMOVED, so a lower mean score means the model suffered more without that
# feature, i.e. the feature is more important.  The ranking therefore sorts in
# ascending order (the previous descending sort listed the least influential
# features first).
average_importance_scores_rf = np.mean(feature_importance_scores, axis=0)
sorted_indices_rf = np.argsort(average_importance_scores_rf)
best_feature_indices_rf = sorted_indices_rf

# Never ask for more features than were actually ablated.
num_features_to_display_rf = min(10, len(sorted_indices_rf))
X_best_features_rf = X[:, best_feature_indices_rf[:num_features_to_display_rf]]

# Print the top features with their mean accuracy after removal
# (lower value = larger accuracy loss = more important feature).
print("\nTop Feature Importance with Scores:")
for feature_index_rf in sorted_indices_rf[:num_features_to_display_rf]:
    importance_score_rf = round(average_importance_scores_rf[feature_index_rf], 5)
    print(f"{feature_names[feature_index_rf]}: {importance_score_rf}")


Top Feature Importance with Scores:
flesch_reading_ease: 0.84267
count_unique_words: 0.84267
num_lowercase_words: 0.84133
count_words: 0.84067
ari: 0.84067
total_characters: 0.84
smog_index: 0.83933
count_numbers: 0.83867
gunning_fog: 0.83733
coleman_liau_index: 0.83667
