In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import train_test_split

# Load dataset
filename = '/content/ReducedTweets.csv'
dataset = pd.read_csv(filename)

# NaN values
cols_with_nan = dataset.columns[dataset.isna().any()].tolist()
for col in cols_with_nan:
    if dataset[col].isna().any():
        dataset[col] = dataset.groupby('Class')[col].transform(lambda x: x.fillna(x.mode().iloc[0]))
    else:
        print(f"")

X = dataset.iloc[:, 1:16].values
y = dataset.iloc[:, 16].values

# Get the names of features
feature_names = dataset.columns[1:43]

# Hyperparameters for SVM RBF
svm_rbf_parameters = {'C': [10**i for i in range(-3, 0, 10)], 'gamma': [10**i for i in range(-3, 0, 1)]}

# Split the data into training, testing, and validation sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Cross-validation strategy
outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Initialize performance metric lists
svm_rbf_accuracy_scores = []
svm_rbf_precision_scores = []
svm_rbf_recall_scores = []
svm_rbf_f1_scores = []

# store feature importance scores
feature_importance_scores = []

# Perform nested cross-validation
for train_index, test_index in outer_cv.split(X_train, y_train):
    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

    # SVM RBF
    svm_rbf_classifier = SVC(kernel='rbf', random_state=42)
    svm_rbf_grid_search = GridSearchCV(svm_rbf_classifier, svm_rbf_parameters, cv=inner_cv)
    svm_rbf_grid_search.fit(X_train, y_train)
    svm_rbf_best_params = svm_rbf_grid_search.best_params_
    svm_rbf_best_model = SVC(kernel='rbf', random_state=42, **svm_rbf_best_params)

    # Ablation Study
    ablation_scores = []
    for feature_index in range(X_train.shape[1]):
        ablated_X_train = np.delete(X_train, feature_index, axis=1)
        ablated_X_test = np.delete(X_test, feature_index, axis=1)
        svm_rbf_best_model.fit(ablated_X_train, y_train)
        svm_rbf_predictions = svm_rbf_best_model.predict(ablated_X_test)
        ablation_scores.append(accuracy_score(y_test, svm_rbf_predictions))

    # Store feature importance scores
    feature_importance_scores.append(ablation_scores)

    # Evaluate SVM RBF performance metrics
    svm_rbf_accuracy_scores.append(np.mean(ablation_scores))
    svm_rbf_precision_scores.append(precision_score(y_test, svm_rbf_predictions))
    svm_rbf_recall_scores.append(recall_score(y_test, svm_rbf_predictions))
    svm_rbf_f1_scores.append(f1_score(y_test, svm_rbf_predictions))

# Calculate average performance
svm_rbf_average_accuracy= np.mean(svm_rbf_accuracy_scores)
svm_rbf_average_precision = np.mean(svm_rbf_precision_scores)
svm_rbf_average_recall = np.mean(svm_rbf_recall_scores)
svm_rbf_average_f1 = np.mean(svm_rbf_f1_scores)
svm_rbf_std_accuracy= np.std(svm_rbf_accuracy_scores)
svm_rbf_std_precision = np.std(svm_rbf_precision_scores)
svm_rbf_std_recall = np.std(svm_rbf_recall_scores)
svm_rbf_std_f1 = np.std(svm_rbf_f1_scores)

# Print
print("\nSVM RBF:")
print("Average Accuracy:", svm_rbf_average_accuracy)
print("Standard Deviation Accuracy:", svm_rbf_std_accuracy)
print("Average Precision:", svm_rbf_average_precision)
print("Standard Deviation Precision:", svm_rbf_std_precision)
print("Average Recall:", svm_rbf_average_recall)
print("Standard Deviation Recall:", svm_rbf_std_recall)
print("Average F1-score:", svm_rbf_average_f1)
print("Standard Deviation F1-score:", svm_rbf_std_f1)


SVM RBF:
Average Accuracy: 0.7253333333333333
Standard Deviation Accuracy: 0.0
Average Precision: 0.7352537722908093
Standard Deviation Precision: 0.0
Average Recall: 0.7071240105540897
Standard Deviation Recall: 0.0
Average F1-score: 0.7209145931405514
Standard Deviation F1-score: 0.0


In [None]:
# Select the best-performing features
best_feature_indices = np.argsort(np.mean(feature_importance_scores, axis=0))[::-1]
num_features_to_display = 10

X_best_features = X[:, best_feature_indices[:num_features_to_display]]

# Calculate and print feature importance with feature names
average_importance_scores = np.mean(feature_importance_scores, axis=0)
sorted_indices = np.argsort(average_importance_scores)[::-1]

print("\nTop Feature Importance with Scores:")
for i in range(num_features_to_display):
    feature_index = sorted_indices[i]
    importance_score = round(average_importance_scores[feature_index], 5)
    print(f"{feature_names[feature_index]}: {importance_score}")


Top Feature Importance with Scores:
total_characters: 0.74133
coleman_liau_index: 0.72867
smog_index: 0.72533
num_lowercase_words: 0.72533
count_words: 0.72333
count_unique_words: 0.72333
count_blanks: 0.72333
gunning_fog: 0.72267
count_numbers: 0.72267
flesch_kincaid_grade_level: 0.72267
