In [10]:
pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Load the dataset
file_path = '/users/riyakoduru/Downloads/avg_top10Attribute_allLocation_agg.csv'
data = pd.read_csv(file_path)

# Assuming the last column is the target variable
X = data.iloc[:, :-1]  # All columns except the last one are features
y = data.iloc[:, -1]   # The last column is the target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the SVM model with GridSearchCV for hyperparameter tuning
parameters = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient
    'kernel': ['rbf', 'linear']  # Type of kernel
}

svm_clf = GridSearchCV(SVC(), parameters, cv=5, scoring='accuracy')
svm_clf.fit(X_train_scaled, y_train)

# Display best parameters
print("Best parameters found: ", svm_clf.best_params_)

# Perform cross-validation on training set
cv_scores = cross_val_score(svm_clf, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores on training set: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean()}")

# Check performance on training set
y_train_pred = svm_clf.predict(X_train_scaled)
print(f"Training Set Accuracy: {accuracy_score(y_train, y_train_pred)}")
print(f"Training Set Classification Report:\n{classification_report(y_train, y_train_pred)}")

# Predict on the test set using best found parameters
y_test_pred = svm_clf.predict(X_test_scaled)
print(f"Test Set Accuracy: {accuracy_score(y_test, y_test_pred)}")
print(f"Test Set Classification Report:\n{classification_report(y_test, y_test_pred)}")


Best parameters found:  {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}




Cross-validation scores on training set: [0.99545455 1.         1.         1.         1.        ]
Mean CV score: 0.9990909090909093
Training Set Accuracy: 0.9990875912408759
Training Set Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.80      0.89         5
         1.0       1.00      1.00      1.00      1091

    accuracy                           1.00      1096
   macro avg       1.00      0.90      0.94      1096
weighted avg       1.00      1.00      1.00      1096

Test Set Accuracy: 1.0
Test Set Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         3
         1.0       1.00      1.00      1.00       468

    accuracy                           1.00       471
   macro avg       1.00      1.00      1.00       471
weighted avg       1.00      1.00      1.00       471



