In [18]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.datasets import load_iris
import joblib
import matplotlib.pyplot as plt


In [19]:
# Load the Iris dataset and expose it as pandas objects:
# X holds the measurements (one column per feature name) and y holds the
# target values, labelled 'species'.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target, name='species')

In [20]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# Baseline: an SVC with the library's default hyperparameters, fitted on the
# training split and scored on the held-out test split.
svm_model = SVC().fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy*100:.2f}%")

Model Accuracy: 100.00%


In [22]:
# Persist the fitted baseline classifier to disk.
joblib.dump(value=svm_model, filename='svm_model.pkl')
# To restore it later (only load files you trust; joblib files are pickles):
# loaded_model = joblib.load('svm_model.pkl')

['svm_model.pkl']

In [38]:
# Cross-validation (CV): evaluate the model on several different train/validation
# partitions of the data to check that it is robust and generalizes.
# K-fold (K is a hyperparameter): split the data into K subsets and train K times,
# each time holding out a different subset for validation and training on the rest.
# Overfitting risk: the model scores well on the training data but poorly on unseen data.
# After cross-validation, the final model is normally trained on all of the
# available training data so that nothing is wasted.

# Hyperparameter tuning with GridSearchCV (exhaustive) and RandomizedSearchCV (sampled).
# NOTE(review): 'gamma' has no effect when kernel='linear', so those grid points
# are duplicated fits; a list of per-kernel grids would avoid the extra work.
param_grid = {
    'C': [0.1, 1, 3, 5, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['linear', 'rbf', 'poly'],
}

# NOTE(review): X_train was standardized before this search, so every CV fold is
# scaled with statistics that include its own validation rows. Wrapping the
# scaler and SVC in a Pipeline would keep the folds fully independent.

# Exhaustive search: every combination in param_grid, scored by 5-fold accuracy.
GD = GridSearchCV(estimator=SVC(), param_grid=param_grid, scoring='accuracy', cv=5)
GD.fit(X_train, y_train)
print("Best parameters found: ", GD.best_params_)
print("Best cross-validation score: {:.2f}".format(GD.best_score_))

# Randomized search: only 5 sampled combinations from the same grid (seeded for
# reproducibility), scored the same way.
RD = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_grid,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
RD.fit(X_train, y_train)
print("Best parameters found: ", RD.best_params_)
print("Best cross-validation score: {:.2f}".format(RD.best_score_))

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# retrained on the whole of X_train with the winning parameters; calling fit
# again would only repeat that work.
best_model = GD.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = metrics.accuracy_score(y_test, y_pred_best)
print(f"Model Accuracy after Hyperparameter Tuning: {accuracy_best*100:.2f}%")


Best parameters found:  {'C': 3, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation score: 0.97
Best parameters found:  {'kernel': 'linear', 'gamma': 'auto', 'C': 3}
Best cross-validation score: 0.97
Model Accuracy after Hyperparameter Tuning: 96.67%


In [39]:
# Persist both artifacts needed for inference: the fitted scaler and the tuned
# classifier. The classifier was trained on scaler-transformed features, so new
# inputs must go through the same scaler before prediction.
joblib.dump(value=scaler, filename='scaler.pkl')
joblib.dump(value=best_model, filename='svm_best_model.pkl')
# To restore them later (only load files you trust; joblib files are pickles):
# loaded_scaler = joblib.load('scaler.pkl')
# loaded_best_model = joblib.load('svm_best_model.pkl')

['svm_best_model.pkl']