>Parthiv Desai

>CPCSC 483 - 03

>XGBOOST

#Importing the Libraries

In [None]:
import numpy as np  #Numpy is widely used library for handling number
import matplotlib.pyplot as plt #Provides a library for data visualization for plotting graphs
import pandas as pd  #Used for data manipulation

#Importing the dataset

In [None]:
# Load the CSV into a DataFrame, then separate it into a feature matrix
# (every column except the last) and a target vector (the last column).
dataset = pd.read_csv("/content/drive/MyDrive/Data-1.csv")
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_frame.values
y = target_column.values

#Encoding the target labels

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the class labels in y to consecutive integer codes (0, 1, ...).
# The fitted encoder is kept so the codes can be mapped back to the
# original labels with encoder.inverse_transform.
encoder = LabelEncoder()
encoder.fit(y)
y_encoded = encoder.transform(y)


#Splitting the dataset into Training set and Testing set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
split_parts = train_test_split(X, y_encoded, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

#Training the Kernel SVM model on the training set

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# An RBF-kernel SVM is distance based, so features on very different scales
# let the largest one dominate the kernel and the model can collapse to
# predicting a single class. Standardizing inside a pipeline fixes this; the
# scaler is fitted on the training data only and is re-applied automatically
# whenever classifier.predict() is called on raw features.
classifier = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', random_state=0),
)
# fit() trains the scaler and the SVM on the training split.
classifier.fit(X_train, y_train)

#Making the Confusion Matrix and computing the accuracy

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Predict the held-out test samples with the classifier trained above.
y_pred = classifier.predict(X_test)

# Confusion matrix: rows are the true classes, columns the predicted classes,
# so the diagonal holds the correctly classified counts.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy is the fraction of test samples whose prediction matches the true
# label. It was previously computed but never reported.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f} %".format(accuracy*100))


[[107   0]
 [ 64   0]]


#Training the XGBoost classifier on the Training set

In [None]:
# XGBoost is a gradient boosting algorithm used for classification and
# regression: it iteratively adds decision trees to the model, each new tree
# attempting to correct the errors of the previous ones.
from xgboost import XGBClassifier
# XGBClassifier is the xgboost library's classifier; it is created here with
# its default hyperparameters. This rebinds `classifier`, replacing the model
# that name referred to before.
classifier = XGBClassifier()
# Train on the training split: X_train holds the input features and y_train
# the corresponding encoded class labels.
classifier.fit(X_train, y_train)

#Adding grid search to find the best accuracies and best parameters

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the XGBoost classifier. The previous grid used the
# SVC parameters 'C' and 'kernel', which XGBoost ignores ("Parameters ... are
# not used"), so the search was not tuning anything meaningful.
parameters = [{
    'n_estimators': [100, 200, 300],    # number of boosted trees
    'max_depth': [3, 4, 5],             # maximum depth of each tree
    'learning_rate': [0.05, 0.1, 0.3],  # shrinkage applied to each tree
}]

# Exhaustively evaluate every combination with 10-fold cross-validation on the
# training set, scoring by accuracy; n_jobs=-1 uses all available CPU cores.
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best combination;
# best_params_ is that combination. GridSearchCV works on clones, so
# `classifier` itself is left unchanged; the refitted winner is available as
# grid_search.best_estimator_.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Parameters: { "C", "kernel" } are not used.

Accuracy: 97.07 %
Best Parameters: {'C': 0.25, 'gamma': 0.9, 'kernel': 'rbf'}


#Visualising the Training set results

In [None]:
from matplotlib.colors import ListedColormap

# Plot the classifier's decision regions over the first two feature columns
# and overlay the training samples, colored by their encoded class.
X_set, y_set = X_train, y_train
class_colors = ListedColormap(('red', 'green'))

# Use a fixed number of grid points per axis so the mesh size stays bounded
# whatever the feature scale is; a fixed 0.01 step on wide-ranging, unscaled
# features can produce an enormous grid.
GRID_POINTS = 500
X1, X2 = np.meshgrid(
    np.linspace(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, GRID_POINTS),
    np.linspace(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, GRID_POINTS))

# The grid passed to predict() has exactly two columns, so this plot only
# works when the classifier was trained on two features.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
grid_classes = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_classes, alpha=0.75, cmap=class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # color= (not c=) so the RGBA tuple is read as one color for the whole
    # group rather than as per-point values to be color-mapped.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_colors(i), label=j)

# The plotted model is the XGBoost classifier, not K-NN.
plt.title('XGBoost (Training set)')
# NOTE(review): the axis labels assume the first two columns are age and
# estimated salary - confirm against the dataset.
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()

#Visualising the Test set results

In [None]:
# Imported here as well so this cell does not depend on the previous one
# having been run first.
from matplotlib.colors import ListedColormap

# Plot the classifier's decision regions over the first two feature columns
# and overlay the held-out test samples, colored by their encoded class.
X_set, y_set = X_test, y_test
class_colors = ListedColormap(('red', 'green'))

# Use a fixed number of grid points per axis so the mesh size stays bounded
# whatever the feature scale is; a fixed 0.01 step on wide-ranging, unscaled
# features can produce an enormous grid.
GRID_POINTS = 500
X1, X2 = np.meshgrid(
    np.linspace(X_set[:, 0].min() - 1, X_set[:, 0].max() + 1, GRID_POINTS),
    np.linspace(X_set[:, 1].min() - 1, X_set[:, 1].max() + 1, GRID_POINTS))

# The grid passed to predict() has exactly two columns, so this plot only
# works when the classifier was trained on two features.
grid_points = np.column_stack((X1.ravel(), X2.ravel()))
grid_classes = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_classes, alpha=0.75, cmap=class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # color= (not c=) so the RGBA tuple is read as one color for the whole
    # group rather than as per-point values to be color-mapped.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_colors(i), label=j)

# The plotted model is the XGBoost classifier, not K-NN.
plt.title('XGBoost (Test set)')
# NOTE(review): the axis labels assume the first two columns are age and
# estimated salary - confirm against the dataset.
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.legend()
plt.show()