# Classification template

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns   #visualisation
import matplotlib.pyplot as plt    #visualisation
%matplotlib inline
#to display the plots immediately below the code
sns.set(color_codes=True)
#to enable us to use shorthand color codes

## Load dataset

**Read dataset from drive**

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so its files can be reached by path.
drive.mount('/content/drive')

In [None]:
# After executing the cell above, Drive
# files will be present in "/content/drive/My Drive".
# Listing the datasets folder shows which files are available to load.
!ls "/content/drive/My Drive/datasets"

In [None]:
# Read the CSV through the absolute path of the mount point used above, so the
# load does not depend on the notebook's current working directory.
# Replace (dataset_name) with the actual file name.
data=pd.read_csv('/content/drive/My Drive/datasets/(dataset_name)')

**Read dataset from github**

In [None]:
# General form: data = pd.read_csv('<raw GitHub link to the CSV file>')
# The line below is a concrete example that loads IRIS.csv.
data=pd.read_csv('https://raw.githubusercontent.com/amankharwal/Website-data/master/IRIS.csv')

**Import and read excel data**

In [None]:
from google.colab import files
# The result is indexed by file name in the next cell to get the file content.
uploaded = files.upload()

In [None]:
import io
# Wrap the uploaded content in an in-memory buffer so pandas can read it.
# Replace (file_name) with the name of the uploaded file.
data=pd.read_excel(io.BytesIO(uploaded['(file_name).xlsx']))
data

## Exploratory data analysis

In [None]:
data.head() #show the first five rows of the dataset (head() returns 5 rows by default)

In [None]:
data.tail() #show the last rows of the dataset (tail() returns 5 rows by default)

In [None]:
data.dtypes #check the data type of every column

In [None]:
# Dimensions of the dataset as (rows, columns).
print(data.shape,'\n')
# describe must be called: without the parentheses only the bound method
# object is printed, not the summary statistics.
print(data.describe()) #summary statistics of the dataset

In [None]:
data['column_name'].count() #number of non-missing entries in the column; replace column_name with a real column name

## Data Preprocessing

**Dropping irrelevant columns**

In [None]:
#list the irrelevant columns of your dataset here; 'Id' is the example used in this template
data=data.drop(['Id'],axis=1)   #axis=1 implies we are dropping a column
data.head()

**Renaming the columns**

In [None]:
#this can be used to give the columns short alternative names so they are easier to access
#the mapping is {old_name: new_name}; the plotting cells further down use the short names 'PL' and 'SL'
data=data.rename(columns={'SepalLengthCm':'SL','SepalWidthCm':'SW','PetalLengthCm':'PL','PetalWidthCm':'PW'})  #change the names according to the dataset imported
data.head()

**Dropping duplicate rows**

In [None]:
# Rebuild the frame without repeated rows, but only when at least one exists.
has_duplicate_rows = data.duplicated().any()
if has_duplicate_rows:
    data = data.drop_duplicates()

**Dropping null values**

data.isna(): Returns a DataFrame of the same shape as data, with True indicating the presence of a NaN in each cell.

data.isna().any(): For each column, it checks if there are any True values (i.e., any NaN values). This results in a Series where each entry is True if the column contains at least one NaN.

data.isna().any().any(): The second .any() checks across the entire Series of columns, returning True if any column has at least one missing value. Essentially, this checks if there's any missing value in the entire DataFrame.

data.dropna(): If any missing value is found, dropna() is called to remove all rows containing NaNs.

In [None]:
# First reduce the NaN mask to one flag per column, then to one flag for the
# whole frame; rows with NaNs are removed only when that flag is set.
columns_with_nan = data.isna().any()
if columns_with_nan.any():
    data = data.dropna()

**OR Run this to see what each function does**

In [None]:
#data.isna()

In [None]:
#data.isna().any()

In [None]:
#na=data.isna().any().any()

In [None]:
# Uncomment both lines together: the indented body must not stay active while
# its "if" is commented out, otherwise the cell raises an IndentationError.
#if na:
#  data = data.dropna()

**Classifying the data**

In [None]:
#finding the different values the target variable can take (these are the labels to encode in the next cell)
data['(Target_variable)'].unique() #replace (Target_variable) with the actual name of the target column

In [None]:
# Map every class label to an integer code. Each key must be a different label
# of the target column: repeated keys in a dict literal silently overwrite one
# another and would leave only a single mapping.
data['(Target_variable)']=data['(Target_variable)'].replace({'(Target_variable_value_1)':1, '(Target_variable_value_2)':2, '(Target_variable_value_3)':3})
data.tail()

## Data Visualisation

In [None]:
# Number of samples per class of the target column.
class_counts = data["(Target_variable)"].value_counts()
print(class_counts)

# Scatter two columns against each other, one colour per "Species" value.
plot = sns.FacetGrid(data, hue="Species", height=5)
plot.map(plt.scatter, "(column_name)", "(column_name)")
plot.add_legend()
plot.set_xlabels('(label)') #set the label name
plot.set_ylabels('(label)') #set the label name
plot

**Plotting different features against one another**

In [None]:
# Integer code per species; used both as the y position and as the colour.
species_codes = data['Species'].astype('category').cat.codes
plt.scatter(data['PL'], species_codes, c=species_codes, cmap='rainbow')
plt.xlabel('PetalLengthCm')
plt.ylabel('Species')
plt.show()

In [None]:
# Integer code per species; used both as the y position and as the colour.
species_codes = data['Species'].astype('category').cat.codes
plt.scatter(data['SL'], species_codes, c=species_codes, cmap='rainbow')
plt.xlabel('SepalLengthCm')
plt.ylabel('Species')
plt.show()

## Test-Train-Split

**This function randomly splits the data into the desired proportions (e.g., 80% for training, 20% for testing).**

In [None]:
#importing necessary modules and functions
from sklearn.model_selection import train_test_split

In [None]:
# Separate the label column (y) from the predictor columns (x).
target_column = 'target_variable' #enter the name of the target variable here
y = data[target_column]
x = data.drop(columns=target_column)


In [None]:
## Hold out 20% of the rows for testing; random_state fixes the shuffle
split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split_parts
# Entry counts of each part, shown as the cell output.
tuple(part.count() for part in split_parts)

In [None]:
# Preview the first rows of the training features and of their labels.
x_train.head(),y_train.head()

## Model Selection and Training

**Different models have different strengths and weaknesses, and selecting the right one can significantly impact performance.**

**Logistic Regression**

Logistic regression is a supervised machine learning algorithm used for classification tasks, where the goal is to predict the probability that an instance belongs to a given class. It is a statistical algorithm which analyzes the relationship between two data factors.

import required modules

In [None]:
from sklearn.linear_model import LogisticRegression

Training

In [None]:
#model = LogisticRegression(max_iter=300)    #uncomment this block if logistic regression is the model chosen
#model.fit(x_train,y_train)

**Support Vector Machine**

Support Vector Machine (SVM) is a supervised machine learning algorithm used for both classification and regression. Although it can handle regression problems as well, it is best suited for classification. The main objective of the SVM algorithm is to find the optimal hyperplane in an N-dimensional space that separates the data points of the different classes in the feature space.

In [None]:
from sklearn import svm       #uncomment this block if svm is the model chosen

In [None]:
#model=svm.SVC(kernel='linear')

In [None]:
#model.fit(x_train,y_train)

**Decision Tree**

A decision tree is a tree-like model used in machine learning and decision analysis to visually and explicitly represent decisions and their possible consequences.

In [None]:
from sklearn.tree import DecisionTreeClassifier    #uncomment the cells below if decision tree is the model chosen

In [None]:
## Create a decision tree classifier
#model = DecisionTreeClassifier()

In [None]:
## Train the classifier on the training data
#model.fit(x_train, y_train)

## Model Prediction and Evaluation

### Decision Tree visualisation

In [None]:
from sklearn.tree import plot_tree # Import plot_tree from sklearn.tree
import matplotlib.pyplot as plt    # Import matplotlib.pyplot as plt


In [None]:
plt.figure(figsize=(14, 9))

In [None]:
# Get feature and target column names
feature_names = data.columns[:-1] # Assuming the last column is the target
target_names = data['Species'].unique() # Replace 'target_column' with your actual target column name

In [None]:
plot_tree(model, feature_names=feature_names, class_names=target_names, filled=True, rounded=True)
plt.show()

### SVM Intuition

While plotting the decision function of classifiers for toy 2D datasets can help get an intuitive understanding of their respective expressive power, be aware that those intuitions don’t always generalize to more realistic high-dimensional problems.

We only consider the first 2 features of this dataset:

Sepal length

Sepal width

This code is an example: it loads, and works on, the iris dataset that is already available in the sklearn library.

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay

## Load the bundled iris data and keep only its first two features, so the
## decision regions can be drawn in a plane.
## NOTE: this cell rebinds the notebook-level names X and y.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target

# The data is deliberately left unscaled since we want to plot the
# support vectors.
C = 1.0  # SVM regularization parameter
estimators = (
    svm.SVC(kernel="linear", C=C),
    svm.LinearSVC(C=C, max_iter=10000)
)
# Lazy: each classifier is fitted when the plotting loop reaches it.
models = (estimator.fit(X, y) for estimator in estimators)

# One title per classifier, in the same order as above.
titles = (
    "SVC with linear kernel",
    "LinearSVC (linear kernel)"
)

# One row with two axes, one per classifier.
fig, sub = plt.subplots(1, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X[:, 0], X[:, 1]
x_axis_name = iris.feature_names[0]
y_axis_name = iris.feature_names[1]

for ax, title, clf in zip(sub.flatten(), titles, models):
    # Colour the plane by the class the fitted classifier predicts.
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel=x_axis_name,
        ylabel=y_axis_name,
    )
    # Overlay the samples, coloured by their true class.
    ax.scatter(X0, X1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

Here we are working with a dataset that can be uploaded from various sources as shown above.

In [None]:
import matplotlib.pyplot as plt

from sklearn import datasets, svm
from sklearn.inspection import DecisionBoundaryDisplay

# Take the first two columns as features and the last column as the target.
# NOTE: this cell rebinds the notebook-level names X and y.
X = data.iloc[:, :2]
y = data.iloc[:, -1]

# we create an instance of SVM and fit our data. We do not scale our
# data since we want to plot the support vectors
C = 1.0  # SVM regularization parameter
models = (
    svm.SVC(kernel="linear", C=C),
    svm.LinearSVC(C=C, max_iter=10000)
)
# Lazy: each classifier is fitted when the plotting loop reaches it.
models = (clf.fit(X, y) for clf in models)

# title for the plots
titles = (
    "SVC with linear kernel",
    "LinearSVC (linear kernel)"
)

# Set-up a 1x2 grid for plotting (one axes per classifier).
fig, sub = plt.subplots(1, 2)
plt.subplots_adjust(wspace=0.4, hspace=0.4)

X0, X1 = X.iloc[:, 0], X.iloc[:, 1]
# scatter() needs numeric values to map through a colormap, so class labels
# (which may be strings) are converted to integer category codes first.
point_colors = y.astype('category').cat.codes

for clf, title, ax in zip(models, titles, sub.flatten()):
    disp = DecisionBoundaryDisplay.from_estimator(
        clf,
        X,
        response_method="predict",
        cmap=plt.cm.coolwarm,
        alpha=0.8,
        ax=ax,
        xlabel=(data.columns[0]),
        ylabel=(data.columns[1]),
    )
    ax.scatter(X0, X1, c=point_colors, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)

plt.show()

### Evaluation Techniques

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [None]:
# Make predictions on the test data
y_pred = clf.predict(x_test)

In [None]:
print(y_pred)

In [None]:
print(y_test)

Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on the complete feature set the model is trained on.
# The full data is rebuilt from the train/test split because the names X and y
# are rebound to two-feature demo data by the SVM intuition cells.
cv_features = pd.concat([x_train, x_test])
cv_labels = pd.concat([y_train, y_test])

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, cv_features, cv_labels, cv=5)

print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", cv_scores.mean())

Accuracy

In [None]:
# Compare the predicted labels with the true test labels.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

Precision

In [None]:
# average='weighted' is used so the score also works for multi-class targets.
precision = metrics.precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Precision: {precision}")

Recall(Sensitivity)

In [None]:
# average='weighted' is used so the score also works for multi-class targets.
recall = metrics.recall_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"Recall: {recall}")

F1-Score

In [None]:
# average='weighted' is used so the score also works for multi-class targets.
f1 = metrics.f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f"F1-Score: {f1}")

ROC Curve

In [None]:
from sklearn.metrics import roc_curve, auc

# Predict class probabilities on the test features created by the split
# (x_test); column i of the result belongs to model.classes_[i].
# Note: svm.SVC only provides predict_proba when built with probability=True.
y_probs = model.predict_proba(x_test)
class_labels = model.classes_

# roc_curve handles binary targets only, so each curve is computed
# one-vs-rest. A two-class problem needs just the positive-class curve.
if len(class_labels) == 2:
    curve_columns = [1]
else:
    curve_columns = range(len(class_labels))

# Plot ROC curve(s)
plt.figure(figsize=(8, 6))
for column in curve_columns:
    # 1 where the sample belongs to this class, 0 for every other class.
    is_positive = (y_test == class_labels[column]).astype(int)
    fpr, tpr, thresholds = roc_curve(is_positive, y_probs[:, column])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve for {class_labels[column]} (AUC = {roc_auc:.2f})')
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

Confusion Matrix

In [None]:
#the confusion matrix summarizes the performance of a machine learning model on a set of test data
#it is computed here from the true test labels (y_test) and the predicted labels (y_pred)
c_matrix=confusion_matrix(y_test,y_pred)
c_matrix

We visualise it using a heat map

In [None]:
plt.figure(figsize=(8, 6))
# Plot the matrix computed in the previous cell; it is stored as c_matrix
# (no variable named cm is defined above).
# annot=True writes the count into each cell, fmt='d' formats it as an integer.
sns.heatmap(c_matrix, annot=True, fmt='d')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()