In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from mpl_toolkits import mplot3d
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.datasets import load_iris
import seaborn as sns
from scipy.stats import kde

## ***EXPLORATORY DATA ANALYSIS***

In [None]:
# Load the Iris dataset from CSV into a DataFrame.
# NOTE(review): the absolute path looks like a Google Drive mount inside Colab — confirm,
# and consider a configurable data path so the notebook also runs elsewhere.
iris = pd.read_csv('/content/drive/MyDrive/Iris.csv')

In [None]:
# Column labels of the loaded DataFrame
iris.keys()

In [None]:
# Per-column dtype and non-null count summary
iris.info()

In [None]:
# Preview the first rows of the data
iris.head()

In [None]:
# (number of rows, number of columns)
iris.shape

In [None]:
# Number of samples per species (class balance check)
iris['Species'].value_counts()

# ***CLEANING THE DATA SET***

In [None]:
# Count the null entries in every column
missing = iris.isna().sum()
print(missing)

# Report only the columns that actually contain nulls, if there are any
columns_with_nulls = missing[missing > 0]
if columns_with_nulls.empty:
    print('\nThere are no missing values in the dataset')
else:
    print("\nMissing values found in the dataset:")
    print(columns_with_nulls)

In [None]:
# Number of distinct values in each column
iris.nunique()

# ***IDENTIFY DUPLICATES***

In [None]:
# Count fully identical rows (all columns compared).
# NOTE(review): the 'Id' column is still present at this point (it is dropped in a later
# cell); if 'Id' is unique per row, no row can ever be flagged as a duplicate here —
# confirm whether duplicates should be checked on the measurement columns instead.
duplicates = iris.duplicated().sum()
print(f"Number of duplicate rows = {duplicates}")

# Remove duplicate rows in place (keeps the first occurrence) and re-check the count
print("After dropping duplicates")
iris.drop_duplicates(inplace=True)
print(f"Number of duplicate rows = {iris.duplicated().sum()}")

# ***PRE - PROCESSING THE DATA SET***

In [None]:
# Remove the 'Id' column, which is a row identifier rather than a feature.
# errors='ignore' makes the drop a no-op when the column is already gone, so this
# cell can be re-run without raising KeyError (matching the "only if it exists" intent).
iris = iris.drop(columns=['Id'], errors='ignore')
iris.head()

# ***VISUALIZATIONS***

In [None]:
 # Create pairplot to visualize relationships between features
# Pairwise scatter plots of every feature pair, coloured by species;
# the diagonal shows per-feature histograms.
sns.pairplot(data=iris, diag_kind='hist', hue='Species')
plt.suptitle('Iris Features Pairplot', y=1.02)

In [None]:
# Petal length against petal width, one colour per species
petal_ax = sns.scatterplot(
    data=iris,
    x='PetalLengthCm',
    y='PetalWidthCm',
    hue='Species',
    palette='Set2',
)

# Axis labels and title are set on the axes the scatter plot was drawn on
petal_ax.set_xlabel('Petal Length (cm)')
petal_ax.set_ylabel('Petal Width (cm)')
petal_ax.set_title('Scatter Plot of Petal Length vs. Petal Width')

# Render the figure
plt.show()

In [None]:
# Count the samples per species. The explicit rename fixes the column names
# regardless of what reset_index() calls them.
Species = iris['Species'].value_counts().reset_index()
Species.columns = ['Species', 'count']

# Create the pie chart. Labels are taken from the same table as the wedge sizes, so
# every wedge is labelled with its own species even when value_counts() orders the
# classes differently from a hand-written list. `explode` is sized from the data so
# the cell keeps working if the number of classes changes.
plt.figure(figsize=(8, 8))
plt.pie(Species['count'],
        labels=Species['Species'],
        autopct='%1.1f%%',
        explode=[0.05] * len(Species),
        shadow=True,
        startangle=90)

plt.title('Distribution of Iris Flower Species')
plt.legend(title="Iris flower species", loc='upper left')
plt.show()

In [None]:
# Pairwise correlations between the columns, excluding the 'Species' label
feature_corr = iris.drop(columns=['Species']).corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(feature_corr, cmap='coolwarm', annot=True)
plt.title('Feature Correlation Matrix')

In [None]:
# Histogram of each numeric column in the DataFrame
iris.hist()
plt.show()

## ***ENCODE THE TARGET VARIABLE***

In [None]:
# Replace the 'Species' column with integer category codes (one code per distinct value)
species_as_category = iris['Species'].astype('category')
iris['Species'] = species_as_category.cat.codes

## ***SPLIT THE DATA SET***

In [None]:
# Separate the encoded class label from the predictor columns
y = iris['Species']
X = iris.drop(columns=['Species'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training split only, then apply them to both
# splits so no information from the test rows enters the scaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Shape of the standardized training matrix: (n_samples, n_features)
X_train.shape

### ***Train and Evaluate Machine Learning Models***










# ***LOGISTIC REGRESSION***

In [None]:
from sklearn.preprocessing import StandardScaler


In [None]:
# Scale the data using StandardScaler (fit on the training split, applied to both splits).
# NOTE: X_train / X_test were already standardized in the split cell, so this second
# pass is effectively an identity transform; it mainly introduces the *_scaled names
# that the model cells below use.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Search space for logistic regression:
#   C       - 100 inverse-regularization strengths, log-spaced from 1e-4 to 1e4
#   penalty - 'l2' only, the penalty used together with the 'lbfgs' solver
#   solver  - 'lbfgs' only
param_dist = dict(
    C=np.logspace(-4, 4, 100),
    penalty=['l2'],
    solver=['lbfgs'],
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a Logistic Regression model; max_iter=1000 gives the solver a generous
# iteration budget so it can converge for every C value tried in the search
logistic_regression = LogisticRegression(max_iter=1000)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Randomized hyperparameter search with 5-fold cross-validation on the training split.
# error_score='raise' makes a failing candidate raise instead of being scored silently.
random_search = RandomizedSearchCV(
    estimator=logistic_regression,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    random_state=42,
    error_score='raise',
)
random_search.fit(X_train_scaled, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated accuracy
best_params = random_search.best_params_

In [None]:
# Take the best estimator found by the search and fit it on the scaled TRAINING split
# (not the entire dataset — the test split stays held out for evaluation).
# With the search's default refit behaviour, best_estimator_ has already been fitted
# on this same data, so this call simply repeats that fit.
best_model = random_search.best_estimator_
best_model.fit(X_train_scaled, y_train)

In [None]:
# Logistic-regression predictions for the held-out test split
y_pred=best_model.predict(X_test_scaled)

In [None]:
# Build an (n_test, 2) table for eyeballing: column 0 = predicted label, column 1 = true label
y_test_array = y_test.to_numpy()

# Turn both 1-D label vectors into column vectors
y_pred_reshaped = y_pred.reshape(-1, 1)
y_test_reshaped = y_test_array.reshape(-1, 1)

# Place the two columns side by side
result = np.hstack((y_pred_reshaped, y_test_reshaped))

print(result)

In [None]:
# Evaluate the best model on the test set
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) for logistic regression
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Fraction of test samples classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Accuracy on test set: {accuracy:.2f}")


In [None]:
from sklearn.metrics import precision_score

# Precision of the logistic-regression predictions, averaged over classes
# with each class weighted by its number of true samples
precision = precision_score(y_test, y_pred, average='weighted')

print("Precision:", precision)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Class labels for the heatmap ticks. 'Species' was already converted to integer
# codes in the encoding cell, so classes_ holds those codes, not the species names.
# (scikit-learn is imported at the top of the notebook, so no install step is needed.)
label_encoder = LabelEncoder()
label_encoder.fit(iris['Species'])

# Plot the confusion matrix currently held in `cm` (rows: true class, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support for the logistic-regression predictions
report = classification_report(y_test, y_pred)

print(report)

# ***K-NEAREST NEIGHBORS (KNN)***

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV # Import RepeatedStratifiedKFold and GridSearchCV

In [None]:
# Hyperparameter search for K-Nearest Neighbors.
# Candidate values: odd k from 1 to 19, two weighting schemes, three distance metrics
# ('minkowski' with its default power parameter is the same distance as 'euclidean').
model = KNeighborsClassifier()
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
# define grid search: 10-fold stratified CV repeated 3 times, scored by accuracy
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# Tune on the scaled TRAINING split only. Searching on the full, unscaled X/y would
# let the held-out test rows influence the chosen hyperparameters (data leakage) and
# would tune on differently-scaled features than the final model is trained on.
grid_result = grid_search.fit(X_train_scaled, y_train)
# summarize results: best configuration first, then every candidate's mean (std) CV accuracy
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Fitting the KNeighborsClassifier with the best parameters from the grid search.
# Note: this rebinds `best_params`, which previously held the logistic-regression result.
best_params = grid_result.best_params_
model_KNN = KNeighborsClassifier(**best_params)
model_KNN.fit(X_train_scaled, y_train)

In [None]:
# Predictions on the test set (held-out data)
y_pred_KNN = model_KNN.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_KNN)
print("Test Accuracy:", test_accuracy)

# Predictions on the training set (a large gap to the test accuracy would suggest overfitting)
y_train_pred_KNN = model_KNN.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred_KNN)
print("Training Accuracy:", train_accuracy)

In [None]:
# Confusion matrix on test set for KNN.
# confusion_matrix expects (y_true, y_pred): rows are true classes and columns are
# predicted classes, which is what the axis labels below state. Passing the arguments
# the other way round would transpose the matrix relative to those labels.
cm = confusion_matrix(y_test, y_pred_KNN)
sns.heatmap(cm, annot=True, fmt="d")
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Precision of the KNN predictions, averaged over classes weighted by class support.
# Note: this rebinds `precision`, which previously held the logistic-regression value.
precision = precision_score(y_test, y_pred_KNN, average='weighted')

print("Precision:", precision)

In [None]:

# Per-class precision, recall, F1 and support for the KNN predictions
report = classification_report(y_test, y_pred_KNN)

print(report)

# ***SUPPORT VECTOR MACHINE (SVM)***

In [None]:
from sklearn.svm import SVC

In [None]:
# Hyperparameter search for the Support Vector Machine classifier.
# Candidate values: three kernels, five regularization strengths, gamma fixed to 'scale'.
model = SVC()
kernel = ['poly', 'rbf', 'sigmoid']
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
# define grid search: 10-fold stratified CV repeated 3 times, scored by accuracy
grid = dict(kernel=kernel,C=C,gamma=gamma)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# Tune on the scaled TRAINING split only. Searching on the full, unscaled X/y would
# let the held-out test rows influence the chosen hyperparameters (data leakage) and
# would tune on differently-scaled features than the final model is trained on.
grid_result = grid_search.fit(X_train_scaled, y_train)
# summarize results: best configuration first, then every candidate's mean (std) CV accuracy
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [None]:
# Fitting the SVM with the best parameters from the grid search.
# Note: this rebinds `best_params` to the SVM search result.
best_params = grid_result.best_params_
model_SVM = SVC(**best_params)
model_SVM.fit(X_train_scaled, y_train)

In [None]:
# Predictions on the test set (held-out data)
y_pred_SVM = model_SVM.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred_SVM)
print("Test Accuracy:", test_accuracy)

# Predictions on the training set (a large gap to the test accuracy would suggest overfitting)
y_train_pred_SVM = model_SVM.predict(X_train_scaled)
train_accuracy = accuracy_score(y_train, y_train_pred_SVM)
print("Training Accuracy:", train_accuracy)

In [None]:
# Confusion matrix for the SVM on the test split (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred_SVM)

# Annotated heatmap; labels and title are set on the axes returned by seaborn
svm_cm_ax = sns.heatmap(cm, annot=True, fmt="d")
svm_cm_ax.set_xlabel('Predicted Label')
svm_cm_ax.set_ylabel('True Label')
svm_cm_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Precision of the SVM predictions, averaged over classes weighted by class support
precision_SVM = precision_score(y_test, y_pred_SVM, average='weighted')

print("Precision:", precision_SVM)

In [None]:
from sklearn.metrics import recall_score # Import recall_score

# Micro-averaged recall of the SVM predictions (pooled over all classes).
# `average` may also be 'macro' or 'weighted' for per-class averaging.
recall_score(y_test, y_pred_SVM, average='micro')

In [None]:
# Per-class precision, recall, F1 and support for the SVM predictions
report = classification_report(y_test, y_pred_SVM)

print(report)

# ***CHOOSE THE BEST MODEL***

In [None]:
# Collect each model's test-set predictions under a readable name.
# `y_pred` holds the logistic-regression predictions made earlier in the notebook.
model_predictions = {
    'Logistic Regression': y_pred,
    'KNN': y_pred_KNN,
    'SVM': y_pred_SVM,
}

# Test accuracy per model. A comprehension with its own loop variable is used so the
# notebook-level `y_pred` and `accuracy` are not overwritten by another model's values
# (which would silently corrupt any earlier cell that is re-run afterwards).
accuracy_scores = {
    model_name: accuracy_score(y_test, predictions)
    for model_name, predictions in model_predictions.items()
}

In [None]:
# Find the best model: highest test accuracy. Several models can share the top score,
# so collect every model that reaches it instead of silently reporting only one.
top_accuracy = max(accuracy_scores.values())
top_models = [name for name, score in accuracy_scores.items() if score == top_accuracy]

# The first model reaching the top score is kept as the nominal winner
best_model_name = top_models[0]

# Print the best model and its accuracy
print(f"The best model is: {best_model_name} with an accuracy of {accuracy_scores[best_model_name]:.2f}")

# Make ties explicit so the reader does not over-interpret the single name above
if len(top_models) > 1:
    print(f"Note: tied with {', '.join(top_models[1:])} at the same test accuracy")

# ***Conclusion***
Both Logistic Regression and KNN achieved an accuracy of 1.0 on the Iris test set, indicating that they correctly classified every instance in the held-out data. The notebook evaluates Logistic Regression, KNN, and SVM and prints their accuracies, classification reports, and confusion matrices.
It concludes by comparing the test accuracies of the models and reporting the one with the highest score.
This structured approach allows you to compare the performance of the models on the Iris dataset. Computational efficiency and suitability for the problem at hand should also be considered when choosing between them, although they are not measured in this notebook.