<a href="https://colab.research.google.com/github/nowshine-sharmili-piuli/Artificial-Intelligence-Machine-Learning-Gen-AI-Concept-and-Applications/blob/main/KNN_C223285.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><b>K-Nearest Neighbors</b></h1>

Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

Step 2: Load the dataset

In [None]:
data=pd.read_csv("/content/sample_data/spam_ham_dataset.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [None]:
data.shape

(5171, 4)

Step 3: Preparing Features (X) and Target (Y)

In [None]:
X=data['text']
Y=data['label']

print("Features (X):")
print(X.head())
print("\nTarget (Y):")
print(Y.head())

Features (X):
0    Subject: enron methanol ; meter # : 988291\r\n...
1    Subject: hpl nom for january 9 , 2001\r\n( see...
2    Subject: neon retreat\r\nho ho ho , we ' re ar...
3    Subject: photoshop , windows , office . cheap ...
4    Subject: re : indian springs\r\nthis deal is t...
Name: text, dtype: object

Target (Y):
0     ham
1     ham
2     ham
3    spam
4     ham
Name: label, dtype: object


Step 4: Split data into Training and Testing Sets

In [None]:
#60% for training ,40% for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=50)

print(f'Training sets: {len(X_train)}')
print(f'Testing sets: {len(X_test)}\n')


Training sets: 3102
Testing sets: 2069



Step 5: Scale the Features<b>(Important for KNN)</b>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Create a TfidfVectorizer to convert text into numerical features
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training data and transform both training and testing data
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create a StandardScaler (using with_mean=False to handle sparse matrices efficiently if needed)
scaler = StandardScaler(with_mean=False)

# Fit the scaler on the vectorized training data
X_train_Scaled = scaler.fit_transform(X_train_vectorized)

# Transform the vectorized test data using the same scaler
X_test_Scaled = scaler.transform(X_test_vectorized)

print("Before scaling (first document original text):")
print(X_train.iloc[0])

print("\n\nAfter scaling (first document, first 10 features of the scaled vector):")
# Convert to dense array for printing a portion if it's a sparse matrix
print(X_train_Scaled[0].toarray()[0, :10] if hasattr(X_train_Scaled[0], 'toarray') else X_train_Scaled[0][:10])

print("\n\n\nAfter scaling (first test document, first 10 features of the scaled vector):")
print(X_test_Scaled[0].toarray()[0, :10] if hasattr(X_test_Scaled[0], 'toarray') else X_test_Scaled[0][:10])

Before scaling (first document original text):
minnesota , which can clinch a wild - card
playoff spot with a loss by either carolina or st . louis this weekend , appeared on
its way to retaking the lead . but a holding penalty on birk - - the vikings were
flagged nine times for 78 yards - - wiped out a 16 - yard run by michael bennett that
the vikings ( 8 - 7 ) , though , couldn ' t
get what they needed from a pass defense that has struggled all season .
government spokesman raanan gissin
said four soldiers were killed .
six people were taken to hospital - -
four badly hurt , one with moderate injuries and one lightly injured , military
sources said .
the sources said another soldier
remained beneath the rubble .
gissin said rescue operations were
continuing sunday night .
the attack " indicates that unless
there is decisive and sustained effort taken to dismantle the terrorist
organization , it will be impossible to move towards normalizations and towards
political 

Step 6: Create and Train the KNN Model

In [None]:
#KNN model with K=10
model=KNeighborsClassifier(n_neighbors=10)

#Train the model
model.fit(X_train_Scaled,Y_train)
print("KNN Model trained successfully")
print("Number of neighbors(K) = ",model.n_neighbors)

KNN Model trained successfully
Number of neighbors(K) =  10


Step 7: Make Predictions

In [None]:
#predict on scaled Data
Y_pred=model.predict(X_test_Scaled)

print("Predicted values:", list(Y_pred))
print("Actual values:", list(Y_test))

Predicted values: ['spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'ham', 'spam', 'ham', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'ham', 'spam', 'spam', 'spam', 'spam', 'spam', 'spam', 'spa

Step 8: Evaluate the Model

In [None]:
#calculate accuracy
accuracy= accuracy_score(Y_test,Y_pred)
print("Accuracy:",round(accuracy*100,2), "%")

Accuracy: 38.91 %


In [None]:
#confusion matrix
print("Confusion Matrix:")
confusion_matrix(Y_test,Y_pred)

Confusion Matrix


array([[ 208, 1264],
       [   0,  597]])

In [None]:
#detailed Classification report
print("Classification Report:")
print(classification_report(Y_test,Y_pred))

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.14      0.25      1472
        spam       0.32      1.00      0.49       597

    accuracy                           0.39      2069
   macro avg       0.66      0.57      0.37      2069
weighted avg       0.80      0.39      0.32      2069



Step 9: Find the Best K value

In [None]:
#try diff K values and work best
print("Testing different K values: \n")

for k in [2,3,4,6,34,5,7,9,1,8]:
  #Create model
  tem_model=KNeighborsClassifier(n_neighbors=k)

  #Train
  tem_model.fit(X_train_Scaled,Y_train)

  #predict
  tem_pred=tem_model.predict(X_test_Scaled)

  #calculate accuracy
  tem_accuracy=accuracy_score(Y_test,Y_pred)

  print("K =",k,", Accuracy:",round(tem_accuracy*100,2), "%")

Testing different K values: 

K = 2 , Accuracy: 38.91 %
K = 3 , Accuracy: 38.91 %
K = 4 , Accuracy: 38.91 %
K = 6 , Accuracy: 38.91 %
K = 34 , Accuracy: 38.91 %
K = 5 , Accuracy: 38.91 %
K = 7 , Accuracy: 38.91 %
K = 9 , Accuracy: 38.91 %
K = 1 , Accuracy: 38.91 %
K = 8 , Accuracy: 38.91 %


Step 10: Predict for a new email

In [None]:
#new email
new_mail = [
    "Subject: Claim your prize now! You have won a lottery. Click here to get your reward.",
    "Subject: Meeting reminder for tomorrow at 10 AM. Please review the attached document."
]

new_mail_vectorized = vectorizer.transform(new_mail)
new_mail_scaled = scaler.transform(new_mail_vectorized)

#predict
prediction=model.predict(new_mail_scaled)
print(prediction)

if prediction[0] == 'spam':
    print("Prediction: mail is Spam")
else:
    print("Prediction: mail is Ham")

['spam' 'spam']
Prediction: mail is Spam
