## K-Means Clustering (Library)

In [None]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt

# Load the data from CSV file
data = pd.read_csv("Mall_Customers.csv")

# Extract the two columns used for clustering
X = data[['Annual Income (k$)', 'Spending Score (1-100)']]

# Determine the number of clusters (you can modify this)
k = 5

# Perform K-means clustering.  The seed is fixed (same value as the other
# KMeans cell in this file) so the labels and plot colours are reproducible.
kmeans = KMeans(n_clusters=k, random_state=0)
kmeans.fit(X)
kmeans_labels = kmeans.labels_
kmeans_centroids = kmeans.cluster_centers_

# Perform Agglomerative clustering with the same number of clusters
agglomerative = AgglomerativeClustering(n_clusters=k)
agglomerative.fit(X)
agglomerative_labels = agglomerative.labels_


def _plot_clusters(title, labels, centroids=None):
    """Scatter-plot the points of ``X`` coloured by ``labels``.

    Parameters:
        title: text shown as the plot title.
        labels: one cluster label per row of ``X``; used as the colour value.
        centroids: optional 2-column array of cluster centres; when given,
            they are drawn on top as red ``X`` markers.
    """
    plt.scatter(X['Annual Income (k$)'], X['Spending Score (1-100)'],
                c=labels, cmap='viridis')
    if centroids is not None:
        plt.scatter(centroids[:, 0], centroids[:, 1], marker='X', color='red')
    plt.title(title)
    plt.xlabel('Annual Income (k$)')
    plt.ylabel('Spending Score (1-100)')
    plt.show()


# Visualize the results, one figure per algorithm
_plot_clusters('K-means Clustering', kmeans_labels, kmeans_centroids)
_plot_clusters('Agglomerative Clustering', agglomerative_labels)


## K-Means Clustering (No Library)

In [None]:
import numpy as np

# Toy 2-D dataset: one row per point
X = np.array([[1.1, 1.1], [1.5, 2.1], [3.1, 4.1], [
             5.1, 7.1], [3.5, 5.1], [4.5, 5.1], [3.5, 4.5]])

k = 2

# Seed the centroids with the first k points.  They are copied because a
# plain slice is a view: updating a centroid in place would otherwise
# overwrite the corresponding data points in X.
centroids = X[:k].copy()

labels = np.zeros(len(X), dtype=int)
distances = np.zeros((len(X), k))

max_iter = 100

for i in range(max_iter):

    # Assignment step: distance from every point to every centroid
    for j in range(k):
        distances[:, j] = np.linalg.norm(X - centroids[j], axis=1)

    new_labels = np.argmin(distances, axis=1)

    # Converged once the assignments stop changing between iterations.
    # The first pass is skipped because `labels` is only a placeholder then.
    if i > 0 and np.all(new_labels == labels):
        break
    labels = new_labels

    # Update step: move each centroid to the mean of its members.  A
    # centroid with no members keeps its position (the mean of an empty
    # selection would be NaN).
    for j in range(k):
        members = X[labels == j]
        if len(members) > 0:
            centroids[j] = np.mean(members, axis=0)

print("Labels:", labels)
print("Centroids:", centroids)


## K-Means Clustering (No Library) - 3 Features


In [None]:
# Six sample points, three features each
data = [
    [2.5, 4.5, 5.4],
    [3.8, 6.3, 9.8],
    [9.5, 8.6, 6.8],
    [4.7, 8.8, 4.2],
    [5.5, 3.1, 9.9],
    [2.1, 1.8, 7.8],
]

# Initial centroids: independent copies of the first two points of the dataset
centroids = [list(point) for point in data[:2]]


def calculate_distance(point1, point2):
    """Return the Euclidean distance between two points.

    The two sequences are walked in lockstep, so any coordinates beyond the
    length of the shorter one are ignored.
    """
    squared_diffs = [(left - right) ** 2
                     for left, right in zip(point1, point2)]
    return sum(squared_diffs) ** 0.5


def calculate_mean(points):
    """Return the coordinate-wise mean of a non-empty list of points.

    The number of coordinates is taken from the first point.  An empty
    ``points`` list raises ``IndexError``.
    """
    count = len(points)
    dimensions = len(points[0])
    means = []
    for axis in range(dimensions):
        axis_total = sum(row[axis] for row in points)
        means.append(axis_total / count)
    return means


# Run at most 10 k-means iterations, stopping early once nothing moves
for _ in range(10):
    # Assignment step: one bucket per centroid (not hard-coded to two)
    clusters = [[] for _centroid in centroids]
    for point in data:
        distances = [calculate_distance(point, centroid)
                     for centroid in centroids]
        closest_centroid = distances.index(min(distances))
        clusters[closest_centroid].append(point)

    # Update step: an empty cluster keeps its previous centroid, because
    # calculate_mean cannot average zero points.
    new_centroids = [
        calculate_mean(cluster) if cluster else centroid
        for cluster, centroid in zip(clusters, centroids)
    ]

    # Unchanged centroids mean further iterations would repeat this result
    if new_centroids == centroids:
        break
    centroids = new_centroids


print("Final Centroids:", centroids)

for i, (cluster_points, cluster_centroid) in enumerate(
        zip(clusters, centroids), start=1):
    print(f"Cluster {i} points: {cluster_points}")
    print(f"Cluster {i} centroid: {cluster_centroid}")
    print()


## K-Means Clustering (With Library) - 3 Features

In [None]:
from sklearn.cluster import KMeans

# Six sample points, three features each
data = [
    [2.5, 4.5, 5.4],
    [3.8, 6.3, 9.8],
    [9.5, 8.6, 6.8],
    [4.7, 8.8, 4.2],
    [5.5, 3.1, 9.9],
    [2.1, 1.8, 7.8],
]

# Build a seeded 2-cluster KMeans model, then fit it to the data
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(data)

# Per-point cluster index and the coordinates of the final centroids
cluster_assignments = kmeans.labels_
final_centroids = kmeans.cluster_centers_

# Report the centroids, then the members of each cluster in turn
print("Final Centroids:", final_centroids)

for i in range(2):
    points_in_cluster = []
    for point, cluster in zip(data, cluster_assignments):
        if cluster == i:
            points_in_cluster.append(point)
    print(f"Cluster {i+1} points: {points_in_cluster}")
    print(f"Cluster {i+1} centroid: {final_centroids[i]}")
    print()


## PCA (No Library)

In [None]:
import numpy as np

# Random integer dataset: 10 samples (rows) x 6 features (columns)
X = np.random.randint(10, 50, 60).reshape(10, 6)
print("Original Data: \n", X)

# Standardizing the features (zero mean, unit standard deviation per column)
feature_std = np.std(X, axis=0)
# A constant column has zero spread; dividing by 1 instead turns it into
# zeros rather than NaN/inf.
feature_std[feature_std == 0] = 1
X = (X - np.mean(X, axis=0)) / feature_std
print("\n Standardized Data: \n", X)

# Computing the covariance matrix (features are rows after the transpose)
covariance_matrix = np.cov(X.T)
print("\n Covariance Matrix: \n", covariance_matrix)


# Computing the eigenvectors and eigenvalues.  A covariance matrix is
# symmetric, so eigh is used: it is the solver for symmetric matrices and
# always returns real eigenvalues.
eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)
print("\n Eigenvectors: \n", eigenvectors)
print("\n Eigenvalues: \n", eigenvalues)


# Make a list of (eigenvalue, eigenvector) tuples; eigenvectors are columns
eigenpairs = [(np.abs(value), eigenvectors[:, index])
              for index, value in enumerate(eigenvalues)]

# Sort the (eigenvalue, eigenvector) tuples from high to low
eigenpairs.sort(key=lambda pair: pair[0], reverse=True)

print("\n Eigenpairs sorted by decreasing eigenvalues: \n", eigenpairs)


# We're choosing the top 2 eigenvectors.  reshape(-1, 1) turns each vector
# into a column regardless of how many features the data has.
n_components = 2
matrix_w = np.hstack([vector.reshape(-1, 1)
                      for _, vector in eigenpairs[:n_components]])

print('\n Matrix W:\n', matrix_w)


# Transforming the standardized dataset onto the selected components
X_pca = X.dot(matrix_w)
print("\n Transformed Data: \n", X_pca)


## Outlier Detection

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the retail data; the file is opened with ISO-8859-1 encoding
df = pd.read_csv('onlineretail.csv', encoding='ISO-8859-1')

# Boxplot of the raw 'Quantity' column, before any outlier removal
raw_quantity = df['Quantity']
sns.boxplot(x=raw_quantity)
plt.title("Qammar Mehmood 093")
plt.show()

# Define a function to remove outliers using the IQR (interquartile range)


def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value is not an outlier.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Parameters:
        df: pandas DataFrame to filter; it is not modified.
        column: name of the numeric column to test.
        factor: multiplier applied to the IQR to place the fences.  The
            default of 1.5 reproduces the original fixed behaviour.

    Returns:
        A DataFrame with the outlier rows dropped.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    # The mask is built as "is an outlier" and then negated, so rows whose
    # comparisons are both False (e.g. missing values) are kept.
    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df[~is_outlier]


# Drop the rows whose 'Quantity' value falls outside the IQR fences
df = remove_outliers(df, 'Quantity')

# Same boxplot as before, now drawn from the filtered data
filtered_quantity = df['Quantity']
sns.boxplot(x=filtered_quantity)
plt.title("Qammar Mehmood 093")
plt.show()


## Perceptron (OR Gate)

In [None]:
import numpy as np

# Step function


def step_function(x):
    """Map ``x`` to 1 where it is greater than or equal to zero, else 0."""
    is_non_negative = x >= 0
    return np.where(is_non_negative, 1, 0)

# Perceptron function


def perceptron(inputs, weights, bias):
    """Return the perceptron output for ``inputs``.

    The output is the step function applied to the dot product of
    ``inputs`` and ``weights`` plus ``bias``.
    """
    return step_function(np.dot(inputs, weights) + bias)


# OR gate truth table: each row is [input 1, input 2, expected output]
training_data = np.array([
    [0, 0, 0],
    [0, 1, 1],
    [1, 0, 1],
    [1, 1, 1],
])

# Starting weights, bias and learning rate
weights = np.array([0.1, 0.1])
bias = 0.2
learning_rate = 0.1

# Split the truth table into its input columns and its target column
gate_inputs = training_data[:, :2]
gate_targets = training_data[:, 2]

# Train the perceptron: 100 passes over the four rows
for _ in range(100):
    for inputs, target_output in zip(gate_inputs, gate_targets):
        # Current prediction for this row
        output = perceptron(inputs, weights, bias)

        # Perceptron rule: shift weights and bias by the signed error
        error = target_output - output
        weights += learning_rate * error * inputs
        bias += learning_rate * error

# Test the perceptron on all four input combinations
test_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

for inputs in test_data:
    output = perceptron(inputs, weights, bias)
    print(f"Input: {inputs}, Output: {output}")


## Perceptron (AND Gate)

In [None]:
import numpy as np

# Step function


def step_function(x):
    """Threshold activation: 1 where ``x >= 0``, 0 everywhere else."""
    fires = x >= 0
    return np.where(fires, 1, 0)

# Perceptron function


def perceptron(inputs, weights, bias):
    """Compute the step-activated weighted sum of ``inputs``.

    The weighted sum is the dot product of ``inputs`` and ``weights`` plus
    ``bias``; the result is passed through ``step_function``.
    """
    pre_activation = np.dot(inputs, weights) + bias
    activated = step_function(pre_activation)
    return activated


# AND gate truth table: each row is [input 1, input 2, expected output]
training_data = np.array([
    [0, 0, 0],
    [0, 1, 0],
    [1, 0, 0],
    [1, 1, 1],
])

# Starting weights, bias and learning rate
weights = np.array([0.1, 0.1])
bias = 0.2
learning_rate = 0.1

# Split the truth table into its input columns and its target column
gate_inputs = training_data[:, :2]
gate_targets = training_data[:, 2]

# Train the perceptron: 100 passes over the four rows
for _ in range(100):
    for inputs, target_output in zip(gate_inputs, gate_targets):
        # Current prediction for this row
        output = perceptron(inputs, weights, bias)

        # Perceptron rule: shift weights and bias by the signed error
        error = target_output - output
        weights += learning_rate * error * inputs
        bias += learning_rate * error

# Test the perceptron on all four input combinations
test_data = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])

for inputs in test_data:
    output = perceptron(inputs, weights, bias)
    print(f"Input: {inputs}, Output: {output}")


## MLP (Binary Classification)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from tensorflow import keras
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt

# Load the data (comma-delimited numeric file)
data = np.loadtxt('pima-indians-diabetes.csv', delimiter=',')

# Split into features (first 8 columns) and target (9th column)
X = data[:, 0:8]
y = data[:, 8]

# Split the data into train and test sets BEFORE scaling, so that no
# statistics from the test rows leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training rows only, then apply the same
# transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the model: two hidden ReLU layers and one sigmoid output unit
model = keras.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])


# Visualize the model architecture (written to an image file)
plot_model(model, show_shapes=True, show_layer_names=True,
           to_file='model_architecture.png')

# Display the model architecture in the notebook
model_img = plt.imread('model_architecture.png')
plt.figure(figsize=(10, 10))
plt.imshow(model_img)
plt.title("Qammar Mehmood 01-134202-093")
plt.axis('off')
plt.show()

# Compile the model
model.compile(optimizer='sgd', loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model on the scaled training data
model.fit(X_train_scaled, y_train, epochs=10, batch_size=32)

# Evaluate the model on the scaled test data
test_loss, test_acc = model.evaluate(X_test_scaled, y_test)

print(f'Test Loss: {test_loss:.5f}')
print(f'Test Accuracy: {test_acc:.5f}')


## MLP (Multiple Classification)

In [None]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('car_data.csv')

# Last column is the target; every other column is a feature
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Encoding categorical features: each column is replaced by integer codes
label_encoder = LabelEncoder()
for i in range(X.shape[1]):
    X[:, i] = label_encoder.fit_transform(X[:, i])

# Encode the class labels as integers
le = LabelEncoder()
y = le.fit_transform(y)

# Split the data into training and testing sets BEFORE scaling, so that no
# statistics from the test rows leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Standardize the features: fit on the training rows only, then apply the
# same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# One output unit per class found in the target column (not hard-coded)
num_classes = len(le.classes_)

# Define the MLP architecture
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu',
                          input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])


# Visualize the model architecture (written to an image file)
plot_model(model, show_shapes=True, show_layer_names=True,
           to_file='model_architecture.png')

# Display the model architecture in the notebook
model_img = plt.imread('model_architecture.png')
plt.figure(figsize=(10, 10))
plt.imshow(model_img)
plt.title("Qammar Mehmood 01-134202-093")
plt.axis('off')
plt.show()


# Compile the model; the sparse loss matches the integer-encoded labels
model.compile(optimizer='sgd',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, verbose=1)

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

print(f'Test Loss: {test_loss:.5f}')
print(f'Test Accuracy: {test_acc:.5f}')


## KNN (No Library)

In [None]:
import numpy as np

# Training table: three categorical attributes followed by the class label
data = [
    ['Small', 'Green', 'Irregular', 'No'],
    ['Large', 'Red', 'Irregular', 'Yes'],
    ['Large', 'Red', 'Circle', 'Yes'],
    ['Large', 'Green', 'Circle', 'No'],
    ['Large', 'Green', 'Irregular', 'No'],
    ['Small', 'Red', 'Circle', 'Yes'],
    ['Large', 'Green', 'Irregular', 'No'],
    ['Small', 'Red', 'Irregular', 'No'],
    ['Small', 'Green', 'Circle', 'No'],
    ['Large', 'Red', 'Circle', 'Yes'],
]

# Everything except the last entry of a row is a feature; the last is the label
features = np.array([attributes for *attributes, _ in data])
labels = np.array([label for *_, label in data])

# Define function to calculate Hamming distance


def hamming_distance(instance1, instance2):
    """Count the positions at which the two instances differ.

    The instances are compared pairwise in lockstep, so positions beyond
    the length of the shorter one are ignored.
    """
    mismatches = [left != right for left, right in zip(instance1, instance2)]
    return sum(mismatches)

# Define function to classify new instance using KNN


def knn_classify(k, train_features, train_labels, test_instance):
    """Classify ``test_instance`` by majority vote of its k nearest neighbours.

    Distance is the Hamming distance between the test instance and each
    training instance.

    Parameters:
        k: number of nearest neighbours that vote.
        train_features: sequence of training instances.
        train_labels: class label of each training instance, same order.
        test_instance: the instance to classify.

    Returns:
        The most frequent label among the k nearest neighbours.  When
        several labels are equally frequent, the one whose first vote comes
        from the nearest neighbour wins, so the result is deterministic.

    Raises:
        ValueError: if there are no neighbours to vote (k is 0 or the
            training set is empty).
    """
    # Local import keeps this function self-contained within its cell
    from collections import Counter

    # Pair every training label with its distance to the test instance
    distances = [
        (hamming_distance(instance, test_instance), label)
        for instance, label in zip(train_features, train_labels)
    ]
    # Sort by distance only; the sort is stable, so equally distant
    # neighbours keep their training order.
    distances.sort(key=lambda pair: pair[0])

    nearest_labels = [label for _, label in distances[:k]]
    if not nearest_labels:
        raise ValueError("no neighbours available to vote")

    # most_common orders equal counts by first occurrence, which avoids the
    # hash-order-dependent tie-break of max(set(...), key=...count).
    return Counter(nearest_labels).most_common(1)[0][0]


# Classify one unseen instance using its 3 nearest neighbours
k = 3
test_instance = np.array(['Small', 'Red', 'Circle'])
predicted_label = knn_classify(
    k=k,
    train_features=features,
    train_labels=labels,
    test_instance=test_instance,
)
print("Predicted class for the test instance:",
      test_instance, " : ", predicted_label)


## KNN (Library)

In [None]:
from sklearn.neighbors import KNeighborsClassifier   # using sklearn library
from sklearn.preprocessing import OrdinalEncoder     # using sklearn library

# Training table: four categorical attributes followed by the class label
data = [
    ['<=30', 'High', 'No', 'Fair', 'No'],
    ['<=30', 'High', 'No', 'Excellent', 'No'],
    ['31-40', 'High', 'No', 'Fair', 'Yes'],
    ['>40', 'Medium', 'No', 'Fair', 'Yes'],
    ['>40', 'Low', 'Yes', 'Fair', 'Yes'],
    ['>40', 'Low', 'Yes', 'Excellent', 'No'],
    ['31-40', 'Low', 'Yes', 'Excellent', 'Yes'],
    ['<=30', 'Medium', 'No', 'Fair', 'No'],
    ['<=30', 'Low', 'Yes', 'Fair', 'Yes'],
    ['>40', 'Medium', 'Yes', 'Fair', 'Yes'],
    ['<=30', 'Medium', 'Yes', 'Excellent', 'Yes'],
    ['31-40', 'Medium', 'No', 'Excellent', 'Yes'],
    ['31-40', 'High', 'Yes', 'Fair', 'Yes'],
    ['>40', 'Medium', 'No', 'Excellent', 'No'],
]

# Split each row into its attribute list and its trailing label
features = []
labels = []
for *attributes, outcome in data:
    features.append(attributes)
    labels.append(outcome)

# Turn the categorical attribute strings into numeric codes
encoder = OrdinalEncoder()
features_encoded = encoder.fit_transform(features)

# Build the 3-nearest-neighbour classifier and train it on the encoded rows
k = 3
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(features_encoded, labels)

# The instance to classify, encoded with the already-fitted encoder
test_instance = [['<=30', 'Medium', 'Yes', 'Fair']]
test_instance_encoded = encoder.transform(test_instance)

# Predict the class label; predict returns one entry per input row
predicted_label = knn.predict(test_instance_encoded)

print("Predicted class for the test instance:",
      test_instance, " : ", predicted_label[0])


## Naive Bayes (Dataset)

In [None]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB

# Create the dataset.  The third "age" entry previously read "30...40"; it is
# the same age group as the other "31...40" rows, and the mismatch created a
# spurious extra one-hot column for a single row.
dataSetStudent = {
    "age": ["<=30", "<=30", "31...40", ">40", ">40", ">40", "31...40", "<=30", "<=30", ">40", "<=30", "31...40", "31...40", ">40"],

    "income": ["high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"],

    "student": ["no", "no", "no", "no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"],

    "credit_rating": ["fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"],

    "buys_computer": ["no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"]
}
# print data in tabular form, with rows numbered from 1
print("Data:")
print(pd.DataFrame(dataSetStudent, columns=[
      "age", "income", "student", "credit_rating", "buys_computer"], index=range(1, 15), dtype="category", copy=True))

dataFrame = pd.DataFrame(dataSetStudent)

# One-hot encode the categorical features (and the target)
dataFrame = pd.get_dummies(dataFrame)

# Split the dataset into features and target variable.  The target is the
# "buys_computer_no" indicator, so a positive prediction means "no".
features = dataFrame.drop(["buys_computer_no", "buys_computer_yes"], axis=1)
y = dataFrame["buys_computer_no"]

# Create an instance of the Naïve Bayes classifier
model = GaussianNB()

# Train the model
model.fit(features, y)

# Define the test instance using the one-hot column names of the training data
test_instance = {
    "age_<=30": 1,
    "age_31...40": 0,
    "age_>40": 0,
    "income_high": 0,
    "income_low": 0,
    "income_medium": 1,
    "student_no": 0,
    "student_yes": 1,
    "credit_rating_excellent": 0,
    "credit_rating_fair": 1
}

# Convert the test instance to a single-row DataFrame
test_dataframe = pd.DataFrame([test_instance])

# Names of the indicator columns that are switched on for this instance
features_with_value_1 = test_dataframe.columns[test_dataframe.iloc[0] == 1].tolist(
)

print("\nTest Instances : ")
for feature in features_with_value_1:
    print("\t\t", f"{feature} = 1")


# Add any training column the test instance lacks, with value 0
for feature in features.columns:
    if feature not in test_dataframe.columns:
        test_dataframe[feature] = 0

# Reorder columns to match the training data
test_dataframe = test_dataframe[features.columns]

# Make predictions for the test instance
predictions = model.predict(test_dataframe)

# Convert the predictions back to original labels (target was the "no" flag)
predicted_labels = ["no" if pred == 1 else "yes" for pred in predictions]

print("\nPridiction = ", predicted_labels)


## Naive Bayes (Random Dataset)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Create the dataset
dataSet = {
    "Day": ["D1", "D2", "D3", "D4", "D5", "D6", "D7", "D8", "D9", "D10", "D11", "D12", "D13", "D14"],
    "Outlook": ["Sunny", "Sunny", "Overcast", "Rain", "Rain", "Rain", "Overcast", "Sunny", "Sunny", "Rain", "Sunny", "Overcast", "Overcast", "Rain"],
    "Temperature": ["Hot", "Hot", "Hot", "Mild", "Cool", "Cool", "Cool", "Mild", "Cool", "Mild", "Mild", "Mild", "Hot", "Mild"],
    "Humidity": ["High", "High", "High", "High", "Normal", "Normal", "Normal", "High", "Normal", "Normal", "Normal", "High", "Normal", "High"],
    "Wind": ["Weak", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Weak", "Weak", "Strong", "Strong", "Weak", "Strong"],
    "Play Tennis": ["No", "No", "Yes", "Yes", "Yes", "No", "Yes", "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No"]
}

# Convert the dataset to a DataFrame and show it with rows numbered from 1
dataFrame = pd.DataFrame(dataSet)
print("Data:")
print(pd.DataFrame(dataSet, columns=["Day", "Outlook", "Temperature",
      "Humidity", "Wind", "Play Tennis"], index=range(1, 15), dtype="category"))

# "Day" is a unique row identifier, not a weather attribute.  It is dropped
# before encoding; otherwise it becomes 14 one-hot columns that each match a
# single row and are fed to the model as features.
dataFrame = dataFrame.drop(columns=["Day"])

# One-hot encode the categorical features (and the target)
dataFrame = pd.get_dummies(dataFrame)

# Split the dataset into features and target variable.  The target is the
# "Play Tennis_No" indicator, so a positive prediction means "No".
X = dataFrame.drop(["Play Tennis_No", "Play Tennis_Yes"], axis=1)
y = dataFrame["Play Tennis_No"]

# Split the data into train and test sets using stratified sampling
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Create an instance of the Naïve Bayes classifier
model = GaussianNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions for the test set
y_pred = model.predict(X_test)

# Convert the predictions back to original labels
predicted_labels = ["No" if pred == 1 else "Yes" for pred in y_pred]


# Define the test instance using the one-hot column names of the training data
test_instance = {
    "Outlook_Sunny": 0,
    "Outlook_Overcast": 0,
    "Outlook_Rain": 1,
    "Temperature_Hot": 0,
    "Temperature_Mild": 1,
    "Temperature_Cool": 0,
    "Humidity_High": 1,
    "Humidity_Normal": 0,
    "Wind_Weak": 0,
    "Wind_Strong": 1
}

# Convert the test instance to a single-row DataFrame
test_dataframe = pd.DataFrame([test_instance])

# Add any training column the test instance lacks, with value 0
for feature in X_train.columns:
    if feature not in test_dataframe.columns:
        test_dataframe[feature] = 0

# Reorder columns to match the training data
test_dataframe = test_dataframe[X_train.columns]

# Make predictions for the test instance
prediction = model.predict(test_dataframe)[0]

# Convert the prediction back to original label (target was the "No" flag)
predicted_label = "No" if prediction == 1 else "Yes"

print("\nTest Instance:")
for feature, value in test_instance.items():
    if value == 1:
        print("\t\t", f"\t{feature} = 1")

print("\nPrediction:", predicted_label)


## Naive Bayes (Sir)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Resource-usage observations and whether the machine was slow
dataSet = {
    "CPU Usage": ["High", "High", "Low", "High", "Low", "Low", "High", "Low", "Low", "High", "Low", "High", "High", "Low"],
    "Memory Usage": ["High", "Low", "Low", "High", "Low", "High", "Low", "High", "Low", "Low", "High", "Low", "Low", "High"],
    "Disk Usage": ["High", "High", "Low", "High", "Low", "Low", "High", "High", "Low", "Low", "High", "High", "High", "Low"],
    "Network Usage": ["Low", "High", "Low", "Low", "High", "High", "Low", "Low", "High", "Low", "High", "High", "Low", "High"],
    "Is Slow": ["Yes", "Yes", "No", "Yes", "No", "Yes", "No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes"]
}

# One-hot encode every column, then separate the target indicators
raw_frame = pd.DataFrame(dataSet)
dataFrame = pd.get_dummies(raw_frame)
X = dataFrame.drop(columns=["Is Slow_No", "Is Slow_Yes"])
# The target is the "Is Slow_Yes" indicator: a positive prediction means "Yes"
y = dataFrame["Is Slow_Yes"]

# Stratified 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)

# Fit the classifier and label the held-out predictions
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
predicted_labels = ["Yes" if pred != 0 else "No" for pred in y_pred]

# Test instance: only the switched-on indicator columns are listed
test_instance = {
    "CPU Usage_High": 1,
    "Memory Usage_High": 1,
    "Disk Usage_High": 1,
    "Network Usage_Low": 1
}
test_dataframe = pd.DataFrame([test_instance])

# Fill in every training column the instance does not mention with 0
missing_features = set(X_train.columns).difference(test_dataframe.columns)
for feature in missing_features:
    test_dataframe[feature] = 0

# Put the columns in the same order as the training data
test_dataframe = test_dataframe[X_train.columns]

prediction = model.predict(test_dataframe)[0]
predicted_label = "Yes" if prediction != 0 else "No"

print(predicted_label)


## Naive Bayes (Sir 2)

In [None]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB

# The third "age" entry previously read "30...40"; it is the same age group
# as the other "31...40" rows, and the mismatch created a spurious extra
# one-hot column for a single row.
dataSetStudent = {
    "age": ["<=30", "<=30", "31...40", ">40", ">40", ">40", "31...40", "<=30", "<=30", ">40", "<=30", "31...40", "31...40", ">40"],
    "income": ["high", "high", "high", "medium", "low", "low", "low", "medium", "low", "medium", "medium", "medium", "high", "medium"],
    "student": ["no", "no", "no", "no", "yes", "yes", "yes", "no", "yes", "yes", "yes", "no", "yes", "no"],
    "credit_rating": ["fair", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "fair", "fair", "excellent", "excellent", "fair", "excellent"],
    "buys_computer": ["no", "no", "yes", "yes", "yes", "no", "yes", "no", "yes", "yes", "yes", "yes", "yes", "no"]
}

# One-hot encode every column, then separate the target indicators.  The
# target is the "buys_computer_no" flag, so a positive prediction means "no".
dataFrame = pd.get_dummies(pd.DataFrame(dataSetStudent))
features = dataFrame.drop(["buys_computer_no", "buys_computer_yes"], axis=1)
y = dataFrame["buys_computer_no"]

model = GaussianNB()
model.fit(features, y)

# Test instance: only the switched-on indicator columns are listed
test_instance = {
    "age_<=30": 1,
    "income_medium": 1,
    "student_yes": 1,
    "credit_rating_fair": 1
}

test_dataframe = pd.DataFrame([test_instance])

# Add every training column the instance does not mention, with value 0
for feature in features.columns:
    if feature not in test_dataframe.columns:
        test_dataframe[feature] = 0

# Put the columns in the same order as the training data
test_dataframe = test_dataframe[features.columns]

predictions = model.predict(test_dataframe)
# Convert the predictions back to original labels (target was the "no" flag)
predicted_labels = ["no" if pred == 1 else "yes" for pred in predictions]

print(predicted_labels)
