A1

In [7]:
import pandas as pd
# The 14 training examples, written one customer per row:
# (age, income, student, credit_rating, buys_computer)
records = [
    ('<=30', 'high', 'no', 'fair', 'no'),
    ('<=30', 'high', 'no', 'excellent', 'no'),
    ('31…40', 'high', 'no', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'low', 'yes', 'excellent', 'no'),
    ('31…40', 'low', 'yes', 'excellent', 'yes'),
    ('<=30', 'medium', 'no', 'fair', 'no'),
    ('<=30', 'low', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'yes', 'fair', 'yes'),
    ('<=30', 'medium', 'yes', 'excellent', 'yes'),
    ('31…40', 'medium', 'no', 'excellent', 'yes'),
    ('31…40', 'high', 'yes', 'fair', 'yes'),
    ('>40', 'medium', 'no', 'excellent', 'no'),
]
column_names = ['age', 'income', 'student', 'credit_rating', 'buys_computer']

# Transpose the rows back into the column-oriented dict the rest of the notebook uses.
data = {name: list(values) for name, values in zip(column_names, zip(*records))}

df = pd.DataFrame(data)

# Prior P(class) = number of rows with that class / total number of rows.
c_counts = df['buys_computer'].value_counts()
prior_prob = c_counts / len(df)
print("Prior Probabilities for Each Class:", prior_prob, sep="\n")


Prior Probabilities for Each Class:
yes    0.642857
no     0.357143
Name: buys_computer, dtype: float64


A2

In [2]:
def class_conditional_density(feature, feature_value, target_class, data=None, target='buys_computer'):
    """Estimate P(feature == feature_value | target == target_class) by counting rows.

    Parameters
    ----------
    feature : str
        Name of the feature column.
    feature_value :
        Value of ``feature`` whose conditional probability is wanted.
    target_class :
        Class label to condition on.
    data : pandas.DataFrame, optional
        Frame to count over. When omitted, the notebook-level ``df`` is used,
        which is what the original three-argument call did. Passing the frame
        explicitly keeps the result independent of whatever ``df`` currently
        holds in the kernel.
    target : str, default 'buys_computer'
        Name of the class column.

    Returns
    -------
    float
        Relative frequency of ``feature_value`` among the rows of
        ``target_class``; ``0.0`` when no row belongs to ``target_class``.
    """
    frame = df if data is None else data

    in_class = frame[target] == target_class
    denominator = int(in_class.sum())
    if denominator == 0:
        # No rows of this class: report 0.0 (a float, like every other result)
        # instead of dividing by zero.
        return 0.0

    numerator = int((in_class & (frame[feature] == feature_value)).sum())
    return numerator / denominator

features = ['age', 'income', 'student', 'credit_rating']
classes = df['buys_computer'].unique()

# Lazily enumerate every (feature, observed value, class) combination, in the
# order: feature -> value (first-seen order) -> class (first-seen order).
combinations = (
    (feature, feature_value, target_class)
    for feature in features
    for feature_value in df[feature].unique()
    for target_class in classes
)

# Print one conditional probability per combination.
for feature, feature_value, target_class in combinations:
    density = class_conditional_density(feature, feature_value, target_class)
    print(f'P({feature}={feature_value} | buys_computer={target_class}) = {density}')



P(age=<=30 | buys_computer=no) = 0.6
P(age=<=30 | buys_computer=yes) = 0.2222222222222222
P(age=31...40 | buys_computer=no) = 0.0
P(age=31...40 | buys_computer=yes) = 0.4444444444444444
P(age=>40 | buys_computer=no) = 0.4
P(age=>40 | buys_computer=yes) = 0.3333333333333333
P(income=high | buys_computer=no) = 0.4
P(income=high | buys_computer=yes) = 0.2222222222222222
P(income=medium | buys_computer=no) = 0.4
P(income=medium | buys_computer=yes) = 0.4444444444444444
P(income=low | buys_computer=no) = 0.2
P(income=low | buys_computer=yes) = 0.3333333333333333
P(student=no | buys_computer=no) = 0.8
P(student=no | buys_computer=yes) = 0.3333333333333333
P(student=yes | buys_computer=no) = 0.2
P(student=yes | buys_computer=yes) = 0.6666666666666666
P(credit_rating=fair | buys_computer=no) = 0.4
P(credit_rating=fair | buys_computer=yes) = 0.6666666666666666
P(credit_rating=excellent | buys_computer=no) = 0.6
P(credit_rating=excellent | buys_computer=yes) = 0.3333333333333333


A3

In [3]:
from scipy.stats import chi2_contingency

# Contingency table: age (rows) against every observed combination of
# income, student and credit_rating (columns).
other_features = ('income', 'student', 'credit_rating')
contingency_table = pd.crosstab(
    index=df['age'],
    columns=[df[name] for name in other_features],
)

# Only the test statistic and the p-value are used; the degrees of freedom and
# the expected-frequency table returned after them are ignored.
# NOTE(review): the usual guideline for this test is expected counts of at
# least 5 per cell; with only 14 rows spread over this table that cannot hold,
# so treat the p-value as indicative only.
chi2, p = chi2_contingency(contingency_table)[:2]

alpha = 0.05

print(f"Chi2 value: {chi2}")
print(f"P-value: {p}")

# NOTE(review): failing to reject the null hypothesis is absence of evidence
# for dependence, not proof of independence; the message wording is kept as-is.
verdict = (
    "The features are dependent (reject the null hypothesis of independence)."
    if p < alpha
    else "The features are independent (fail to reject the null hypothesis of independence)."
)
print(verdict)


Chi2 value: 12.95
P-value: 0.6764100579553458
The features are independent (fail to reject the null hypothesis of independence).


A4

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Encode a copy: the string-valued `df` is still needed by the earlier cells,
# and leaving it untouched makes this cell give the same result on every re-run.
df_encoded = df.copy()

# One encoder per column, so each column's label <-> integer-code mapping is
# retained (a single shared encoder would only remember the last column fitted).
label_encoder = {column: LabelEncoder() for column in df_encoded.columns}
for column, encoder in label_encoder.items():
    df_encoded[column] = encoder.fit_transform(df_encoded[column])

X = df_encoded.drop('buys_computer', axis=1)
y = df_encoded['buys_computer']

# 20% of 14 rows puts 3 rows in the test split, so the accuracy below moves in
# steps of 1/3 and is only a rough indication.
Tr_X, Te_X, Tr_y, Te_y = train_test_split(X, y, test_size=0.2, random_state=42)

# The features are unordered categories stored as integer codes. CategoricalNB
# estimates a separate probability for each code, whereas MultinomialNB would
# interpret the codes as event counts. `min_categories` tells the model how
# many codes each feature can take, so a category that happens to be absent
# from the small training split can still be scored at prediction time.
category_counts = [len(label_encoder[column].classes_) for column in X.columns]
model = CategoricalNB(min_categories=category_counts)
model.fit(Tr_X, Tr_y)

predictions = model.predict(Te_X)

accuracy = accuracy_score(Te_y, predictions)
print(f'Accuracy: {accuracy}')


Accuracy: 0.6666666666666666


A5

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load the stroke dataset. The path is absolute and hard-coded; point it at
# wherever the CSV lives in your environment.
df = pd.read_csv('/content/healthcare-dataset-stroke-data.csv')

# Force bmi to numeric; entries that cannot be parsed become NaN and are
# filled in by the imputer further down.
df['bmi'] = pd.to_numeric(df['bmi'], errors='coerce')

# Integer-encode the categorical columns, keeping one fitted encoder per column.
label_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
label_encoder = {}
for column in label_columns:
    label_encoder[column] = LabelEncoder()
    df[column] = label_encoder[column].fit_transform(df[column])

# Features: everything except the row identifier and the label.
X = df.drop(columns=['id', 'stroke'])
y = df['stroke']

Tr_X, Te_X, Tr_y, Te_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the column means on the training split only, then apply the same
# means to both splits so nothing from the test split leaks into training.
imputer = SimpleImputer(strategy='mean')
imputer.fit(Tr_X)
Tr_X_imputed = imputer.transform(Tr_X)
Te_X_imputed = imputer.transform(Te_X)

# NOTE(review): MultinomialNB models features as counts; presumably columns
# such as bmi are continuous measurements here - confirm whether a Gaussian or
# categorical Naive Bayes variant would be a better fit.
model = MultinomialNB().fit(Tr_X_imputed, Tr_y)
predictions = model.predict(Te_X_imputed)

accuracy = accuracy_score(Te_y, predictions)
report = classification_report(Te_y, predictions)

print(f'Accuracy: {accuracy}')
print('\nClassification Report:')
print(report)


Accuracy: 0.8052837573385518

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.83      0.89       960
           1       0.13      0.40      0.20        62

    accuracy                           0.81      1022
   macro avg       0.54      0.62      0.54      1022
weighted avg       0.91      0.81      0.85      1022

