In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy import stats

# --- e. Data Cleaning ---
# Load the raw CSV. The code below treats every column except the last one
# as a numeric feature; the last column ('variety') is left as-is.
df = pd.read_csv("/Users/akshay/Desktop/dsbda_practical/newdata/iris.csv")

# Treat the '?' placeholder as a missing value
df = df.replace('?', np.nan)

# Convert to numeric where applicable. This runs BEFORE dropna() on purpose:
# errors='coerce' turns unparseable entries into NaN, and those rows must be
# caught by the explicit NA removal below instead of slipping past it.
for col in df.columns[:-1]:  # skip 'variety'
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Remove rows with any missing value (original NA, '?', or failed conversion)
df = df.dropna()

# Remove rows with any negative values. .copy() makes the result an
# independent frame rather than a slice tied to the pre-filter data.
df = df[(df.select_dtypes(include=[np.number]) >= 0).all(axis=1)].copy()

print("✅ Cleaned Data Sample:")
print(df.head())

# --- f. Error Correcting (Outlier Detection and Removal) ---
# Z-score rule: a row survives only if the absolute z-score of every one of
# its numeric columns is below 3.
numeric_df = df.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_df))

# One boolean per row: True when no numeric column is flagged as an outlier
inlier_rows = (z_scores < 3).all(axis=1)
df = df[inlier_rows]

# Header line followed by the summary statistics of the remaining rows
print("\n✅ After Removing Outliers:", df.describe(), sep="\n")

# --- g. Data Transformation ---
# Encode target labels as integers. assign() returns a new frame instead of
# writing a column into the frame produced by the earlier boolean filtering,
# which avoids pandas' SettingWithCopyWarning.
le = LabelEncoder()
df = df.assign(variety_encoded=le.fit_transform(df['variety']))

# Separate features from the target; both label columns are excluded from X
X = df.drop(['variety', 'variety_encoded'], axis=1)
y = df['variety_encoded']

# Normalize features (the scaler here is fit on every row of X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("\n✅ Transformed Features:")
print(pd.DataFrame(X_scaled, columns=X.columns).head())

# --- h. Build Model using Regression and Naïve Bayes ---

# Split the UNSCALED features first. Same test_size and random_state as
# before, so the same rows land in the train and test partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the
# test rows. Fitting on the full data set would leak test-set mean/variance
# into training and bias the reported accuracy.
split_scaler = StandardScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Logistic Regression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

# --- Results ---
# Accuracy is the fraction of held-out rows classified correctly
print("\n✅ Accuracy Comparison:")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"Naive Bayes Accuracy: {nb_acc * 100:.2f}%")

✅ Cleaned Data Sample:
   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa

✅ After Removing Outliers:
       sepal.length  sepal.width  petal.length  petal.width
count    149.000000   149.000000    149.000000   149.000000
mean       5.844295     3.048322      3.773154     1.204698
std        0.830775     0.423085      1.761435     0.761962
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.400000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.200000      6.900000     2.500000

✅ Transformed Features:
