<a href="https://colab.research.google.com/github/omsolanki/IITJ-ML-Ops/blob/main/ML_Ops_LAB_Practice_1_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load a dataset

In [None]:
import pandas as pd
import numpy as np

# Constants
# Column names assigned to the header-less Iris CSV
SEPAL_LENGTH = 'sepal_length'
SEPAL_WIDTH = 'sepal_width'
PETAL_LENGTH = 'petal_length'
PETAL_WIDTH = 'petal_width'
CLASS = 'class'
# Single seed used by every stochastic step below
SEED_VALUE = 42

# Load the original Iris dataset (the file has no header row, so names are supplied)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
column_names = [SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH, CLASS]
iris_df = pd.read_csv(url, header=None, names=column_names)
print('original dataset: ',iris_df.shape)

# Introduce missing values
# np.random.choice samples with replacement by default, so the same row can be
# drawn twice and a column may receive fewer NaNs than the requested size.
np.random.seed(SEED_VALUE)
iris_df.loc[np.random.choice(iris_df.index, size=10), SEPAL_LENGTH] = np.nan
iris_df.loc[np.random.choice(iris_df.index, size=12), SEPAL_WIDTH] = np.nan
iris_df.loc[np.random.choice(iris_df.index, size=15), PETAL_LENGTH] = np.nan
iris_df.loc[np.random.choice(iris_df.index, size=18), PETAL_WIDTH] = np.nan

# Introduce duplicates: append 10 sampled rows to the end of the frame
duplicates = iris_df.sample(10, random_state=SEED_VALUE)
iris_df = pd.concat([iris_df, duplicates], ignore_index=True)

# Introduce outliers
# The right-hand side is the whole scaled column; the .loc assignment aligns on
# the index, so only the randomly selected rows are overwritten.
iris_df.loc[np.random.choice(iris_df.index, size=5), PETAL_LENGTH] = iris_df[PETAL_LENGTH] * 3
iris_df.loc[np.random.choice(iris_df.index, size=8), SEPAL_WIDTH] = iris_df[SEPAL_WIDTH] * 4

# Display the shape of the modified dataset
print('modified dataset: ',iris_df.shape)

original dataset:  (150, 5)
modified dataset:  (160, 5)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

X = iris_df[[SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH]]
y = iris_df[CLASS]

# Split BEFORE imputing so that the test rows never contribute to the
# imputation statistics (fitting the imputer on the full data leaks test
# information into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED_VALUE)

# Handle missing values with mean imputation: the column means are learned
# from the training rows only and then reused unchanged for the test rows.
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Displaying the resulting shapes
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)
print("Dataset uncleaned:")
print(iris_df.describe())


Training set shape: (128, 4) (128,)
Testing set shape: (32, 4) (32,)
Dataset uncleaned:
       sepal_length  sepal_width  petal_length  petal_width
count    151.000000   148.000000    146.000000   143.000000
mean       5.862914     3.568243      4.017123     1.225175
std        0.835752     2.250080      2.457511     0.760881
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.350000
50%        5.800000     3.000000      4.400000     1.300000
75%        6.400000     3.400000      5.100000     1.800000
max        7.900000    16.000000     19.800000     2.500000


# Train on the Uncleaned Dataset

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline: fit a logistic-regression classifier on the uncleaned training split.
# max_iter is set to 200 for the solver's iteration budget.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Predict the held-out rows and score them against the true labels
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the baseline accuracy (reused later for the comparison)
print(f"Accuracy on the original, uncleaned dataset: {accuracy:.4f}")

Accuracy on the original, uncleaned dataset: 1.0000


# Apply Data Cleaning Techniques

In [None]:
# Step 1: Remove Duplicates
# .copy() makes iris_cleaned an independent frame, so the column assignment in
# Step 2 no longer raises pandas' SettingWithCopyWarning.
iris_cleaned = iris_df.drop_duplicates().copy()

# Step 2: Handle Missing Values (imputing with the mean)
feature_columns = [SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH]
imputer = SimpleImputer(strategy='mean')
iris_cleaned[feature_columns] = imputer.fit_transform(iris_cleaned[feature_columns])

# Step 3: Handle Outliers (using IQR to remove outliers)
# NOTE(review): only PETAL_LENGTH is filtered here; the SEPAL_WIDTH values that
# were multiplied by 4 when the dataset was built are left in place.
Q1 = iris_cleaned[[PETAL_LENGTH]].quantile(0.25)
Q3 = iris_cleaned[[PETAL_LENGTH]].quantile(0.75)
IQR = Q3 - Q1

# Define a threshold to identify outliers (1.5 * IQR beyond the quartiles)
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers: keep rows whose petal length lies inside [lower, upper]
iris_cleaned = iris_cleaned[~((iris_cleaned[PETAL_LENGTH] < lower_bound[PETAL_LENGTH]) |
                              (iris_cleaned[PETAL_LENGTH] > upper_bound[PETAL_LENGTH]))]

# Display the cleaned dataset
print("Dataset after cleaning:")
print(iris_cleaned.describe())


Dataset after cleaning:
       sepal_length  sepal_width  petal_length  petal_width
count    146.000000   146.000000    146.000000   146.000000
mean       5.824936     3.640841      3.761101     1.202693
std        0.811096     2.250352      1.664886     0.727780
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.400000
50%        5.818929     3.100000      4.050741     1.211364
75%        6.400000     3.575000      5.000000     1.800000
max        7.900000    16.000000      6.900000     2.500000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iris_cleaned[[SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH]] = imputer.fit_transform(


# Train the Logistic Regression Model on the Cleaned Dataset

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Splitting the cleaned dataset into features and target
X_clean = iris_cleaned[[SEPAL_LENGTH, SEPAL_WIDTH, PETAL_LENGTH, PETAL_WIDTH]]
y_clean = iris_cleaned[CLASS]

# Use the shared SEED_VALUE (as the uncleaned split does) instead of a
# hard-coded 42, so both experiments are controlled by one constant.
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=SEED_VALUE)

# Training the Logistic Regression model on the cleaned dataset
model_clean = LogisticRegression(max_iter=200)
model_clean.fit(X_train_clean, y_train_clean)

# Making predictions on the held-out cleaned rows
y_pred_clean = model_clean.predict(X_test_clean)

# Calculating accuracy
accuracy_clean = accuracy_score(y_test_clean, y_pred_clean)

# Displaying the accuracy
print(f"Accuracy on the cleaned dataset: {accuracy_clean:.4f}")

# Print comparison of accuracies (`accuracy` comes from the uncleaned run)
print(f"Accuracy on the uncleaned dataset: {accuracy:.4f}")
print(f"Accuracy on the cleaned dataset: {accuracy_clean:.4f}")



Accuracy on the cleaned dataset: 0.9000
Accuracy on the uncleaned dataset: 1.0000
Accuracy on the cleaned dataset: 0.9000


**Uncleaned Dataset:**

*   Perfect Accuracy (1.0000)
*   The model performed perfectly, which likely indicates overfitting.
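*   It learned too much from the noise and irregularities in the data, including outliers and duplicates. In particular, the duplicated rows can end up in both the training and the test split, so part of the test set has already been seen during training and the score is inflated.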

**Cleaned Dataset:**

*   Lower Accuracy (0.9000)
*   After cleaning the data, the model's accuracy dropped.
*   This suggests the model became less focused on the noise and more on the actual patterns in the data.

**Impact of Cleaning**

*   The cleaned dataset led to a model that probably generalizes better to new data, even though the immediate test accuracy was lower.
*   A model trained on clean data is more likely to perform well on new, unseen data, which is the ultimate goal in most real-world applications.

**Conclusion:**

*   Higher Accuracy Isn’t Always Better
*   The cleaned dataset’s accuracy, though lower, is likely more realistic and reliable.
