# **Task-01**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# Task-01: compare L1, L2 and ElasticNet regularisation for logistic
# regression on the heart-disease data, reporting train/test accuracy.
heart_df = pd.read_csv('heart.csv')

print("Dataset shape:", heart_df.shape)
print(heart_df.head())
# Check for missing values
print("\nMissing values per column:\n", heart_df.isnull().sum())

# Features and target
X = heart_df.drop("target", axis=1)
y = heart_df["target"].astype(int)  # convert to integer for classification

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the test accuracy optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

penalties = ["l1", "l2", "elasticnet"]
results = []

for pen in penalties:
    print(f"\nTraining Logistic Regression with penalty = {pen}")

    # lbfgs (the default solver) does not support l1/elasticnet, so those
    # penalties are trained with saga instead.
    solver = "saga" if pen in ("l1", "elasticnet") else "lbfgs"

    # ElasticNet additionally needs l1_ratio (the L1/L2 mixing weight);
    # the other penalties are built without it.
    extra_params = {"l1_ratio": 0.5} if pen == "elasticnet" else {}

    model = LogisticRegression(
        penalty=pen,
        solver=solver,
        max_iter=5000,
        random_state=42,
        **extra_params
    )

    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_acc = accuracy_score(y_train, y_train_pred)
    test_acc = accuracy_score(y_test, y_test_pred)

    results.append([pen, train_acc, test_acc])
    print(f"Training Accuracy: {train_acc:.4f}, Testing Accuracy: {test_acc:.4f}")

# Create a results dataframe
df_results = pd.DataFrame(results, columns=["Penalty", "Training Accuracy", "Testing Accuracy"])
print("\nComparison of Different Penalties:")
print(df_results)

Dataset shape: (1025, 14)
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  

Missing values per column:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Training Logistic Regression with penalty = l1
Training Accuracy: 0.8647, Testing Accuracy: 0.8182

Training Logisti


**Notes / Observations**
- L1 and ElasticNet require solver='saga'.
- ElasticNet requires l1_ratio parameter to combine L1 and L2 penalties.
- The lbfgs solver supports only the l2 penalty (or no penalty); it cannot be used with l1 or elasticnet.
- max_iter is increased to 5000 to ensure convergence.


# **TASK - 02**

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Task-02 (Iris): train logistic regression with every available solver and
# compare train/test accuracy.
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

# Split BEFORE scaling so the test rows never contribute to the scaler's
# mean/std (fitting on the full dataset leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42, stratify=y_iris
)

# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

solvers = ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
solver_results = []

for solver in solvers:
    try:
        # multi_class is intentionally left at its default: 'auto' was the
        # default value, and passing the parameter explicitly is deprecated
        # in recent scikit-learn releases. Once it is removed, passing it
        # would raise inside this try block and silently turn every row
        # into None.
        model = LogisticRegression(solver=solver, max_iter=5000, random_state=42)
        model.fit(X_train, y_train)

        train_acc = accuracy_score(y_train, model.predict(X_train))
        test_acc = accuracy_score(y_test, model.predict(X_test))

        solver_results.append([solver, train_acc, test_acc])

    except Exception as e:
        # Keep a placeholder row so a solver that cannot handle this problem
        # still appears in the comparison table.
        solver_results.append([solver, None, None])
        print(f"Solver {solver} encountered an error: {e}")

df_solver_results = pd.DataFrame(
    solver_results, columns=["Solver", "Training Accuracy", "Testing Accuracy"]
)

print("\nComparison of Solvers on Iris Dataset:")
print(df_solver_results)


Comparison of Solvers on Iris Dataset:
            Solver  Training Accuracy  Testing Accuracy
0            lbfgs           0.980952          0.911111
1        liblinear           0.933333          0.800000
2        newton-cg           0.980952          0.911111
3  newton-cholesky           0.980952          0.911111
4              sag           0.980952          0.911111
5             saga           0.980952          0.911111




The solver affects convergence speed and sometimes accuracy.
'liblinear' is good for small datasets, but does not support all multi-class options.
'lbfgs', 'newton-cg', and 'saga' are robust for small to medium datasets and multi-class problems.
'sag' and 'saga' are faster for large datasets.
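In this case (Iris dataset is small), lbfgs, newton-cg, newton-cholesky, sag and saga all reach the same accuracy (0.981 train / 0.911 test); only liblinear is lower (0.933 train / 0.800 test). lbfgs or newton-cg are therefore stable, accurate choices here.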

Conclusion: For small datasets, lbfgs or newton-cg is stable and performs well.

In [None]:
# Task-02 (Heart Disease): repeat the solver comparison on the heart data.
# NOTE: this cell reuses the `solvers` list defined in the Iris cell above,
# so that cell must be run first.
heart_df = pd.read_csv('heart.csv')

# Features and target
X_heart = heart_df.drop("target", axis=1)
y_heart = heart_df["target"].astype(int)

# Train-test split is done BEFORE scaling so the scaler statistics are not
# computed from test rows (which would leak test information into training).
X_train_h_raw, X_test_h_raw, y_train_h, y_test_h = train_test_split(
    X_heart, y_heart, test_size=0.3, random_state=42, stratify=y_heart
)

# Scale features: fit on the training split only, then transform both splits.
scaler = StandardScaler()
X_train_h = scaler.fit_transform(X_train_h_raw)
X_test_h = scaler.transform(X_test_h_raw)

heart_solver_results = []

for solver in solvers:
    try:
        model = LogisticRegression(solver=solver, max_iter=5000, random_state=42)
        model.fit(X_train_h, y_train_h)
        train_acc = accuracy_score(y_train_h, model.predict(X_train_h))
        test_acc = accuracy_score(y_test_h, model.predict(X_test_h))
        heart_solver_results.append([solver, train_acc, test_acc])
    except Exception as e:
        # Keep a placeholder row so a failing solver still shows up in the table.
        heart_solver_results.append([solver, None, None])
        print(f"Solver {solver} error on Heart dataset: {e}")

df_heart_results = pd.DataFrame(
    heart_solver_results, columns=["Solver", "Training Accuracy", "Testing Accuracy"]
)

print("\nComparison of Solvers on Heart Disease Dataset:")
print(df_heart_results)



Comparison of Solvers on Heart Disease Dataset:
            Solver  Training Accuracy  Testing Accuracy
0            lbfgs           0.864714          0.818182
1        liblinear           0.864714          0.818182
2        newton-cg           0.864714          0.818182
3  newton-cholesky           0.864714          0.818182
4              sag           0.864714          0.818182
5             saga           0.864714          0.818182


Heart Disease dataset is larger than Iris dataset (1025 rows vs 150 rows).
Solvers like 'sag' and 'saga' are optimized for larger datasets and often converge faster.
Small datasets like Iris work well with 'lbfgs' and 'newton-cg'.
Accuracy difference between solvers is minor for small datasets but can affect convergence time for larger datasets.
In the Heart Disease dataset, solver choice may affect training time (which was not measured here); accuracy is identical for all six solvers, although liblinear may converge more slowly on larger data.

Conclusion: Dataset is not large enough to show solver differences. All solvers converge to similar solution.

1. Briefly discuss the effect of solver on your dataset:
The solver in logistic regression decides how the model finds the best coefficients for predicting the target. On the Iris dataset, which is small, most solvers like lbfgs, newton-cg, and saga gave very high accuracy, but liblinear was a bit lower because it uses a one-vs-rest method for multi-class problems, which is not as stable for very small datasets. On the Heart Disease dataset, which is a medium-sized binary dataset, all solvers gave almost the same accuracy. This shows that for small to medium datasets, solver choice does not change accuracy much, but it can affect how fast the model learns.

2. Have you found the similarity as mentioned by Sklearn that which solver is best for small, medium or larger dataset:
Yes, the results are similar to what Sklearn recommends. For small datasets like Iris, solvers such as lbfgs, newton-cg, and newton-cholesky work well because they are stable and handle multi-class problems. Solvers like sag and saga are designed for larger datasets and use stochastic methods to converge faster. Our Heart Disease dataset is a medium-sized dataset, and all solvers gave similar results, which matches Sklearn's explanation that solver mainly affects speed for bigger datasets but not accuracy for small or medium datasets.

3. Which solver is best in your case and why:
For the Iris dataset, lbfgs or newton-cg is the best because they gave the highest accuracy and are stable for multi-class problems on small datasets. For the Heart Disease dataset, which is slightly larger, lbfgs or saga works well because they converge reliably and handle binary classification without issues. In general, the best solver depends on the size and type of dataset, but for these datasets, lbfgs is a safe and reliable choice.

4. Now copy this file and apply a new dataset (Heart Disease) and compare, does it really get affected by the size of dataset:
When we applied the same solvers to the Heart Disease dataset, which is bigger than Iris, we saw that accuracy stayed almost the same for all solvers. This shows that for small to medium datasets, the solver does not really affect accuracy much. What changes more is how fast the model reaches the final solution. For very large datasets, solvers like sag and saga would be better because they can learn faster without using too much memory. So yes, the dataset size can affect which solver is faster, but for these datasets, accuracy was not affected by the size.

# **TASK-03**

In [None]:
from sklearn.linear_model import Perceptron

# Task-03: compare Logistic Regression with a Perceptron.
# NOTE: X_train / X_test / y_train / y_test are not created in this cell;
# they are the Iris split produced by the Task-02 Iris cell, which must be
# run before this one.

# Logistic Regression
# multi_class is left at its default: 'auto' was the default value and
# passing the parameter explicitly is deprecated in recent scikit-learn.
lr_model = LogisticRegression(solver='lbfgs', max_iter=5000, random_state=42)
lr_model.fit(X_train, y_train)
lr_train_acc = accuracy_score(y_train, lr_model.predict(X_train))
lr_test_acc = accuracy_score(y_test, lr_model.predict(X_test))

# Perceptron
perceptron_model = Perceptron(max_iter=5000, tol=1e-3, random_state=42)
perceptron_model.fit(X_train, y_train)
perceptron_train_acc = accuracy_score(y_train, perceptron_model.predict(X_train))
perceptron_test_acc = accuracy_score(y_test, perceptron_model.predict(X_test))

# Comparison: one row per model, train and test accuracy side by side.
comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Perceptron"],
    "Training Accuracy": [lr_train_acc, perceptron_train_acc],
    "Testing Accuracy": [lr_test_acc, perceptron_test_acc]
})

print("\nComparison of Logistic Regression vs Perceptron:")
print(comparison)



Comparison of Logistic Regression vs Perceptron:
                 Model  Training Accuracy  Testing Accuracy
0  Logistic Regression           0.980952          0.911111
1           Perceptron           0.895238          0.844444




Perceptron and Logistic Regression are both used for classification, but they work differently. A Perceptron is a simple model that makes yes or no predictions by calculating a weighted sum of the inputs and applying a step function, so it cannot give probabilities and only works well if the classes are linearly separable. Logistic Regression also uses a weighted sum of inputs but passes it through a sigmoid function to output a probability between 0 and 1, which allows it to handle overlapping classes better. Logistic Regression uses gradient-based methods to find the best weights and can include regularization to prevent overfitting, making it more robust than the Perceptron.
