Using the glass.csv file on Github, answer the following questions-
1)Try different thresholds for computing predictions using 'Al' column. By default it is 0.5. Use predict_proba function to compute probabilities and then try custom thresholds and see their impact on Accuracy, Precision and Recall.
2)Do the same analysis for other columns
3)Fit a Logistic Regression Model on all features. Remember to preprocess data(eg. normalization and one hot encoding).

In [1]:
import pandas as pd

# Load the dataset to understand its structure
file_path = '/content/glass.csv'
glass_data = pd.read_csv(file_path)
glass_data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [2]:
#1
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Define X and y
X = glass_data[['Al']]  # Predictor - Aluminum content
y = (glass_data['Type'] > 1).astype(int)  # Target - Binary classification

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Predict probabilities for the test set
y_probs = model.predict_proba(X_test_scaled)[:, 1]  # Probability estimates for the positive class

# Define a function to calculate metrics at various thresholds
def evaluate_threshold(threshold):
    y_pred = (y_probs >= threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    return accuracy, precision, recall

# Evaluate the model at several thresholds
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
results = {thr: evaluate_threshold(thr) for thr in thresholds}
results


{0.3: (0.6976744186046512, 0.7317073170731707, 0.9375),
 0.4: (0.7209302325581395, 0.7631578947368421, 0.90625),
 0.5: (0.6976744186046512, 0.7567567567567568, 0.875),
 0.6: (0.7441860465116279, 0.8620689655172413, 0.78125),
 0.7: (0.813953488372093, 1.0, 0.75)}

In [3]:
#2
# Define X for Magnesium and re-use the target y
X_Mg = glass_data[['Mg']]  # Predictor - Magnesium content

# Split data for Magnesium content
X_train_Mg, X_test_Mg, y_train, y_test = train_test_split(X_Mg, y, test_size=0.2, random_state=42)

# Standardize Magnesium features
X_train_Mg_scaled = scaler.fit_transform(X_train_Mg)
X_test_Mg_scaled = scaler.transform(X_test_Mg)

# Train a logistic regression model on Magnesium
model_Mg = LogisticRegression(random_state=42)
model_Mg.fit(X_train_Mg_scaled, y_train)

# Predict probabilities for the test set based on Magnesium
y_probs_Mg = model_Mg.predict_proba(X_test_Mg_scaled)[:, 1]  # Probability estimates for the positive class

# Evaluate the model at several thresholds for Magnesium
results_Mg = {thr: evaluate_threshold(thr) for thr in thresholds}
results_Mg


{0.3: (0.6976744186046512, 0.7317073170731707, 0.9375),
 0.4: (0.7209302325581395, 0.7631578947368421, 0.90625),
 0.5: (0.6976744186046512, 0.7567567567567568, 0.875),
 0.6: (0.7441860465116279, 0.8620689655172413, 0.78125),
 0.7: (0.813953488372093, 1.0, 0.75)}

In [4]:
#3
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline

# Define X and y for the full model
X_full = glass_data.drop('Type', axis=1)  # All features except the target
y_full = glass_data['Type']  # Multi-class target

# Split data into train and test sets
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(X_full, y_full, test_size=0.2, random_state=42)

# Create a pipeline for logistic regression with preprocessing
pipeline = make_pipeline(StandardScaler(), LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=42))
pipeline.fit(X_train_full, y_train_full)

# Predict on the test set
y_pred_full = pipeline.predict(X_test_full)

# Evaluate the model
report = classification_report(y_test_full, y_pred_full)
report


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           1       0.69      0.82      0.75        11\n           2       0.62      0.71      0.67        14\n           3       0.00      0.00      0.00         3\n           5       1.00      0.50      0.67         4\n           6       1.00      0.67      0.80         3\n           7       0.80      1.00      0.89         8\n\n    accuracy                           0.72        43\n   macro avg       0.69      0.62      0.63        43\nweighted avg       0.69      0.72      0.69        43\n'

In [5]:
print(report)

              precision    recall  f1-score   support

           1       0.69      0.82      0.75        11
           2       0.62      0.71      0.67        14
           3       0.00      0.00      0.00         3
           5       1.00      0.50      0.67         4
           6       1.00      0.67      0.80         3
           7       0.80      1.00      0.89         8

    accuracy                           0.72        43
   macro avg       0.69      0.62      0.63        43
weighted avg       0.69      0.72      0.69        43

