Title: Train a Classification Model (Logistic Regression)

Task 1: Email Spam Detection<br>
Dataset: Use a dataset containing labeled emails as spam or not-spam, with features such as word frequency.

In [5]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset (replace with actual dataset path)
import kagglehub

# Download the dataset from Kaggle
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")
print("Path to dataset files:", path)

# Load the dataset
data = pd.read_csv(f"{path}/spam.csv", encoding='latin-1')
data = data.rename(columns={"v1": "label", "v2": "text"})  # Rename columns for clarity
data = data[["text", "label"]]  # Keep only relevant columns

# Convert labels to binary (spam = 1, not-spam = 0)
data["label"] = data["label"].map({"spam": 1, "ham": 0})

# Check for missing values
if data.isnull().sum().any():
    print("Missing values detected. Please clean the dataset.")
else:
    print("No missing values detected.")

# Split the dataset into features (X) and target (y)
X = data["text"]
y = data["label"]

# Convert text data into numerical features using CountVectorizer
vectorizer = CountVectorizer(stop_words="english")
X_vectorized = vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increased max_iter to ensure convergence

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Path to dataset files: /home/vscode/.cache/kagglehub/datasets/uciml/sms-spam-collection-dataset/versions/1
No missing values detected.

Accuracy: 0.97847533632287

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[965   0]
 [ 24 126]]


Task 2: Predicting Tumor Malignancy<br>
Dataset: Use a dataset that contains features from tumor datasets such as size and shape indicators.

In [6]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load the dataset (replace with actual dataset path)
# Example dataset: Breast Cancer Wisconsin Dataset from sklearn
from sklearn.datasets import load_breast_cancer

# Load the dataset
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)  # Features
y = pd.Series(data.target, name="target")  # Target (0 = malignant, 1 = benign)

# Check for missing values
if X.isnull().sum().any():
    print("Missing values detected. Please clean the dataset.")
else:
    print("No missing values detected.")

# Feature scaling (optional but recommended for Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increased max_iter to ensure convergence

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

No missing values detected.

Accuracy: 0.9736842105263158

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114


Confusion Matrix:
[[41  2]
 [ 1 70]]


Task 3: Wine Quality Classification<br>
Dataset: Use a dataset that contains chemical properties of wine along with a quality rating.
Treat quality as a binary classification (high vs. low).

In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import kagglehub

# Download the dataset from Kaggle
path = kagglehub.dataset_download("uciml/red-wine-quality-cortez-et-al-2009")
print("Path to dataset files:", path)

# Load the dataset
data = pd.read_csv(f"{path}/winequality-red.csv", sep=";")

# Check if the dataset contains the 'quality' column
if "quality" not in data.columns:
    raise ValueError("The dataset does not contain the 'quality' column. Please check the dataset.")

# Convert quality into binary classification (high vs. low)
# Assuming quality >= 7 is "high" (1) and < 7 is "low" (0)
data["quality"] = (data["quality"] >= 7).astype(int)

# Feature Engineering: Create a new feature based on acidity levels
def categorize_acidity(fixed_acidity):
    if fixed_acidity < 7:
        return "low"
    elif 7 <= fixed_acidity <= 10:
        return "medium"
    else:
        return "high"

# Apply the function to create the new feature
data["acidity_level"] = data["fixed acidity"].apply(categorize_acidity)

# Convert the categorical feature into numerical values using one-hot encoding
data = pd.get_dummies(data, columns=["acidity_level"], drop_first=True)

# Split the dataset into features (X) and target (y)
X = data.drop("quality", axis=1)  # Features
y = data["quality"]  # Target

# Check for missing values
if X.isnull().sum().any():
    print("Missing values detected. Please clean the dataset.")
else:
    print("No missing values detected.")

# Feature scaling (optional but recommended for Logistic Regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increased max_iter to ensure convergence

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Path to dataset files: /home/vscode/.cache/kagglehub/datasets/uciml/red-wine-quality-cortez-et-al-2009/versions/2


ValueError: The dataset does not contain the 'quality' column. Please check the dataset.