In [2]:
from datasets import load_dataset
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# ignore warnings
import warnings

warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn import metrics

  from .autonotebook import tqdm as notebook_tqdm


# Dataset size

- Letter: (20000, 17)
- Covertype: (581012, 55)
- House16: (22784, 17)
- Shuffle: (43500, 10)

# Covertype

In [3]:
dataset = load_dataset("mstz/covertype", "covertype")["train"]

Generating train split: 100%|██████████| 581012/581012 [00:00<00:00, 924288.77 examples/s] 


In [4]:
df = pd.DataFrame(dataset)

In [5]:
df.shape

(581012, 55)

In [6]:
df.head()

Unnamed: 0,elevation,aspect,slope,horizontal_distance_to_hydrology,vertical_distance_to_hydrology,horizontal_distance_to_roadways,hillshade_9am,hillshade_noon,hillshade_3pm,horizontal_distance_to_fire_points,...,soil_type_id_31,soil_type_id_32,soil_type_id_33,soil_type_id_34,soil_type_id_35,soil_type_id_36,soil_type_id_37,soil_type_id_38,soil_type_id_39,cover_type
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,False,False,False,False,False,False,False,False,False,4
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,False,False,False,False,False,False,False,False,False,4
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,False,False,False,False,False,False,False,False,False,1
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,False,False,False,False,False,False,False,False,False,1
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,False,False,False,False,False,False,False,False,False,4


In [8]:
df["cover_type"].value_counts()

cover_type
1    283301
0    211840
2     35754
6     20510
5     17367
4      9493
3      2747
Name: count, dtype: int64

In [9]:
# train logistic regression model

X = df.drop(columns=["cover_type"])
y = df["cover_type"]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
# Define the parameter grid
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")

# Fit GridSearchCV
grid_search.fit(X_train, Y_train)

# Get the best model
model = grid_search.best_estimator_
model.fit(X_train, Y_train)

# make predictions
predictions = model.predict(X_test)

# evaluate model
accuracy = (predictions == Y_test).mean()

# Majority classifier
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [10]:
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

Accuracy: 0.7063500942316463
Majority class accuracy: 0.4876638296773749


In [11]:
print(f"F1 Score : {f1_score(Y_test, predictions, average=None)}")

F1 Score : [0.68523252 0.76533327 0.70501181 0.20211161 0.         0.12639571
 0.52986927]


In [12]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='macro')}")

F1 Score : 0.43056488503672374


In [13]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='micro')}")

F1 Score : 0.7063500942316463


In [14]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='weighted')}")

F1 Score : 0.6896453632597755


In [15]:
print(f"Classification Report : {classification_report(Y_test, predictions)}")

Classification Report :               precision    recall  f1-score   support

           0       0.70      0.67      0.69     42389
           1       0.72      0.81      0.77     56668
           2       0.63      0.79      0.71      7139
           3       0.72      0.12      0.20       570
           4       0.00      0.00      0.00      1918
           5       0.29      0.08      0.13      3494
           6       0.79      0.40      0.53      4025

    accuracy                           0.71    116203
   macro avg       0.55      0.41      0.43    116203
weighted avg       0.69      0.71      0.69    116203



In [16]:
fpr, tpr, thresholds = metrics.roc_curve(Y_test, predictions, pos_label=2)
metrics.auc(fpr, tpr)

0.9183286769544912

# House 16

In [3]:
dataset = load_dataset("mstz/house16", "house16")["train"]

In [4]:
df = pd.DataFrame(dataset)

In [5]:
df.shape

(22784, 17)

In [None]:
df.head()

In [None]:
# train logistic regression model

X = df.drop(columns=["class"])
y = df["class"]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
# Define the parameter grid
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")

# Fit GridSearchCV
grid_search.fit(X_train, Y_train)

# Get the best model
model = grid_search.best_estimator_
model.fit(X_train, Y_train)

# make predictions
predictions = model.predict(X_test)

# evaluate model
accuracy = (predictions == Y_test).mean()
print(f"Accuracy: {accuracy}")

# Majority classifier
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()
print(f"Majority class accuracy: {majority_accuracy}")

# Shuttle

In [9]:
dataset = load_dataset("mstz/shuttle", "shuttle")["train"]

In [10]:
df = pd.DataFrame(dataset)

In [11]:
df.shape

(43500, 10)

In [None]:
df.head()

In [None]:
df["class"].value_counts()

In [None]:
# train logistic regression model

X = df.drop(columns=["class"])
y = df["class"]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
# Define the parameter grid
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")

# Fit GridSearchCV
grid_search.fit(X_train, Y_train)

# Get the best model
model = grid_search.best_estimator_
model.fit(X_train, Y_train)

# make predictions
predictions = model.predict(X_test)

# evaluate model
accuracy = (predictions == Y_test).mean()

# Majority classifier
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [None]:
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average=None)}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='macro')}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='micro')}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='weighted')}")

In [None]:
print(f"Classification Report : {classification_report(Y_test, predictions)}")

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_test, predictions, pos_label=2)
metrics.auc(fpr, tpr)

# Letter

In [6]:
dataset = load_dataset("mstz/letter", "letter")["train"]

In [7]:
df = pd.DataFrame(dataset)

In [8]:
df.shape

(20000, 17)

In [None]:
df.head()

In [None]:
df["letter"].value_counts()

In [49]:
# train logistic regression model

X = df.drop(columns=["letter"])
y = df["letter"]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

model = LogisticRegression()
# Define the parameter grid
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# Initialize GridSearchCV
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")

# Fit GridSearchCV
grid_search.fit(X_train, Y_train)

# Get the best model
model = grid_search.best_estimator_
model.fit(X_train, Y_train)

# make predictions
predictions = model.predict(X_test)

# evaluate model
accuracy = (predictions == Y_test).mean()

# Majority classifier
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [None]:
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average=None)}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='macro')}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='micro')}")

In [None]:
print(f"F1 Score : {f1_score(Y_test, predictions, average='weighted')}")

In [None]:
print(f"Classification Report : {classification_report(Y_test, predictions)}")

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(Y_test, predictions, pos_label=2)
metrics.auc(fpr, tpr)