In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz"
column_names = [f'feature_{i}' for i in range(54)] + ['label']
data = pd.read_csv(url, header=None, names=column_names)

# データを特徴量とラベルに分割
X = data.drop('label', axis=1)
y = data['label']

# データを訓練セットとテストセットに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# データのスケーリング
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = [
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=10000)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]


# 各モデルの交差検証と性能指標の計算
for name, model in models:
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    print(f"{name} - Mean accuracy: {cv_scores.mean()}, Std: {cv_scores.std()}")


Logistic Regression - Mean accuracy: 0.725454134734338, Std: 0.0014464903155140556
K-Nearest Neighbors - Mean accuracy: 0.9184869745948088, Std: 0.0008018152906637357
Random Forest - Mean accuracy: 0.9473381397993206, Std: 0.0004078796893618986
Gradient Boosting - Mean accuracy: 0.7729329230777646, Std: 0.0018131356337330673


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# モデルをインスタンス化
models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('SVM', SVC(random_state=42)),
    ('KNN', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42))
]

# 各モデルの交差検証と性能指標の計算
for name, model in models:
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name} - Mean accuracy: {cv_scores.mean()}, Std: {cv_scores.std()}")

# 最適なモデルの選択
best_model = max(models, key=lambda x: cross_val_score(x[1], X_train, y_train, cv=5, scoring='accuracy').mean())[1]

# 最適なモデルでの学習と予測
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

# 結果の表示
print("Best model:", best_model.__class__.__name__)
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression - Mean accuracy: 0.6191149546017878, Std: 0.002549062688604631
