In [14]:
from sklearn import datasets
from sklearn import preprocessing

covtype = datasets.fetch_covtype()
X = preprocessing.scale(covtype.data)
y = covtype.target

In [24]:
print(covtype.DESCR)

.. _covtype_dataset:

Forest covertypes
-----------------

The samples in this dataset correspond to 30×30m patches of forest in the US,
collected for the task of predicting each patch's cover type,
i.e. the dominant species of tree.
There are seven covertypes, making this a multiclass classification problem.
Each sample has 54 features, described on the
`dataset's homepage <https://archive.ics.uci.edu/ml/datasets/Covertype>`__.
Some of the features are boolean indicators,
while others are discrete or continuous measurements.

**Data Set Characteristics:**

    Classes                        7
    Samples total             581012
    Dimensionality                54
    Features                     int

:func:`sklearn.datasets.fetch_covtype` will load the covertype dataset;
it returns a dictionary-like object
with the feature matrix in the ``data`` member
and the target values in ``target``.
The dataset will be downloaded from the web if necessary.



Batch logistic regression.

In [22]:
import time
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection

tic = time.time()

model = linear_model.LogisticRegression(solver='sag', multi_class='auto', n_jobs=-1)
cv = model_selection.KFold(n_splits=8, shuffle=True, random_state=42)
y_pred = np.empty(shape=(len(X), cv.n_splits))  # stores the predictions

for fold, (train_idx, test_idx) in enumerate(cv.split(X[:3000], y[:3000])):
    
    # Split train from test
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    
    # Train, then predict
    model.fit(X_train, y_train)
    y_pred[test_idx, fold] = model.predict(X_test)
    
toc = time.time()
    
# Calculate the accuracy for each fold
scores = [metrics.accuracy_score(y, y_pred[:, fold]) for fold in range(cv.n_splits)]

print(f'Accuracy: {np.mean(scores):.2%} ± {np.std(scores):.2%} (took {toc - tic:.3f} seconds)')



Accuracy: 0.05% ± 0.00% (took 3.633 seconds)


In [21]:
y_train.shape

(2625,)

In [3]:
scores = [metrics.accuracy_score(y_test, y_pred[:, fold]) for fold in range(cv.n_splits)]

[0;31mInit signature:[0m
[0mlinear_model[0m[0;34m.[0m[0mLogisticRegression[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpenalty[0m[0;34m=[0m[0;34m'l2'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdual[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mC[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfit_intercept[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mintercept_scaling[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mclass_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msolver[0m[0;34m=[0m[0;34m'warn'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmulti_class[0m[0;34m=[0m[0;34m'warn'[0m[0;3