使用MNIST数据集的灰度直方图作为特征，使用LogisticRegression，decisionTree，SVM模型分别对数据集分类并查看测试集的评分

In [72]:
import numpy as np
import sklearn
from sklearn.datasets import fetch_mldata
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import learning_curve
from sklearn.svm import LinearSVC
from sklearn import svm
import random

mnist = fetch_mldata('MNIST-original', data_home='./dataSet')



In [73]:
x, y = mnist['data'], mnist['target']

In [74]:
print(x.shape)
print(y.shape)

(70000, 784)
(70000,)


In [110]:
train_img, test_img, train_label, test_label = train_test_split(x, y, test_size=1/7.0, shuffle=True)

In [111]:
print(train_img.shape)
print(train_label.shape)

(60000, 784)
(60000,)


In [112]:
train_img[0,:].reshape(28, 28)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  14, 254, 195,
        118, 118,  32,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  14, 179, 253, 253,
        253, 253, 239, 234, 153,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0],
       [  

In [113]:
train_fea = np.zeros(train_img.shape)
for i in range(len(train_img)):
    for j in range(0, 256):
        count = (train_img[i,:] == j).sum()
        train_fea[i,j] = count
train_fea

array([[556.,   0.,   0., ...,   0.,   0.,   0.],
       [637.,   0.,   0., ...,   0.,   0.,   0.],
       [639.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [579.,   1.,   0., ...,   0.,   0.,   0.],
       [628.,   0.,   1., ...,   0.,   0.,   0.],
       [665.,   1.,   3., ...,   0.,   0.,   0.]])

In [114]:
np.sum(train_fea, axis=1)

array([784., 784., 784., ..., 784., 784., 784.])

In [115]:
test_fea = np.zeros(test_img.shape)
for i in range(0, len(test_img)):
    for j in range(0, 256):
        count = (train_img[i,:] == j).sum()
        test_fea[i,j] = count
test_fea

array([[556.,   0.,   0., ...,   0.,   0.,   0.],
       [637.,   0.,   0., ...,   0.,   0.,   0.],
       [639.,   0.,   0., ...,   0.,   0.,   0.],
       ...,
       [579.,   1.,   0., ...,   0.,   0.,   0.],
       [653.,   0.,   0., ...,   0.,   0.,   0.],
       [640.,   2.,   0., ...,   0.,   0.,   0.]])

In [116]:
np.sum(test_fea, axis=1)

array([784., 784., 784., ..., 784., 784., 784.])

In [117]:
# LogisticRegression
clf = LogisticRegression()

In [118]:
clf.fit(train_fea, train_label)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [119]:
pre = clf.predict(test_fea)

In [120]:
acc = metrics.accuracy_score(pre, test_label)
print('acc:%.2f %%' %(100 * acc))

acc:10.03 %


In [121]:
#  Decision Tree
clf2 = tree.DecisionTreeClassifier()

In [122]:
clf2.fit(train_fea, train_label)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [124]:
pre = clf2.predict(test_fea)
acc = metrics.accuracy_score(pre, test_label)
print('acc:%.2f%%'%(100 * acc))

acc:9.80%


In [125]:
clf3 = LinearSVC(C=1.0)

In [126]:
clf3.fit(train_fea, train_label)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [127]:
pre = clf3.predict(test_fea)
acc = metrics.accuracy_score(pre, test_label)
print('acc:%.2f%%'%(100 * acc))

acc:9.56%
