# 训练svm模型并保存 Train the svm model and save it

In [None]:
from PIL import Image
import os
import sys
import numpy as np
import time
from sklearn import svm
import joblib
 
# 获取指定路径下的所有 .png 文件 Get all .png files in the specified path
def get_file_list(path):
    """Return the paths of all ``.png`` entries directly inside *path*.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(path, name)`` strings for every entry whose
        name ends with ``".png"``, sorted so the order is reproducible.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # sample order (and therefore the training matrix) deterministic.
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if name.endswith(".png")
    )
 
# 解析出 .png 图片文件的名称 Parse the name of the .png image file
def get_img_name_str(imgPath):
    """Return the part of *imgPath* that follows the last ``os.path.sep``.

    If *imgPath* contains no separator, it is returned unchanged.
    """
    _parent, _sep, file_name = imgPath.rpartition(os.path.sep)
    return file_name
 
  
# 将 28px * 28px 的图像数据转换成 1*784 的 numpy 向量 Convert 28px * 28px image data into 1*784 numpy vector
# 参数：imgFile--图像名  如：1.png Parameter: imgFile--image name, such as: 1.png
# 返回：1*784 的 numpy 向量 Returns: 1*784 numpy vector
def img2vector(imgFile):
    """Load an image file and return it as a binarized row vector.

    The image is converted to mode ``'L'`` (grayscale), each pixel is divided
    by 255 and rounded, so every value becomes 0.0 or 1.0 (pixels >= 128 map
    to 1.0).

    Args:
        imgFile: Path of the image file to read, e.g. ``"1.png"``.

    Returns:
        A numpy array of shape ``(1, width * height)``; this is ``(1, 784)``
        when the image is 28px * 28px.
    """
    # The context manager releases the underlying file handle promptly; the
    # converted copy stays usable after the source image is closed.
    with Image.open(imgFile) as img:
        gray = img.convert('L')
    pixels = np.array(gray, 'i')
    # Dividing by 255 and rounding binarizes the grayscale values to 0/1.
    binarized = np.round(pixels / 255)
    # Flatten to a single row so it can be assigned into one matrix row.
    return np.reshape(binarized, (1, -1))
 
 
# 读取文件列表中的所有图像并转换成矩阵 Read every image in a file list and convert them into a matrix
# 参数：parameter:
#    imgFileList: 图像文件路径列表 imgFileList: list of image file paths, e.g.
#       MNIST_data/train/0/1.png
#    类标签取自图像所在目录的名称 The class label is taken from the name of the directory that contains the image
#       0,1,2,...,9
# 返回：(样本数量*784 的矩阵, 类标签列表) Returns: ([number of samples * 784] matrix, list of class labels)
def read_and_convert(imgFileList):
    """Convert a list of image files into a feature matrix and label list.

    Args:
        imgFileList: Sequence of image file paths. The name of each file's
            parent directory is used as its class label (e.g.
            ``MNIST_data/train/3/12.png`` is labelled ``"3"``).

    Returns:
        A tuple ``(dataMat, dataLabel)`` where ``dataMat`` is a
        ``len(imgFileList) x 784`` numpy array (one row per image) and
        ``dataLabel`` is a list of label strings in the same order.
    """
    dataNum = len(imgFileList)
    dataMat = np.zeros((dataNum, 784))  # one 784-value row per image
    dataLabel = []
    for row, img_path in enumerate(imgFileList):
        # Use os.path helpers rather than splitting on os.path.sep: paths in
        # this file are built with "/" literals, so on Windows a plain
        # split on "\\" would return "MNIST_data/train/0" instead of "0".
        dataLabel.append(os.path.basename(os.path.dirname(img_path)))
        dataMat[row, :] = img2vector(img_path)
    return dataMat, dataLabel
 
 
# 读取训练数据 Reading training data
def read_all_data():
    """Load the training images of all ten digit classes.

    Reads ``MNIST_data/train/<digit>`` for digits 0 through 9, printing the
    first directory path and then each digit as progress output.

    Returns:
        A tuple ``(dataMat, dataLabel)``: the row-wise concatenation of every
        class's feature matrix and the matching numpy array of labels.
    """
    train_root = 'MNIST_data/train/'
    class_mats = []
    class_labels = []
    for position, digit in enumerate('0123456789'):
        class_dir = train_root + digit
        if position == 0:
            # Only the first class directory is echoed in full.
            print(class_dir)
        print(digit)
        mat, labels = read_and_convert(get_file_list(class_dir))
        class_mats.append(mat)
        class_labels.append(labels)
    # Stack all classes in digit order.
    dataMat = np.concatenate(class_mats, axis=0)
    dataLabel = np.concatenate(class_labels, axis=0)
    return dataMat, dataLabel
 
'''
SVC参数
svm.SVC(C=1.0,kernel='rbf',degree=3,gamma='auto',coef0=0.0,shrinking=True,probability=False,
tol=0.001,cache_size=200,class_weight=None,verbose=False,max_iter=-1,decision_function_shape='ovr',random_state=None)

C：C-SVC的惩罚参数C?默认值是1.0
C越大，相当于惩罚松弛变量，希望松弛变量接近0，即对误分类的惩罚增大，趋向于对训练集全分对的情况，这样对训练集测试时
准确率很高，但泛化能力弱。C值小，对误分类的惩罚减小，允许容错，将他们当成噪声点，泛化能力较强。

kernel ：核函数，默认是rbf，可以是‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ 
       0 – 线性：u'v
 　　 1 – 多项式：(gamma*u'*v + coef0)^degree
  　　2 – RBF函数：exp(-gamma|u-v|^2)
  　　3 –sigmoid：tanh(gamma*u'*v + coef0)

degree ：多项式poly函数的维度，默认是3，选择其他核函数时会被忽略。（没用）

gamma ： ‘rbf’,‘poly’ 和‘sigmoid’的核函数参数。默认是’auto’，则会选择1/n_features

coef0 ：核函数的常数项。对于‘poly’和 ‘sigmoid’有用。（没用）

probability ：是否采用概率估计？.默认为False

shrinking ：是否采用shrinking heuristic方法，默认为true

tol ：停止训练的误差值大小，默认为1e-3

cache_size ：核函数cache缓存大小，默认为200

class_weight ：类别的权重，字典形式传递。设置第几类的参数C为weight*C(C-SVC中的C)

verbose ：允许冗余输出？

max_iter ：最大迭代次数。-1为无限制。

decision_function_shape ：‘ovo’ 或 ‘ovr’，默认为 ‘ovr’（选用ovr，一对多）

random_state ：数据洗牌时的种子值，int值

主要调节的参数有：C、kernel、degree、gamma、coef0

SVC parameters
svm.SVC(C=1.0,kernel='rbf',degree=3,gamma='auto',coef0=0.0,shrinking=True,probability=False,
tol=0.001,cache_size=200,class_weight=None,verbose=False,max_iter=-1,decision_function_shape='ovr',random_state=None)

C: The penalty parameter C of C-SVC. The default value is 1.0

The larger the C value, the more it is equivalent to penalizing the slack variable. We hope that the slack variable is close to 0, that is, the penalty for misclassification increases, tending to the situation where all training sets are classified correctly. In this way, when testing the training set, the accuracy is very high, but the generalization ability is weak. The smaller the C value, the smaller the penalty for misclassification, allowing fault tolerance, treating them as noise points, and the generalization ability is stronger.

kernel ：kernel function, default is rbf, can be ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’
0 – linear: u'v
1 – polynomial: (gamma*u'*v + coef0)^degree
2 – RBF function: exp(-gamma|u-v|^2)
3 –sigmoid: tanh(gamma*u'*v + coef0)

degree ：degree of the polynomial (‘poly’) kernel function, default is 3, ignored when other kernel functions are selected. (Not used here)

gamma ： kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’. ‘auto’ selects 1/n_features. Note: scikit-learn 0.22 and later default to ‘scale’, i.e. 1/(n_features * X.var()), rather than ‘auto’

coef0 ：constant term of kernel function. Useful for ‘poly’ and ‘sigmoid’. (Not used)

probability ：Whether to enable probability estimates. Default is False

shrinking: Whether to use the shrinking heuristic method, default is true

tol: The error value for stopping training, default is 1e-3

cache_size: The kernel function cache size, default is 200

class_weight: The weight of the category, passed in dictionary form. Set the parameter C of the category to weight*C (C in C-SVC)

verbose: Whether to enable verbose output.

max_iter: Maximum number of iterations. -1 means unlimited.

decision_function_shape: ‘ovo’ or ‘ovr’, default=‘ovr’ (ovr, i.e. one-vs-rest, is used here)

random_state: Seed value when shuffling data, int value

The main parameters to adjust are: C, kernel, degree, gamma, coef0
''' 
# 创建模型 Creating the Model
def create_svm(dataMat, dataLabel,path,decision='ovr'):
    """Train an RBF-kernel SVC on the given data and save it to *path*.

    Args:
        dataMat: Training feature matrix, one sample per row.
        dataLabel: Class labels, one per row of ``dataMat``.
        path: File path the fitted model is written to with ``joblib.dump``.
        decision: Value passed to ``decision_function_shape`` (default 'ovr').

    Returns:
        The fitted ``svm.SVC`` classifier.
    """
    clf = svm.SVC(C=1.0, kernel='rbf', decision_function_shape=decision)
    clf.fit(dataMat, dataLabel)
    # joblib.dump does not create missing directories; make sure the target
    # directory exists so a long training run is not lost at save time.
    model_dir = os.path.dirname(path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    joblib.dump(clf, path)
    return clf

 
if __name__ == '__main__':
    # Measure the CPU time (time.process_time) spent loading the training
    # data and fitting/saving the one-vs-rest SVM model.
    started = time.process_time()
    features, labels = read_all_data()
    create_svm(features, labels, 'model/svm.model', decision='ovr')
    elapsed = time.process_time() - started
    print("Training spent {:.4f}s.".format(elapsed))



MNIST_data/train/0
0
1
2
3
4
5
6
7
8
9
