KNN算法中将文本数据转化为NumPy矩阵的程序

In [2]:
def file2matrix(filename):
    """
    Desc:
        Parse a tab-separated text file into a feature matrix and a label list.
        On each non-blank line the first three fields are read as floats and
        the last field is read as an integer class label.
    parameters:
        filename: path of the text file to read
    return:
        (returnMat, classLabelVec) where returnMat is an (n, 3) NumPy array
        and classLabelVec is a list of n ints, n being the number of
        non-blank lines in the file
    raises:
        ValueError if a field cannot be converted to float / int
    """
    # Read the file exactly once: a second pass over the same handle would
    # yield nothing because the first one leaves the cursor at end-of-file.
    # The context manager also guarantees the handle is closed.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Drop blank lines (e.g. a trailing newline) so they neither reserve an
    # all-zero matrix row nor crash the int() conversion below.
    rows = [row for row in rows if row]

    # One matrix row per sample; this data set has three feature columns.
    returnMat = zeros((len(rows), 3))
    classLabelVec = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        # Feature columns
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # Class label is the last column
        classLabelVec.append(int(listFromLine[-1]))
    return returnMat, classLabelVec

数据归一化处理：消除特征之间量级不同导致的影响

In [3]:
def nomalData(dataSet):
    """
    Desc:
        Min-max normalise every column of dataSet:
        (value - column_min) / (column_max - column_min).
        A column whose values are all equal has a range of 0; such a column
        is normalised to all zeros instead of producing NaN from 0/0.
    parameters:
        dataSet: 2-D NumPy array, one sample per row, one feature per column
    return:
        (normalDataSet, ranges, minValue)
        normalDataSet: array of the same shape as dataSet
        ranges: per-column max - min (the true range, may contain 0)
        minValue: per-column minimum
    """
    # Per-column minimum / maximum (axis 0 collapses the rows).
    minValue = dataSet.min(0)
    maxValue = dataSet.max(0)
    # Named `ranges` so the builtin range() is not shadowed.
    ranges = maxValue - minValue

    # Divide by 1 instead of 0 for constant columns; the numerator is 0 there,
    # so the normalised value becomes 0.
    safeRanges = ranges.copy()
    safeRanges[safeRanges == 0] = 1

    # Broadcasting applies the row vectors to every row of dataSet, which is
    # what tiling them m times achieved.
    normalDataSet = (dataSet - minValue) / safeRanges
    return normalDataSet, ranges, minValue

KNN算法：
       对于每一个数据集中的数据点：
       计算目标数据点（待分类的数据点）与该数据点的距离
       将距离排序：从小到大
       选取K个最短距离
       选取这K个最近邻中出现次数最多的类别
       返回该类别来作为目标数据点的预测值。

In [5]:
def classify0(intX, dateSet, labels, k):
    """
    Desc:
        k-nearest-neighbours classifier using Euclidean distance.
        The k samples of dateSet closest to intX vote with their labels and
        the label with the most votes wins. On a tie, the label that was
        seen first (i.e. belongs to the nearer neighbour) wins.
    parameters:
        intX: 1-D feature vector to classify
        dateSet: 2-D NumPy array of known samples, one per row
        labels: sequence of labels, labels[i] belongs to dateSet[i]
        k: number of neighbours that vote; values larger than the number of
           samples use every sample
    return:
        the winning label
    raises:
        ValueError if k < 1 or dateSet has no rows
    """
    # The parameter names are kept as declared so existing calls keep working;
    # the body uses the conventional spellings.
    inX, dataSet = intX, dateSet
    if k < 1:
        raise ValueError("k must be at least 1")
    if dataSet.shape[0] == 0:
        raise ValueError("dataSet must contain at least one sample")

    # Euclidean distance from inX to every row (inX is broadcast over rows).
    diffMat = dataSet - inX
    sqDistance = (diffMat ** 2).sum(axis=1)
    distances = sqDistance ** 0.5

    # Row indices ordered from nearest to farthest.
    sortedDistance = distances.argsort()

    # Tally the labels of the k nearest rows; slicing clamps k to the number
    # of available samples.
    classCount = {}
    for neighbour in sortedDistance[:k]:
        voteLabel = labels[neighbour]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # max() returns the first label that reaches the highest count.
    return max(classCount, key=classCount.get)
     

In [9]:
def datingClassTest(filename='data/2.KNN/knnDataSet.txt', hoRatio=0.1, k=3):
    """
    Desc:
        Hold-out test of the kNN classifier on the dating-site data.
        The first int(m * hoRatio) normalised rows are classified against the
        remaining rows, and every prediction plus the overall error rate is
        printed.
    parameters:
        filename: tab-separated data file passed to file2matrix
        hoRatio: fraction of the rows used as test samples
                 (training fraction = 1 - hoRatio)
        k: number of neighbours passed to classify0
    return:
        the number of misclassified test samples (float)
    """
    # Load the data from file
    datingDataMat, datingLabels = file2matrix(filename)
    # Normalise the features with nomalData, the normalisation routine
    # defined in this file.
    normMat, ranges, minVals = nomalData(datingDataMat)
    # m is the number of rows, i.e. the first dimension of the matrix
    m = normMat.shape[0]
    # Rows [0, numTestVecs) are test samples, rows [numTestVecs, m) are the
    # training samples.
    numTestVecs = int(m * hoRatio)
    print('numTestVecs=', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], k)
        print("the classifier came back with: %d, the real answer is: %d" % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # With no test samples there is no rate to report; skipping the line
    # avoids dividing by zero.
    if numTestVecs:
        print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)
    return errorCount
    

In [10]:
def classifyPerson(filename='knnDataSet.txt'):
    """
    Desc:
        Interactively ask for three feature values, classify them with kNN
        (k = 3) against the data in filename and print the predicted category.
    parameters:
        filename: tab-separated data file passed to file2matrix
    return:
        none (the prediction is printed)
    raises:
        ValueError if an answer cannot be converted to float
    """
    # raw_input only exists on Python 2; on Python 3 the equivalent is input().
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(readLine("percentage of time spent playing video games ?"))
    ffMiles = float(readLine("frequent filer miles earned per year?"))
    iceCream = float(readLine("liters of ice cream consumed per year?"))

    datingDataMat, datingLabels = file2matrix(filename)
    # Normalise the features with nomalData, the normalisation routine
    # defined in this file.
    normMat, ranges, minVals = nomalData(datingDataMat)

    # The query vector is scaled with the same minimum / range as the
    # training data before it is classified.
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # resultList is indexed with classifierResult - 1, so labels are
    # presumably 1-based (1..3) - confirm against the data file.
    print("You will probably like this person: ", resultList[classifierResult - 1])