# 1.3 亲和性分析示例

## 1.3.3 在NumPy中加载数据集

In [1]:
# new version: 使用 Pandas 读取
import pandas as pd

features = ["bread", "milk", "cheese", "apples", "bananas"]
X_new = pd.read_csv("./affinity_dataset.txt", sep=" ", names=features)
n_samples, n_features = X_new.shape
X_new[:5]

Unnamed: 0,bread,milk,cheese,apples,bananas
0,0,0,1,1,1
1,1,1,0,1,0
2,1,0,1,1,0
3,0,0,1,1,1
4,0,1,0,0,1


In [1]:
import numpy as np

dataset_filename = "./affinity_dataset.txt"
X = np.loadtxt(dataset_filename)
n_samples, n_features = X.shape
features = ["bread", "milk", "cheese", "apples", "bananas"]
X[:5]

[[0. 0. 1. 1. 1.]
 [1. 1. 0. 1. 0.]
 [1. 0. 1. 1. 0.]
 [0. 0. 1. 1. 1.]
 [0. 1. 0. 0. 1.]]


## 1.3.4 实现简单的排序规则

- **支持度**：数据集中规则鹰眼的次数

- **置信度**：衡量规则准确率

In [2]:
# new version
print("{0} people bought Apples.".format(X_new["apples"].sum()))

36 people bought Apples.


In [3]:
num_apple_purchases = 0

for sample in X:
    if sample[3] == 1:
        num_apple_purchases += 1

print(f"{num_apple_purchases} people bought Apples.")

36 people bought Apples.


In [3]:
# new version
rule_valid = X_new.loc[(X_new['apples'] == 1) & (X_new['bananas'] == 1)].shape[0]
rule_invalid = X_new.loc[(X_new['apples'] == 1) & (X_new['bananas'] == 0)].shape[0]

print(f"{rule_valid} cases of the rule being valid were discovered.")
print(f"{rule_invalid} cases of the rule being invalid were discovered.")

21 cases of the rule being valid were discovered.
15 cases of the rule being invalid were discovered.


In [2]:
# 买了苹果也买了香蕉
rule_valid = 0
rule_invalid = 0

for sample in X:
    if sample[3] == 1:
        if sample[4] == 1:
            rule_valid += 1
        else:
            rule_invalid += 1

print(f"{rule_valid} cases of the rule being valid were discovered.")
print(f"{rule_invalid} cases of the rule being invalid were discovered.")

21 cases of the rule being valid were discovered.
15 cases of the rule being invalid were discovered.


In [6]:
support = rule_valid
confidence = rule_valid / num_apple_purchases

print(f"The support is {support} and the confidence is {confidence:.3f}")
print(f"As a percentage, that is {100 * confidence:.1f}%.")

The support is 21 and the confidence is 0.583
As a percentage, that is 58.3%.


In [2]:
from collections import defaultdict

valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(4):
        if sample[premise] == 0: continue

        num_occurences[premise] += 1

        for conclusion in range(n_features):
            if premise == conclusion: continue
        
            if sample[conclusion] == 1:
                valid_rules[(premise, conclusion)] += 1
            else:
                invalid_rules[(premise, conclusion)] += 1

support = valid_rules
confidence = defaultdict(float)

for premise, conclusion in valid_rules.keys():
    rule = (premise, conclusion)
    confidence[rule] = valid_rules[rule] / num_occurences[premise]

In [4]:
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print(f"Rule: If a preson buys {premise_name} they will also buy {conclusion_name}")
    print(" - Support: {0}".format(support[premise, conclusion]))
    print(" - Confidence: {0:.3f}".format(confidence[premise, conclusion]))

In [9]:
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a preson buys milk they will also buy apples
 - Support: 9
 - Confidence: 0.196


In [5]:
## 1.3.5 排序找出最佳规则
from operator import itemgetter

sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

for idx in range(5):
    print(f"Rule #{idx + 1}")
    premise, conclusion = sorted_support[idx][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a preson buys cheese they will also buy bananas
 - Support: 27
 - Confidence: 0.659
Rule #2
Rule: If a preson buys cheese they will also buy apples
 - Support: 25
 - Confidence: 0.610
Rule #3
Rule: If a preson buys apples they will also buy cheese
 - Support: 25
 - Confidence: 0.694
Rule #4
Rule: If a preson buys apples they will also buy bananas
 - Support: 21
 - Confidence: 0.583
Rule #5
Rule: If a preson buys milk they will also buy bananas
 - Support: 19
 - Confidence: 0.413


In [6]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

for idx in range(5):
    print(f"Rule #{idx + 1}")
    premise, conclusion = sorted_confidence[idx][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a preson buys apples they will also buy cheese
 - Support: 25
 - Confidence: 0.694
Rule #2
Rule: If a preson buys cheese they will also buy bananas
 - Support: 27
 - Confidence: 0.659
Rule #3
Rule: If a preson buys bread they will also buy bananas
 - Support: 17
 - Confidence: 0.630
Rule #4
Rule: If a preson buys cheese they will also buy apples
 - Support: 25
 - Confidence: 0.610
Rule #5
Rule: If a preson buys apples they will also buy bananas
 - Support: 21
 - Confidence: 0.583


# 1.4 分类问题的简单示例

# 1.5 什么是分类

## 1.5.1 准备数据集

In [1]:
import numpy as np
from sklearn.datasets import load_iris

In [9]:
dataset = load_iris()
X = dataset.data
y = dataset.target
print(dataset.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [10]:
attributes_mean = X.mean(axis=0)
attributes_mean

array([5.84333333, 3.05733333, 3.758     , 1.19933333])

In [15]:
X_d = np.array(X >= attributes_mean, dtype='int')
X_d[:5]

array([[0, 1, 0, 0],
       [0, 0, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0],
       [0, 1, 0, 0]])

## 1.5.2 实现 OneR 算法

In [16]:
from collections import defaultdict
from operator import itemgetter


def train_feature_value(X, y_true, feature_index, value):
    """使用 OneR 算法，对于给定特征，计算 predictors 和 error

    Args:
        X ([ndarray]): 训练集
        y_true ([ndarray]): 真实 Label
        feature_index ([int]): 特征索引
        value ([type]): 特征值

    Returns:
        [tuple]: 返回频率最多的 class 和 error
    """
    class_counts = defaultdict(int)

    for sample, y in zip(X, y_true):
        if sample[feature_index] == value:
            class_counts[y] += 1

    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]

    # 计算该条规则的错误率
    incorrect_predictions = [class_count for class_value, class_count in class_counts.items()
                                        if class_value != most_frequent_class]
    error = sum(incorrect_predictions)

    return most_frequent_class, error

In [56]:
def train_on_feature(X, y_true, feature_index):
    values = set(X[:, feature_index])
    predictors, errors = {}, []

    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature_index, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)

    total_error = sum(errors)
    return predictors, total_error

## 1.5.3 测试算法

In [63]:
from sklearn.model_selection import train_test_split


Xd_train, Xd_test, y_train, y_test = train_test_split(X_d, y, random_state=14)
all_predictors, errors = {}, {}

for feature_index in range(Xd_train.shape[1]):
    predcitors, total_error = train_on_feature(Xd_train, y_train, feature_index)
    all_predictors[feature_index] = (predcitors, total_error)
    errors[feature_index] = total_error

best_feature, best_error = sorted(errors.items(), key=itemgetter(1))[0]
model = {'variable': best_feature, 'predictor': all_predictors[best_feature][0]}

In [65]:
def predict(X_test, model):

    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

In [66]:
y_predicted = predict(Xd_test, model)
accuracy = np.mean(y_predicted == y_test) * 100
print(f"The test accuracy is {accuracy:.1f}%")

The test accuracy is 65.8%
