In [0]:
pip install scikit-learn

In [0]:
import numpy as np
#path = "https://github.com/PacktPublishing/Learning-Data-Mining-with-Python-Second-Edition/blob/master/Chapter01/affinity_dataset.txt"
dataset_filename = "https://raw.githubusercontent.com/PacktPublishing/Learning-Data-Mining-with-Python-Second-Edition/master/Chapter01/affinity_dataset.txt"
X = np.loadtxt(dataset_filename)

In [0]:
print(X)

In [76]:
n_samples, n_features = X.shape
print(X[:5])

[[0. 1. 0. 0. 0.]
 [1. 1. 0. 0. 0.]
 [0. 0. 1. 0. 1.]
 [1. 1. 0. 0. 0.]
 [0. 0. 1. 1. 1.]]


In [0]:
features = ["bread","milk","cheese","apples","bananas"]

In [77]:
num_apple_purchases = 0
for sample in X:
    if sample[3] == 1:  
        num_apple_purchases += 1
print("{0} people bought Apples".format(num_apple_purchases))

43 people bought Apples


In [78]:
num_milk_purchases = 0
for sample in X:
    if sample[1] == 1:  
        num_milk_purchases += 1
print("{0} people bought Milk".format(num_milk_purchases))

52 people bought Milk


In [79]:
num_bread_purchases = 0
for sample in X:
    if sample[0] == 1:  
        num_bread_purchases += 1
print("{0} people bought Bread".format(num_bread_purchases))

28 people bought Bread


In [80]:
rule_valid = 0
rule_invalid = 0
for sample in X:
    if sample[3] == 1:  # This person bought Apples
        if sample[4] == 1:
            # This person bought both Apples and Bananas
            rule_valid += 1
        else:
            # This person bought Apples, but not Bananas
            rule_invalid += 1
print("{0} cases of the rule being valid were discovered".format(rule_valid))
print("{0} cases of the rule being invalid were discovered".format(rule_invalid))

27 cases of the rule being valid were discovered
16 cases of the rule being invalid were discovered


In [81]:
support = rule_valid  
confidence = rule_valid / num_apple_purchases
print("The support is {0} and the confidence is {1:.3f}.".format(support, confidence))
print("As a percentage, that is {0:.1f}%.".format(100 * confidence))

The support is 27 and the confidence is 0.628.
As a percentage, that is 62.8%.


In [0]:
from collections import defaultdict

valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0: continue
        
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:  
                continue
            if sample[conclusion] == 1:
                
                valid_rules[(premise, conclusion)] += 1
            else:
                
                invalid_rules[(premise, conclusion)] += 1
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]

In [83]:
for premise, conclusion in confidence:
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

Rule: If a person buys bread they will also buy milk
 - Confidence: 0.464
 - Support: 13

Rule: If a person buys milk they will also buy bread
 - Confidence: 0.250
 - Support: 13

Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.513
 - Support: 20

Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.351
 - Support: 20

Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.564
 - Support: 22

Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.512
 - Support: 22

Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.628
 - Support: 27

Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.474
 - Support: 27

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.346
 - Support: 18

Rule: If a person buys apples they will also buy milk
 - Confidence: 0.419
 - Support: 18

Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.519


In [0]:
def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

In [86]:
premise = 1
conclusion = 3
print_rule(premise, conclusion, support, confidence, features)

Rule: If a person buys milk they will also buy apples
 - Confidence: 0.346
 - Support: 18



In [87]:
from pprint import pprint
pprint(list(support.items()))

[((0, 1), 13),
 ((1, 0), 13),
 ((2, 4), 20),
 ((4, 2), 20),
 ((2, 3), 22),
 ((3, 2), 22),
 ((3, 4), 27),
 ((4, 3), 27),
 ((1, 3), 18),
 ((3, 1), 18),
 ((1, 4), 27),
 ((4, 1), 27),
 ((0, 2), 5),
 ((2, 0), 5),
 ((0, 4), 16),
 ((4, 0), 16),
 ((1, 2), 11),
 ((2, 1), 11),
 ((0, 3), 9),
 ((3, 0), 9)]


In [0]:
from operator import itemgetter
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

In [89]:
for index in range(8):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

Rule #1
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.628
 - Support: 27

Rule #2
Rule: If a person buys bananas they will also buy apples
 - Confidence: 0.474
 - Support: 27

Rule #3
Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.519
 - Support: 27

Rule #4
Rule: If a person buys bananas they will also buy milk
 - Confidence: 0.474
 - Support: 27

Rule #5
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.564
 - Support: 22

Rule #6
Rule: If a person buys apples they will also buy cheese
 - Confidence: 0.512
 - Support: 22

Rule #7
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.513
 - Support: 20

Rule #8
Rule: If a person buys bananas they will also buy cheese
 - Confidence: 0.351
 - Support: 20



In [0]:
sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

In [92]:
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)


Rule #1
Rule: If a person buys apples they will also buy bananas
 - Confidence: 0.628
 - Support: 27

Rule #2
Rule: If a person buys bread they will also buy bananas
 - Confidence: 0.571
 - Support: 16

Rule #3
Rule: If a person buys cheese they will also buy apples
 - Confidence: 0.564
 - Support: 22

Rule #4
Rule: If a person buys milk they will also buy bananas
 - Confidence: 0.519
 - Support: 27

Rule #5
Rule: If a person buys cheese they will also buy bananas
 - Confidence: 0.513
 - Support: 20



In [93]:
pip install sklearn



In [68]:
import numpy as np
# Load our dataset
#from sklearn.datasets import load_iris
#X, y = np.loadrom sklearn.dattxt("X_classification.txt"), np.loadtxt("y_classification.txt")
#dataset = load_iris()
#X = dataset.data
#y = dataset.target
#print(dataset.DESCR)
n_samples, n_features = X.shape

attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
X_d = np.array(X >= attribute_means, dtype='int')

from sklearn.model_selection import train_test_split

# Set the random state to the same number to get the same results as in the book
random_state = 14

X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape))



There are (112,) training samples
There are (38,) testing samples


In [0]:
def train(X, y_true, feature):
    """Computes the predictors and error for a given feature using the OneR algorithm
    
    Parameters
    ----------
    X: array [n_samples, n_features]
        The two dimensional array that holds the dataset. Each row is a sample, each column
        is a feature.
    
    y_true: array [n_samples,]
        The one dimensional array that holds the class values. Corresponds to X, such that
        y_true[i] is the class value for sample X[i].
    
    feature: int
        An integer corresponding to the index of the variable we wish to test.
        0 <= variable < n_features
        
    Returns
    -------
    predictors: dictionary of tuples: (value, prediction)
        For each item in the array, if the variable has a given value, make the given prediction.
    
    error: float
        The ratio of training data that this rule incorrectly predicts.
    """
    # Check that variable is a valid number
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    # Get all of the unique values that this variable has
    values = set(X[:,feature])
    # Stores the predictors array that is returned
    predictors = dict()
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)
    # Compute the total error of using this feature to classify on
    total_error = sum(errors)
    return predictors, total_error

In [0]:
def train_feature_value(X, y_true, feature, value):
    # Create a simple dictionary to count how frequency they give certain predictions
    class_counts = defaultdict(int)
    # Iterate through each sample and count the frequency of each class/value pair
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
    # Now get the best one by sorting (highest first) and choosing the first item
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
    # The error is the number of samples that do not classify as the most frequent class
    # *and* have the feature value.
    n_samples = X.shape[1]
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error


In [71]:
# Compute all of the predictors
all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}
# Now choose the best and save that as "model"
# Sort by error
best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

The best model is based on variable 2 and has error 37.00


In [72]:
model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

{'variable': 2, 'predictor': {0: 0, 1: 2}}


In [73]:
def predict(X_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

y_predicted = predict(X_test, model)
print(y_predicted)


[0 0 0 2 2 2 0 2 0 2 2 0 2 2 0 2 0 2 2 2 0 0 0 2 0 2 0 2 2 0 0 0 2 0 2 0 2
 2]


In [74]:
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))


from sklearn.metrics import classification_report

print(classification_report(y_test, y_predicted))

The test accuracy is 65.8%
              precision    recall  f1-score   support

           0       0.94      1.00      0.97        17
           1       0.00      0.00      0.00        13
           2       0.40      1.00      0.57         8

    accuracy                           0.66        38
   macro avg       0.45      0.67      0.51        38
weighted avg       0.51      0.66      0.55        38



  _warn_prf(average, modifier, msg_start, len(result))


# **little analysis project**



In [0]:
import numpy as np
import pandas as pd

dataset_filename = "data_url"
X = np.loadtxt(dataset_filename)

#X = pd.read_csv(dataset_filename)

n_samples, n_features = X.shape

features = str(input("please input the the name of items: "))

#features = X.head()


valid_rules = defaultdict(int)
invalid_rules = defaultdict(int)
num_occurences = defaultdict(int)

for sample in X:
    for premise in range(n_features):
        if sample[premise] == 0: continue
        
        num_occurences[premise] += 1
        for conclusion in range(n_features):
            if premise == conclusion:  
                continue
            if sample[conclusion] == 1:
               
                valid_rules[(premise, conclusion)] += 1
            else:
               
                invalid_rules[(premise, conclusion)] += 1
support = valid_rules
confidence = defaultdict(float)
for premise, conclusion in valid_rules.keys():
    confidence[(premise, conclusion)] = valid_rules[(premise, conclusion)] / num_occurences[premise]


def print_rule(premise, conclusion, support, confidence, features):
    premise_name = features[premise]
    conclusion_name = features[conclusion]
    print("Rule: If a person buys {0} they will also buy {1}".format(premise_name, conclusion_name))
    print(" - Confidence: {0:.3f}".format(confidence[(premise, conclusion)]))
    print(" - Support: {0}".format(support[(premise, conclusion)]))
    print("")

print("search for the perticular rule")
print("")
premise = input("please input the premise: ")
conclusion = input("please input the conclusion: ")
print_rule(premise, conclusion, support, confidence, features)

print("the list of shopping behaviors")
print("")
from pprint import pprint
pprint(list(support.items()))
print("")


from operator import itemgetter
sorted_support = sorted(support.items(), key=itemgetter(1), reverse=True)

print("This is a list of rules rank by supports")
print("")
rules_num = input("how many rules you want to see? ")

for index in range(int(rules_num)):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_support[index][0]
    print_rule(premise, conclusion, support, confidence, features)

sorted_confidence = sorted(confidence.items(), key=itemgetter(1), reverse=True)

print("This is a list of rules rank by confidence")
print("")
rules_num = input("how many rules you want to see? ")
print("")
for index in range((int(rules_num)):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print_rule(premise, conclusion, support, confidence, features)

print("")
from matplotlib import pyplot as plt 
print("This is a graph")
rule_num = int(input("which rule you want to see? "))
print("")
plt.plot([confidence[rule[rule_num]] for rule in sorted_confidence)


# **little predict project**

In [55]:
import numpy as np

n_samples, n_features = X.shape

attribute_means = X.mean(axis=0)
assert attribute_means.shape == (n_features,)
X_d = np.array(X >= attribute_means, dtype='int')

from sklearn.model_selection import train_test_split

rs = int(input("please input the random state"))
random_state = rs

X_train, X_test, y_train, y_test = train_test_split(X_d, y, random_state=random_state)
print("There are {} training samples".format(y_train.shape))
print("There are {} testing samples".format(y_test.shape)


#from collections import defaultdict
#from operator import itemgetter


def train(X, y_true, feature):
 
    n_samples, n_features = X.shape
    assert 0 <= feature < n_features
    
    values = set(X[:,feature])

    predictors = dict()
    errors = []
    for current_value in values:
        most_frequent_class, error = train_feature_value(X, y_true, feature, current_value)
        predictors[current_value] = most_frequent_class
        errors.append(error)

  total_error = sum(errors)
  return predictors, total_error


def train_feature_value(X, y_true, feature, value):
  
    class_counts = defaultdict(int)
  
    for sample, y in zip(X, y_true):
        if sample[feature] == value:
            class_counts[y] += 1
  
    sorted_class_counts = sorted(class_counts.items(), key=itemgetter(1), reverse=True)
    most_frequent_class = sorted_class_counts[0][0]
  
  
    n_samples = X.shape[1]
    error = sum([class_count for class_value, class_count in class_counts.items()
                 if class_value != most_frequent_class])
    return most_frequent_class, error


all_predictors = {variable: train(X_train, y_train, variable) for variable in range(X_train.shape[1])}
errors = {variable: error for variable, (mapping, error) in all_predictors.items()}

best_variable, best_error = sorted(errors.items(), key=itemgetter(1))[0]
print("The best model is based on variable {0} and has error {1:.2f}".format(best_variable, best_error))

model = {'variable': best_variable,
         'predictor': all_predictors[best_variable][0]}
print(model)

def predict(X_test, model):
    variable = model['variable']
    predictor = model['predictor']
    y_predicted = np.array([predictor[int(sample[variable])] for sample in X_test])
    return y_predicted

y_predicted = predict(X_test, model)
print(y_predicted)


# Compute the accuracy by taking the mean of the amounts that y_predicted is equal to y_test
accuracy = np.mean(y_predicted == y_test) * 100
print("The test accuracy is {:.1f}%".format(accuracy))

from sklearn.metrics import classification_report

print(classification_report(y_test, y_predicted))

SyntaxError: ignored