In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#### [Problem 1] Apriori algorithm

In [142]:
from itertools import combinations
from collections import Counter

In [144]:
items = [
    [2, 3, 4, 5, 6, 8],
    [1, 2, 3, 5, 6],
    [1, 4, 5, 7, 8],
    [2, 3, 4, 5, 6],
    [1, 2, 3, 4, 5, 7],
    [1, 3, 8]
]

items_list = [set(i) for i in items]
print(items_list)

[{2, 3, 4, 5, 6, 8}, {1, 2, 3, 5, 6}, {1, 4, 5, 7, 8}, {2, 3, 4, 5, 6}, {1, 2, 3, 4, 5, 7}, {8, 1, 3}]


##### [1.1] All candidate itemsets and frequent itemsets

In [147]:
class Apriori:
    def __init__(self, items_list, min_support):
        self.items_list = items_list
        self.min_support = min_support
        self.frequent_itemsets = []
        self.candidates = []

    #L1 and C1 values generated
    def L1_C1_values(self):
        item_counts = Counter(item for i in self.items_list for item in i)
        C1 = {(item,): count for item, count in item_counts.items() }
        L1 = {itemset: count for itemset, count in C1.items() if count >= self.min_support}
        return C1, L1

    #candidate k-itemsets from previous frequent itemsets
    def candidate_itemsets(self, prev_freq, k):
        items = sorted(set(item for subset in prev_freq for item in subset))
        return [tuple(sorted(c)) for c in combinations(items, k)]

    #counting number of times each candidate appears in items_list
    def candidates_count(self, candidates):
        counter = Counter()
        for t in self.items_list:
            for candidate in candidates:
                if set(candidate).issubset(t):
                    counter[candidate] += 1
        return dict(counter)

    #keep items in itemset that are >= minimum_support
    def most_freq(self, candidate_counts):
        return {itemset: count for itemset, count in candidate_counts.items() if count >= self.min_support}

    #to run algorithm
    def apriori_algo(self):
        C1, L1 = self.L1_C1_values()
        self.candidates.append(C1)
        self.frequent_itemsets.append(L1)

        k = 2
        current_L = L1
        while current_L:
            candidates_k = self.candidate_itemsets(current_L, k)
            Ck = self.candidates_count(candidates_k)
            Lk = self.most_freq(Ck)

            if not Lk:
                break
            self.candidates.append(Ck)
            self.frequent_itemsets.append(Lk)
            current_L = Lk
            k += 1

    #print itemsets for both
    def candidate_frequent(self):
        for i in range(len(self.frequent_itemsets)):
            print(f"\nC{i+1}:")
            for itemset, count in sorted(self.candidates[i].items()):
                print(f"{itemset}: {count}")

            print(f"\nL{i+1}:")
            for itemset, count in sorted(self.frequent_itemsets[i].items()):
                print(f"{itemset}: {count}")

In [149]:
#run algorithm
apriori = Apriori(items_list, min_support=3)
apriori.apriori_algo()
apriori.candidate_frequent()


C1:
(1,): 4
(2,): 4
(3,): 5
(4,): 4
(5,): 5
(6,): 3
(7,): 2
(8,): 3

L1:
(1,): 4
(2,): 4
(3,): 5
(4,): 4
(5,): 5
(6,): 3
(8,): 3

C2:
(1, 2): 2
(1, 3): 3
(1, 4): 2
(1, 5): 3
(1, 6): 1
(1, 8): 2
(2, 3): 4
(2, 4): 3
(2, 5): 4
(2, 6): 3
(2, 8): 1
(3, 4): 3
(3, 5): 4
(3, 6): 3
(3, 8): 2
(4, 5): 4
(4, 6): 2
(4, 8): 2
(5, 6): 3
(5, 8): 2
(6, 8): 1

L2:
(1, 3): 3
(1, 5): 3
(2, 3): 4
(2, 4): 3
(2, 5): 4
(2, 6): 3
(3, 4): 3
(3, 5): 4
(3, 6): 3
(4, 5): 4
(5, 6): 3

C3:
(1, 2, 3): 2
(1, 2, 4): 1
(1, 2, 5): 2
(1, 2, 6): 1
(1, 3, 4): 1
(1, 3, 5): 2
(1, 3, 6): 1
(1, 4, 5): 2
(1, 5, 6): 1
(2, 3, 4): 3
(2, 3, 5): 4
(2, 3, 6): 3
(2, 4, 5): 3
(2, 4, 6): 2
(2, 5, 6): 3
(3, 4, 5): 3
(3, 4, 6): 2
(3, 5, 6): 3
(4, 5, 6): 2

L3:
(2, 3, 4): 3
(2, 3, 5): 4
(2, 3, 6): 3
(2, 4, 5): 3
(2, 5, 6): 3
(3, 4, 5): 3
(3, 5, 6): 3

C4:
(2, 3, 4, 5): 3
(2, 3, 4, 6): 2
(2, 3, 5, 6): 3
(2, 4, 5, 6): 2
(3, 4, 5, 6): 2

L4:
(2, 3, 4, 5): 3
(2, 3, 5, 6): 3


##### [1.2] Strong Rules with format {W, X} => {Y, Z}

In [152]:
L4 = apriori.frequent_itemsets[3]
print(L4)

{(2, 3, 4, 5): 3, (2, 3, 5, 6): 3}


In [154]:
#sort by item number/lexographically
sorted_L4 = sorted(L4.keys())
print(sorted_L4)

[(2, 3, 4, 5), (2, 3, 5, 6)]


In [156]:
#get the first itemset of the sorted itemsets
if sorted_L4:
    first_4itemset = sorted_L4[0]
    print("First frequent 4-itemset:", {first_4itemset})
else:
    print("No frequent 4-itemsets found.")

First frequent 4-itemset: {(2, 3, 4, 5)}


In [158]:
#mine strong rules
min_confidence = 0.8
items_list = apriori.items_list 

def suppcount(itemset):
    return sum(1 for t in items_list if set(itemset).issubset(t))

if sorted_L4:
    itemset = first_4itemset
    items = list(itemset)
    found = False
    print("All Strong Rules that have the form {W, X} => {Y, Z} & minimum confidence of 80%:")
    for prev in combinations(items, 2):
        next_elem = tuple(sorted(set(items) - set(prev)))
        support_prev = suppcount(prev)
        support_both = suppcount(itemset)
        if support_prev:
            confidence = support_both / support_prev
        else:
            confidence = 0
        if confidence >= min_confidence:
            print(f"{set(prev)} => {set(next_elem)} (confidence = {confidence:.2f})")
            found = True
    if not found:
        print("No strong rules found.")

All Strong Rules that have the form {W, X} => {Y, Z} & minimum confidence of 80%:
{2, 4} => {3, 5} (confidence = 1.00)
{3, 4} => {2, 5} (confidence = 1.00)


#### [Problem 2]

In [161]:
bas = pd.read_csv('basketanalysis.csv')
bas.head()

Unnamed: 0.1,Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


##### [2.1] Mining Rules w/minimum support and minimum confidence 

In [164]:
bas = bas.iloc[:, 1:]
bas.head()

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [166]:
#converting to boolean values
bas = bas.map(lambda x: True if str(x).strip().lower() == "true" else False)
bas.head()

Unnamed: 0,Apple,Bread,Butter,Cheese,Corn,Dill,Eggs,Ice cream,Kidney Beans,Milk,Nutmeg,Onion,Sugar,Unicorn,Yogurt,chocolate
0,False,True,False,False,True,True,False,True,False,False,False,False,True,False,True,True
1,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,True,False,True,False,False,True,False,True,False,True,False,False,False,False,True,True
3,False,False,True,True,False,True,False,False,False,True,True,True,False,False,False,False
4,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [168]:
from mlxtend.frequent_patterns import apriori, association_rules

In [170]:
#mine frequent items w/min_support
frequent_itemsets = apriori(bas, min_support=0.15, use_colnames=True)
print(frequent_itemsets)

      support              itemsets
0    0.383383               (Apple)
1    0.384384               (Bread)
2    0.420420              (Butter)
3    0.404404              (Cheese)
4    0.407407                (Corn)
..        ...                   ...
131  0.191191       (Yogurt, Sugar)
132  0.188188    (Sugar, chocolate)
133  0.184184     (Yogurt, Unicorn)
134  0.186186  (Unicorn, chocolate)
135  0.198198   (Yogurt, chocolate)

[136 rows x 2 columns]


In [172]:
#mine association rules w/min_confidence
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
print(rules)

   antecedents  consequents  antecedent support  consequent support   support  \
0      (Bread)     (Yogurt)            0.384384            0.420420  0.193193   
1  (Ice cream)     (Butter)            0.410410            0.420420  0.207207   
2       (Dill)  (chocolate)            0.398398            0.421421  0.199199   
3  (chocolate)       (Milk)            0.421421            0.405405  0.211211   
4       (Milk)  (chocolate)            0.405405            0.421421  0.211211   

   confidence      lift  representativity  leverage  conviction  \
0    0.502604  1.195480               1.0  0.031590    1.165228   
1    0.504878  1.200889               1.0  0.034662    1.170579   
2    0.500000  1.186461               1.0  0.031306    1.157157   
3    0.501188  1.236263               1.0  0.040365    1.192021   
4    0.520988  1.236263               1.0  0.040365    1.207857   

   zhangs_metric   jaccard  certainty  kulczynski  
0       0.265614  0.315876   0.141799    0.481064  
1     

In [174]:
#the number of rules mined
print("Number of rules mined:", len(rules))

Number of rules mined: 5


##### [2.2] Rules w/highest confidence

In [177]:
#rule w/highest confidence
max_confidence = rules.loc[rules['confidence'].idxmax()]
print(max_confidence)

antecedents                (Milk)
consequents           (chocolate)
antecedent support       0.405405
consequent support       0.421421
support                  0.211211
confidence               0.520988
lift                     1.236263
representativity              1.0
leverage                 0.040365
conviction               1.207857
zhangs_metric            0.321413
jaccard                  0.343089
certainty                0.172088
kulczynski               0.511088
Name: 4, dtype: object


In [179]:
#Capture the portion of the output that states the rule, as well as the support, confidence, coverage, and lift.
print("Rule with the highest confidence:")
print(f"Rule: {set(max_confidence['antecedents'])} => {set(max_confidence['consequents'])}")
print(f"Support: {max_confidence['support']:.3f}")
print(f"Confidence: {max_confidence['confidence']:.3f}")
print(f"Coverage: {max_confidence['support']/max_confidence['confidence']:.3f}")
print(f"Lift: {max_confidence['lift']:.3f}")

Rule with the highest confidence:
Rule: {'Milk'} => {'chocolate'}
Support: 0.211
Confidence: 0.521
Coverage: 0.405
Lift: 1.236


##### [2.3] Number of transactions

In [182]:
num_transactions = len(bas)

antecedent = max_confidence['antecedents']
consequent = max_confidence['consequents']

#transactions w/both antecedents and consequents
both_antcon = bas[list(antecedent | consequent)].all(axis=1).sum()

#transactions w/antecedent only
ant = bas[list(antecedent)].all(axis=1).sum()

#transactions w/consequent only
con = bas[list(consequent)].all(axis=1).sum()

In [184]:
#coverage, support, confidence, and lift for the rule
coverage = ant/num_transactions
support = both_antcon/num_transactions

if coverage > 0:
    confidence = support/coverage 
else:
    confidence = 0

if con > 0:
    lift = confidence/(con/num_transactions) 
else:
    lift = 0

In [186]:
#print results
print(f"Number of transactions: {num_transactions}")
print(f"Transactions with both antecedent and consequent: {both_antcon}")
print(f"Transactions with antecedent: {ant}")
print(f"Transactions with consequent: {con}")
print(f"Coverage: {coverage:.3f}")
print(f"Support: {support:.3f}")
print(f"Confidence: {confidence:.3f}")
print(f"Lift: {lift:.3f}")

Number of transactions: 999
Transactions with both antecedent and consequent: 211
Transactions with antecedent: 405
Transactions with consequent: 421
Coverage: 0.405
Support: 0.211
Confidence: 0.521
Lift: 1.236


#### [Problem 3] User-based collaborative filtering model

In [189]:
bookdf = pd.read_csv('bookratings-small.csv')
bookdf.head()

Unnamed: 0,User.ID,ISBN,Book.Rating
0,277427,002542730X,10
1,277427,003008685X,8
2,277427,0060006641,10
3,277427,0060542128,7
4,277427,0061009059,9


In [191]:
bookdf['User.ID'].value_counts()

User.ID
11676     8524
98391     5802
153662    1969
189835    1906
23902     1395
          ... 
195904      76
146230      76
26883       76
263733      76
174848      76
Name: count, Length: 754, dtype: int64

##### [3.1] realRatingMatrix and fit ubcf model

In [194]:
#realRatingMatrix mimic in python through sparse matrix
from scipy.sparse import coo_matrix

userMap = {u: i for i, u in enumerate(bookdf['User.ID'].unique())}
bookMap = {b: i for i, b in enumerate(bookdf['ISBN'].unique())}

rows = bookdf['User.ID'].map(userMap)
cols = bookdf['ISBN'].map(bookMap)
rating = bookdf['Book.Rating']

rr_matrix = coo_matrix((rating, (rows, cols)), shape=(len(userMap), len(bookMap))).tocsr()
rr_matrix

<754x88665 sparse matrix of type '<class 'numpy.int64'>'
	with 139984 stored elements in Compressed Sparse Row format>

In [196]:
#holding out 3 users randomly
holdouts = [277427, 276680, 195904]

train_mask = ~bookdf['User.ID'].isin(holdouts)
train_df = bookdf[train_mask]
test_df = bookdf[~train_mask]

In [198]:
#fit the model
from sklearn.neighbors import NearestNeighbors

rows_train = train_df['User.ID'].map(userMap)
cols_train = train_df['ISBN'].map(bookMap)
rating_train = train_df['Book.Rating']

matrix_train = coo_matrix((rating_train, (rows_train, cols_train)),shape=(len(userMap), len(bookMap))).tocsr()

cf = NearestNeighbors(n_neighbors=5, metric='cosine')
cf.fit(matrix_train)

##### [3.2] Making predictions on holdout users

In [201]:
#predict on holdouts
test_users = test_df['User.ID'].unique()
for user in test_users:
    idx = userMap[user]
    
    vec = rr_matrix[idx].toarray().reshape(1, -1)
    distances, indices = cf.kneighbors(vec)

    neighbor_ratings = matrix_train[indices[0]].mean(axis=0)

    predictions = {
        list(bookMap.keys())[i]: round(float(neighbor_ratings[0, i]),0)
        for i in range(neighbor_ratings.shape[1])
        if neighbor_ratings[0, i] > 0
    }
    print(f"Predictions for user {user}:")
    print(predictions)

Predictions for user 277427:
{'002542730X': 1.0, '0061009059': 2.0, '0316776963': 7.0, '0375751513': 1.0, '0380791978': 2.0, '0385504209': 1.0, '0399149562': 1.0, '0552137030': 1.0, '0679731148': 1.0, '0679736042': 2.0, '0743457943': 1.0, '0811802981': 3.0, '0836210263': 2.0, '0836217691': 2.0, '0836218515': 2.0, '0836218817': 4.0, '0836236688': 1.0, '0890876517': 1.0, '0345337662': 1.0, '0439064864': 2.0, '0439136350': 2.0, '043935806X': 3.0, '0590353403': 2.0, '0836204387': 1.0, '051512463X': 2.0, '0380002930': 1.0, '0440236673': 1.0, '0758201931': 1.0, '0060958022': 1.0, '0316666343': 2.0, '038039586X': 2.0, '0446310786': 3.0, '0553279912': 3.0, '0679751602': 2.0, '0767902521': 2.0, '0373250126': 1.0, '0385722435': 2.0, '0446672211': 2.0, '0804108293': 1.0, '0140011749': 2.0, '006000438X': 2.0, '014028009X': 2.0, '0156007754': 2.0, '0312150601': 2.0, '0345389247': 2.0, '0345433491': 2.0, '037570504X': 1.0, '0375705856': 2.0, '038533334X': 1.0, '0449002985': 1.0, '0449911004': 2.0, '

#### [Problem 4] A/B Testing

In [203]:
#number of products
products = [200, 250]
#number of defects
defects = [20, 10]

In [206]:
from statsmodels.stats.proportion import proportions_ztest

In [208]:
zval, pval = proportions_ztest(defects, products, alternative='larger')

print(f"z-statistic: {zval:.3f}, p-value: {pval:.5f}")

z-statistic: 2.535, p-value: 0.00561


In [210]:
if pval < 0.05:
    print(f"Since {pval:.5f} < 0.05, the reduction is statistically significant.")
else:
    print(f"Since {pval} > 0.05, the reduction is not statistically significant.")

Since 0.00561 < 0.05, the reduction is statistically significant.


#### [Problem 5] Random Forest

In [213]:
up_df = pd.read_csv('uplift-small.csv')
up_df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,6,329.08,1,1,Rural,1,Web,No Offer,No
1,4,241.42,0,1,Rural,1,Multichannel,No Offer,No
2,5,29.99,1,0,Surburban,0,Phone,Discount,No
3,9,112.35,1,0,Rural,0,Web,Discount,No
4,11,219.04,1,1,Surburban,0,Phone,Discount,No


In [215]:
up_df['offer'].value_counts()

offer
Discount    14098
No Offer    14025
Name: count, dtype: int64

In [217]:
from sklearn.model_selection import train_test_split

In [219]:
up_df['offer'] = up_df['offer'].map({'Discount': 1, 'No Offer': 0})
up_df['conversion'] = up_df['conversion'].map({'Yes': 1, 'No': 0})

up_df.head()

Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion
0,6,329.08,1,1,Rural,1,Web,0,0
1,4,241.42,0,1,Rural,1,Multichannel,0,0
2,5,29.99,1,0,Surburban,0,Phone,1,0
3,9,112.35,1,0,Rural,0,Web,1,0
4,11,219.04,1,1,Surburban,0,Phone,1,0


##### [5.1] training and holdout partitions on the data set.

In [222]:
#split training and test data
train, test = train_test_split(up_df, test_size=1/3, random_state=42, stratify=up_df['conversion'])

#not including target and treatment features
features = [col for col in up_df.columns if col not in ['conversion']]

# One-hot encode
X_train = pd.get_dummies(train[features], drop_first=True)
y_train = train['conversion']

##### [5.2] Fit Random Forest

In [225]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [227]:
param_grid = {'n_estimators': [100, 200],'max_depth': [5, 10, None],'min_samples_split': [2, 5]}

rf = RandomForestClassifier(random_state = 42)
grid = GridSearchCV(rf, param_grid, cv=3, scoring='roc_auc', n_jobs=-1)
grid.fit(X_train, y_train)

best_rf = grid.best_estimator_

In [228]:
#predictions on one with the offer (treatment) feature set to Discount
test_discount = test.copy()
test_discount['offer'] = 1
X_test_discount = pd.get_dummies(test_discount[features], drop_first=True)
X_test_discount = X_test_discount.reindex(columns=X_train.columns, fill_value=0)

In [229]:
#predictions on one with the offer feature set to No Offer
test_nooffer = test.copy()
test_nooffer['offer'] = 0
X_test_nooffer = pd.get_dummies(test_nooffer[features], drop_first=True)
X_test_nooffer = X_test_nooffer.reindex(columns=X_train.columns, fill_value=0)

In [230]:
#probabilities
prob_discount = best_rf.predict_proba(X_test_discount)[:, 1]
prob_nooffer = best_rf.predict_proba(X_test_nooffer)[:, 1]

print(prob_discount)
print(prob_nooffer)

[0.14452123 0.14735024 0.15539984 ... 0.15742464 0.14185585 0.13881453]
[0.09864708 0.09121073 0.10443842 ... 0.09390867 0.09298282 0.08643582]


In [231]:
#summary statistics
uplift = prob_discount - prob_nooffer

#create result df
result = test[['offer']].copy()
result['prob_discount'] = prob_discount
result['prob_nooffer'] = prob_nooffer
result['uplift'] = uplift

#prints top uplift customers
top_uplift = result[result['uplift'] > 0.05].sort_values(by='uplift', ascending=False)
print("Top Uplift > 5%:")
print(top_uplift.head())


#Q1, median, and Q3
Q1 = np.percentile(uplift, 25)
median = np.percentile(uplift, 50)
Q3 = np.percentile(uplift, 75)

print(f"Q1 (25th percentile) of uplift: {Q1:.4f}")
print(f"Median (50th percentile) of uplift: {median:.4f}")
print(f"Q3 (75th percentile) of uplift: {Q3:.4f}")

Top Uplift > 5%:
       offer  prob_discount  prob_nooffer    uplift
13368      0       0.265439      0.143293  0.122146
17400      0       0.249674      0.132028  0.117645
13749      0       0.227247      0.111290  0.115957
10139      0       0.232531      0.117442  0.115089
13564      0       0.239822      0.124745  0.115078
Q1 (25th percentile) of uplift: 0.0541
Median (50th percentile) of uplift: 0.0607
Q3 (75th percentile) of uplift: 0.0711
