In [1]:
%matplotlib inline

import math
import subprocess
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt

from collections import defaultdict
from scipy.stats.stats import pearsonr


In [2]:
def tra2rel(fileinput, fileoutput, delimiter=',', has_header=True):
    data = open(fileinput, 'r')
    if has_header:
        data.readline()
    baskets = defaultdict(list)

    for row in data:
        basket_id = row.replace('\r\n', '').split(delimiter)[0]
        item_id = row.replace('\r\n', '').split(delimiter)[1]
        baskets[basket_id].append(item_id)

    data.close()

    out = open(fileoutput, 'w')
    for k, v in baskets.iteritems():
        s = '%s' % k
        for item in v:
            s += ',%s' % item
        out.write('%s\n' % s)
    out.close()
    
    return baskets

In [3]:
def call_apriori(fileinput, fileoutput, delimiter=',', target_type='s', 
                 min_nbr_items=1, min_sup=2, min_conf=2):
    # apriori
    # -t# {m: maximal, c: closed, s: frequent, r: association rules}
    # -m# minimum number of items per item set/association rule
    # -s# minimum support of an item set, positive: percentage, negative: absolute
    # -c# minimum confidence rule percentage
    # -b# line delimiter (,)
    # The default additional information output format for rules is " (%X, %C)"
    # %X relative body set support as a percentage
    # %C rule confidence as a percentage
    # %L lift

    if target_type == 'r':
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, '-m%s' % min_nbr_items, 
                    '-s%s' % min_sup, '-c%s' % min_conf, '-v (%X, %C, %L)', 
                    fileinput, fileoutput]
    else:
        call_cmd = ['./apriori', '-b%s' % delimiter, '-t%s' % target_type, 
                           '-m%s' % min_nbr_items, '-s%s' % min_sup, fileinput, fileoutput]

    ret = subprocess.call(call_cmd,  stdout=open('apriori_stdout.txt', 'w'), 
                          stderr=open('apriori_stderr.txt', 'w'))
    return ret

In [4]:
def read_rules(filename):
    data = open(filename, 'r')
    rules = list()
    for row in data:
        fileds = row.rstrip('\n\r').split(' <- ')
        cons = fileds[0]
        other = fileds[1].split(' (')
        ant = other[0].split(' ')
        other2 = other[1].split(', ')
        sup = float(other2[0])
        conf = float(other2[1])
        lift = float(other2[2].replace(')', ''))
        rule = {
            'ant': ant,
            'cons': cons,
            'sup': sup,
            'conf': conf,
            'lift': lift
        }
        rules.append(rule)
    data.close()
    return rules

In [5]:
# Frequent Pattern Mining on Titanic Dataset

In [6]:
df = pd.read_csv("training.csv")

In [7]:
def clean_data(df):
    
    ages = sorted(df['VehicleAge'].unique())
    ages_mapping = dict(zip(ages, range(0, len(ages) + 1)))
    df['VehicleAge_Val'] = df['VehicleAge'].map(ages_mapping).astype(int)
    
    aucguarts = sorted(df['AUCGUART'].unique())
    aucguart_mapping = dict(zip(aucguarts, range(0, len(aucguarts) + 1)))
    df['AUCGUART_Val'] = df['AUCGUART'].map(aucguart_mapping).astype(int)
    
    df.replace({'VNZIP1' : { np.nan : 48796}}, inplace=True)
    df.replace({'VehBCost' : { np.nan : 6802}}, inplace=True)
    df.replace({'WarrantyCost' : { np.nan : 1297}}, inplace=True)
    
    df.dtypes[df.dtypes.map(lambda x: x == 'object')]
    
    df_train = df.drop(['PurchDate','Auction','Make','Model','Trim','SubModel','Color','Transmission','WheelType','Nationality','Size','TopThreeAmericanName','PRIMEUNIT','AUCGUART','VNST'], axis=1)
    df_train = df_train.drop(['RefId','VehYear','WheelTypeID','MMRAcquisitionAuctionAveragePrice','MMRAcquisitionAuctionCleanPrice','MMRAcquisitionRetailAveragePrice','MMRAcquisitonRetailCleanPrice','MMRCurrentAuctionAveragePrice','MMRCurrentAuctionCleanPrice','MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice','BYRNO','IsOnlineSale'], axis=1)

    
    return df_train

In [30]:
df2 = clean_data(df)

In [31]:
df2.head()

Unnamed: 0,IsBadBuy,VehicleAge,VehOdo,VNZIP1,VehBCost,WarrantyCost,VehicleAge_Val,AUCGUART_Val
0,0,3,89046,33619.0,7100.0,1113.0,2,0
1,0,5,93593,33619.0,7600.0,1053.0,4,0
2,0,4,73807,33619.0,4900.0,1389.0,3,0
3,0,5,65617,33619.0,4100.0,630.0,4,0
4,0,4,69367,33619.0,4000.0,1020.0,3,0


In [32]:
df2['VehOdoGroup'] = pd.cut(df2['VehOdo'], bins=range(0, 100005, 10000), right=False, labels=range(0, 100000, 10000))

In [33]:
df2.head()

Unnamed: 0,IsBadBuy,VehicleAge,VehOdo,VNZIP1,VehBCost,WarrantyCost,VehicleAge_Val,AUCGUART_Val,VehOdoGroup
0,0,3,89046,33619.0,7100.0,1113.0,2,0,80000
1,0,5,93593,33619.0,7600.0,1053.0,4,0,90000
2,0,4,73807,33619.0,4900.0,1389.0,3,0,70000
3,0,5,65617,33619.0,4100.0,630.0,4,0,60000
4,0,4,69367,33619.0,4000.0,1020.0,3,0,60000


In [34]:
df2['WarrantyCost'] = pd.cut(df2['WarrantyCost'], bins=range(0, 3005, 500), right=False, labels=range(0, 3000, 500))

In [35]:
df2.drop(['VehOdo', 'WarrantyCost', 'VehicleAge_Val'], axis=1, inplace=True)
df2.head()

Unnamed: 0,IsBadBuy,VehicleAge,VNZIP1,VehBCost,AUCGUART_Val,VehOdoGroup
0,0,3,33619.0,7100.0,0,80000
1,0,5,33619.0,7600.0,0,90000
2,0,4,33619.0,4900.0,0,70000
3,0,5,33619.0,4100.0,0,60000
4,0,4,33619.0,4000.0,0,60000


In [36]:
df3 = df2
df3['IsBadBuy'] = df2['IsBadBuy'].astype(str) + '_IBD'
df3['VehicleAge'] = df2['VehicleAge'].astype(str) + '_VA'
df3['VNZIP1'] = df2['VNZIP1'].astype(str) + '_ZIP'
df3['VehBCost'] = df2['VehBCost'].astype(str) + '_VBC'
df3['AUCGUART_Val'] = df2['AUCGUART_Val'].astype(str) + '_AUC'
df3['VehOdoGroup'] = df2['VehOdoGroup'].astype(str) + '_ODO'

In [37]:
df3.head()

Unnamed: 0,IsBadBuy,VehicleAge,VNZIP1,VehBCost,AUCGUART_Val,VehOdoGroup
0,0_IBD,3_VA,33619.0_ZIP,7100.0_VBC,0_AUC,80000.0_ODO
1,0_IBD,5_VA,33619.0_ZIP,7600.0_VBC,0_AUC,90000.0_ODO
2,0_IBD,4_VA,33619.0_ZIP,4900.0_VBC,0_AUC,70000.0_ODO
3,0_IBD,5_VA,33619.0_ZIP,4100.0_VBC,0_AUC,60000.0_ODO
4,0_IBD,4_VA,33619.0_ZIP,4000.0_VBC,0_AUC,60000.0_ODO


In [38]:
df2.to_csv('for_pattern.csv', header=False)

In [39]:
delimiter=','
target_type='s'
min_nbr_items=2
min_sup=2
min_conf=2

ret_val = call_apriori('for_pattern.csv', 'patterns.txt', 
                       delimiter, target_type, min_nbr_items, min_sup, min_conf)

In [40]:
delimiter=','
target_type='r'
min_nbr_items=2
min_sup=2
min_conf=25

ret_val = call_apriori('for_pattern.csv', 'rules.txt', delimiter, target_type, 
                       min_nbr_items, min_sup, min_conf)

In [41]:
rules = read_rules('rules.txt')
for r in rules[:100]:
    print r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf']

['60445.0_ZIP'] --> 1_IBD  lift 213.951  conf 26.2712
['60445.0_ZIP'] --> 3_VA  lift 129.032  conf 28.8136
['60445.0_ZIP'] --> 4_VA  lift 119.334  conf 27.9661
['60445.0_ZIP'] --> 80000.0_ODO  lift 111.767  conf 27.1186
['60445.0_ZIP'] --> 0_IBD  lift 84.0493  conf 73.7288
['60445.0_ZIP'] --> 0_AUC  lift 87.5967  conf 81.3559
['17406.0_ZIP', '0_IBD'] --> 60000.0_ODO  lift 122.597  conf 25.2101
['17406.0_ZIP', '0_IBD'] --> 70000.0_ODO  lift 127.472  conf 35.2941
['17406.0_ZIP', '0_AUC'] --> 70000.0_ODO  lift 126.974  conf 35.1562
['17406.0_ZIP'] --> 70000.0_ODO  lift 129.917  conf 35.9712
['17406.0_ZIP', '0_IBD'] --> 0_AUC  lift 97.7182  conf 90.7563
['17406.0_ZIP', '0_AUC'] --> 0_IBD  lift 96.1857  conf 84.375
['17406.0_ZIP'] --> 0_IBD  lift 97.5953  conf 85.6115
['17406.0_ZIP'] --> 0_AUC  lift 99.1502  conf 92.0863
['60440.0_ZIP', '0_IBD', '0_AUC'] --> 3_VA  lift 158.052  conf 35.2941
['60440.0_ZIP', '0_IBD'] --> 3_VA  lift 149.272  conf 33.3333
['60440.0_ZIP', '0_AUC'] --> 3_VA  lift

In [42]:
rulse_cons_S = list()
for r in rules:
    if r['cons'].endswith('_S'):
        rulse_cons_S.append(r)

In [43]:
print len(rulse_cons_S)

0


In [44]:
sorted_rules_cons_S = sorted(rulse_cons_S, key=lambda r: r['conf'], reverse=True)

In [45]:
for r in sorted_rules_cons_S[:10]:
    print r['ant'], '-->', r['cons'], ' lift', r['lift'], ' conf', r['conf']

In [46]:
df3.values[0]

array(['0_IBD', '3_VA', '33619.0_ZIP', '7100.0_VBC', '0_AUC', '80000.0_ODO'], dtype=object)

In [47]:
passenger_test = df3.values[0]

In [48]:
for r in rules:
    if set(r['ant']) < set(passenger_test):
        print r['ant'], '-->', r['cons']

['33619.0_ZIP', '80000.0_ODO', '0_IBD', '0_AUC'] --> 5_VA
['33619.0_ZIP', '80000.0_ODO', '0_IBD'] --> 5_VA
['33619.0_ZIP', '80000.0_ODO', '0_AUC'] --> 5_VA
['33619.0_ZIP', '80000.0_ODO'] --> 5_VA
['33619.0_ZIP', '3_VA', '0_AUC'] --> 0_IBD
['33619.0_ZIP', '3_VA'] --> 0_IBD
['33619.0_ZIP', '3_VA'] --> 0_AUC
['33619.0_ZIP', '80000.0_ODO', '0_IBD'] --> 0_AUC
['33619.0_ZIP', '80000.0_ODO', '0_AUC'] --> 0_IBD
['33619.0_ZIP', '80000.0_ODO'] --> 0_IBD
['33619.0_ZIP', '80000.0_ODO'] --> 0_AUC
['33619.0_ZIP', '0_IBD', '0_AUC'] --> 70000.0_ODO
['33619.0_ZIP', '0_IBD'] --> 70000.0_ODO
['33619.0_ZIP', '0_AUC'] --> 70000.0_ODO
['33619.0_ZIP'] --> 70000.0_ODO
['33619.0_ZIP', '0_IBD'] --> 0_AUC
['33619.0_ZIP', '0_AUC'] --> 0_IBD
['33619.0_ZIP'] --> 0_IBD
['33619.0_ZIP'] --> 0_AUC
['3_VA', '80000.0_ODO', '0_IBD'] --> 0_AUC
['3_VA', '80000.0_ODO', '0_AUC'] --> 0_IBD
['3_VA', '80000.0_ODO'] --> 0_IBD
['3_VA', '80000.0_ODO'] --> 0_AUC
['3_VA', '0_IBD', '0_AUC'] --> 70000.0_ODO
['3_VA', '0_IBD'] --> 70000.