# Perceptron

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

## Cleaning + Preparing Data

In [2]:
filename = r'dir\risk-train.txt'

def replace(path):
    return path.replace('\\', '/')

df = pd.read_csv(replace(filename), sep = '\t')

In [3]:
def freq(column):
    '''Count frequency of each column'''
    count = {}
    for row in column:
        if row in count: count[row] += 1
        else: count[row] = 1
    return count

In [4]:
def trimming(df, threshold):
    '''Only keep column that has <= number of unique values'''

    if type(threshold) != int:
        raise TypeError('Threshold must be an integer')
    
    _max = np.max([len(df[column].unique()) for column in df.columns])
    if threshold > _max:
        raise ValueError('Number of threshold has exceeded the true value: %d' %(_max))
    
    #extract all the column names
    columns = [column for column in df.columns]
    column_to_drop = []

    #find out which one to be trimmed
    for column in columns:
        if len(df[column].unique()) > threshold:
            column_to_drop.append(column)
    #new_df
    new_df = df.drop(column_to_drop, axis = 1)
    return new_df

In [5]:
def freq_of_each_column(df):
    '''Count the frequency of value of each column'''
    
    columns = [column for column in df.columns]
    len_row = len(df)
    count = {}
    
    for column in columns:
        # Nested dictionary for frequency
        # {column: {pattern: count}, etc.}
        count[column] = {} 
        
        for row in range(len_row):
            value = df[column][row]
            if value in count[column]: count[column][value] += 1
            else: count[column][value] = 1
    
    sorted_result = [sorted(value.items(), key = lambda x: x[1], reverse = True) for value in count.values()]
    return sorted_result

In [6]:
def substitute(df):
    '''Unknown Value = Value that is closest to Mean of all counts of dataframe[column]'''
    
    for column in df.columns:
        if '?' in set(df[column].unique()):
            # Observe that if ? exists in a column, it will have the highest count
            # ==> freq(column) = {?: count, a: count, b: count, etc.}
            # ==> consider only from values starting from 'a': [count a, count b, etc.] 
            mean = np.array(list(freq(df[column]).values())).mean()
            values = list(freq(df[column]).values())[1:]
            distance = [(x - mean, x) for x in values]
            track = sorted(distance, reverse = True)[0][1]
            
            for k, v in freq(df[column]).items():
                if track == v:
                    df[column].replace('?', k, inplace = True)
    return df

In [7]:
def convert(new_df):
    
    '''
    If unique_value = {yes, no} ==> Yes = 1, No = 0
    Else: unique_value = [a, b, c, d, etc.] ==> {a: 1, b: 2, etc.}
    '''
    
    len_row = len(new_df)
    
    for column in new_df.columns:
        if set(new_df[column].unique()) == {'yes', 'no'}:
            for row in range(len_row):
                if new_df[column][row] == 'yes': new_df[column][row] = float(1)
                elif new_df[column][row] == 'no': new_df[column][row] = float(0)
    
        else:
            items = list(new_df[column].unique())
            orders = [i for i in range(1, len(items) + 1)]
            values = dict(zip(items, orders))
            
            for row in range(len_row):
                key = new_df[column][row]
                new_df[column][row] = float(values.get(key))
    
    return new_df

In [8]:
def checking_freq(dataframe):
    '''Checking the frequency of all columns'''
    return list(
        zip(
            [column for column in dataframe.columns], 
            dataframe.shape[1] * ["-->"], 
            freq_of_each_column(dataframe)))

In [9]:
def unique_value(df):
    '''Output a set of unique value of each column'''
    for column in df.columns:
        print('%s --> %s' % (column, str(set(df[column].unique()))))

## Perform checking and cleaning data

In [10]:
df.shape

(30000, 44)

In [11]:
df.head(2)

Unnamed: 0,ORDER_ID,CLASS,B_EMAIL,B_TELEFON,B_BIRTHDATE,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_CARD_VALID,...,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,SESSION_TIME,NEUKUNDE,AMOUNT_ORDER_PRE,VALUE_ORDER_PRE,DATE_LORDER,MAHN_AKT,MAHN_HOECHST
0,49917,no,yes,no,1/17/1973,yes,yes,check,?,5.2006,...,no,no,no,8,yes,0,0.0,?,?,?
1,49919,no,yes,yes,12/8/1970,no,no,credit_card,Visa,12.2007,...,yes,no,no,13,yes,0,0.0,?,?,?


In [12]:
df.columns

Index(['ORDER_ID', 'CLASS', 'B_EMAIL', 'B_TELEFON', 'B_BIRTHDATE',
       'FLAG_LRIDENTISCH', 'FLAG_NEWSLETTER', 'Z_METHODE', 'Z_CARD_ART',
       'Z_CARD_VALID', 'Z_LAST_NAME', 'VALUE_ORDER', 'WEEKDAY_ORDER',
       'TIME_ORDER', 'AMOUNT_ORDER', 'ANUMMER_01', 'ANUMMER_02', 'ANUMMER_03',
       'ANUMMER_04', 'ANUMMER_05', 'ANUMMER_06', 'ANUMMER_07', 'ANUMMER_08',
       'ANUMMER_09', 'ANUMMER_10', 'CHK_LADR', 'CHK_RADR', 'CHK_KTO',
       'CHK_CARD', 'CHK_COOKIE', 'CHK_IP', 'FAIL_LPLZ', 'FAIL_LORT',
       'FAIL_LPLZORTMATCH', 'FAIL_RPLZ', 'FAIL_RORT', 'FAIL_RPLZORTMATCH',
       'SESSION_TIME', 'NEUKUNDE', 'AMOUNT_ORDER_PRE', 'VALUE_ORDER_PRE',
       'DATE_LORDER', 'MAHN_AKT', 'MAHN_HOECHST'],
      dtype='object')

In [13]:
print('Number of unique values of:\n')
for column in df.columns:
    print('{%s} ---> {%d}' % (column, len(df[column].unique())))

Number of unique values of:

{ORDER_ID} ---> {30000}
{CLASS} ---> {2}
{B_EMAIL} ---> {2}
{B_TELEFON} ---> {2}
{B_BIRTHDATE} ---> {11143}
{FLAG_LRIDENTISCH} ---> {2}
{FLAG_NEWSLETTER} ---> {2}
{Z_METHODE} ---> {4}
{Z_CARD_ART} ---> {5}
{Z_CARD_VALID} ---> {36}
{Z_LAST_NAME} ---> {3}
{VALUE_ORDER} ---> {2341}
{WEEKDAY_ORDER} ---> {7}
{TIME_ORDER} ---> {1440}
{AMOUNT_ORDER} ---> {9}
{ANUMMER_01} ---> {560}
{ANUMMER_02} ---> {561}
{ANUMMER_03} ---> {561}
{ANUMMER_04} ---> {499}
{ANUMMER_05} ---> {348}
{ANUMMER_06} ---> {179}
{ANUMMER_07} ---> {90}
{ANUMMER_08} ---> {34}
{ANUMMER_09} ---> {8}
{ANUMMER_10} ---> {1}
{CHK_LADR} ---> {2}
{CHK_RADR} ---> {2}
{CHK_KTO} ---> {2}
{CHK_CARD} ---> {2}
{CHK_COOKIE} ---> {2}
{CHK_IP} ---> {2}
{FAIL_LPLZ} ---> {2}
{FAIL_LORT} ---> {2}
{FAIL_LPLZORTMATCH} ---> {2}
{FAIL_RPLZ} ---> {2}
{FAIL_RORT} ---> {2}
{FAIL_RPLZORTMATCH} ---> {2}
{SESSION_TIME} ---> {24}
{NEUKUNDE} ---> {2}
{AMOUNT_ORDER_PRE} ---> {7}
{VALUE_ORDER_PRE} ---> {8232}
{DATE_LORDER} ---> 

In [14]:
new_df = trimming(df, threshold = 6)
new_df.head(2)

Unnamed: 0,CLASS,B_EMAIL,B_TELEFON,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_LAST_NAME,ANUMMER_10,CHK_LADR,...,CHK_IP,FAIL_LPLZ,FAIL_LORT,FAIL_LPLZORTMATCH,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,NEUKUNDE,MAHN_AKT,MAHN_HOECHST
0,no,yes,no,yes,yes,check,?,?,?,no,...,no,no,no,no,no,no,no,yes,?,?
1,no,yes,yes,no,no,credit_card,Visa,yes,?,no,...,no,no,no,no,yes,no,no,yes,?,?


In [15]:
unique_value(new_df)

CLASS --> {'no', 'yes'}
B_EMAIL --> {'no', 'yes'}
B_TELEFON --> {'no', 'yes'}
FLAG_LRIDENTISCH --> {'no', 'yes'}
FLAG_NEWSLETTER --> {'no', 'yes'}
Z_METHODE --> {'debit_card', 'debit_note', 'credit_card', 'check'}
Z_CARD_ART --> {'Amex', 'debit_card', 'Visa', '?', 'Eurocard'}
Z_LAST_NAME --> {'no', 'yes', '?'}
ANUMMER_10 --> {'?'}
CHK_LADR --> {'no', 'yes'}
CHK_RADR --> {'no', 'yes'}
CHK_KTO --> {'no', 'yes'}
CHK_CARD --> {'no', 'yes'}
CHK_COOKIE --> {'no', 'yes'}
CHK_IP --> {'no', 'yes'}
FAIL_LPLZ --> {'no', 'yes'}
FAIL_LORT --> {'no', 'yes'}
FAIL_LPLZORTMATCH --> {'no', 'yes'}
FAIL_RPLZ --> {'no', 'yes'}
FAIL_RORT --> {'no', 'yes'}
FAIL_RPLZORTMATCH --> {'no', 'yes'}
NEUKUNDE --> {'no', 'yes'}
MAHN_AKT --> {'2', '0', '1', '3', '?'}
MAHN_HOECHST --> {'2', '0', '1', '3', '?'}


**Notice** Column *ANUMMER_10* contains only unknown value ==> Unncessary to keep. 

In [16]:
new_df = new_df.drop('ANUMMER_10', axis = 1)

In [17]:
# Replace unknown values

update_df = substitute(new_df)

In [18]:
update_df.head(2)

Unnamed: 0,CLASS,B_EMAIL,B_TELEFON,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_LAST_NAME,CHK_LADR,CHK_RADR,...,CHK_IP,FAIL_LPLZ,FAIL_LORT,FAIL_LPLZORTMATCH,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,NEUKUNDE,MAHN_AKT,MAHN_HOECHST
0,no,yes,no,yes,yes,check,Eurocard,yes,no,no,...,no,no,no,no,no,no,no,yes,0,0
1,no,yes,yes,no,no,credit_card,Visa,yes,no,no,...,no,no,no,no,yes,no,no,yes,0,0


**Notice** Z_CARD_ART[0]: ? - value has been replaced by "Eurocard"

In [19]:
update_df.shape

(30000, 23)

In [20]:
converted_df = convert(update_df)
converted_df.head(2)

Unnamed: 0,CLASS,B_EMAIL,B_TELEFON,FLAG_LRIDENTISCH,FLAG_NEWSLETTER,Z_METHODE,Z_CARD_ART,Z_LAST_NAME,CHK_LADR,CHK_RADR,...,CHK_IP,FAIL_LPLZ,FAIL_LORT,FAIL_LPLZORTMATCH,FAIL_RPLZ,FAIL_RORT,FAIL_RPLZORTMATCH,NEUKUNDE,MAHN_AKT,MAHN_HOECHST
0,0,1,0,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,1,1,1
1,0,1,1,0,0,2,2,1,0,0,...,0,0,0,0,1,0,0,1,1,1


In [21]:
# Check whether unknown value is replaced
unique_value(converted_df)

CLASS --> {0.0, 1.0}
B_EMAIL --> {0.0, 1.0}
B_TELEFON --> {0.0, 1.0}
FLAG_LRIDENTISCH --> {0.0, 1.0}
FLAG_NEWSLETTER --> {0.0, 1.0}
Z_METHODE --> {1.0, 2.0, 3.0, 4.0}
Z_CARD_ART --> {1.0, 2.0, 3.0, 4.0}
Z_LAST_NAME --> {0.0, 1.0}
CHK_LADR --> {0.0, 1.0}
CHK_RADR --> {0.0, 1.0}
CHK_KTO --> {0.0, 1.0}
CHK_CARD --> {0.0, 1.0}
CHK_COOKIE --> {0.0, 1.0}
CHK_IP --> {0.0, 1.0}
FAIL_LPLZ --> {0.0, 1.0}
FAIL_LORT --> {0.0, 1.0}
FAIL_LPLZORTMATCH --> {0.0, 1.0}
FAIL_RPLZ --> {0.0, 1.0}
FAIL_RORT --> {0.0, 1.0}
FAIL_RPLZORTMATCH --> {0.0, 1.0}
NEUKUNDE --> {0.0, 1.0}
MAHN_AKT --> {1.0, 2.0, 3.0, 4.0}
MAHN_HOECHST --> {1.0, 2.0, 3.0, 4.0}


In [22]:
checking_freq(converted_df)

[('CLASS', '-->', [(0.0, 28254), (1.0, 1746)]),
 ('B_EMAIL', '-->', [(1.0, 23963), (0.0, 6037)]),
 ('B_TELEFON', '-->', [(0.0, 25493), (1.0, 4507)]),
 ('FLAG_LRIDENTISCH', '-->', [(1.0, 21157), (0.0, 8843)]),
 ('FLAG_NEWSLETTER', '-->', [(0.0, 28772), (1.0, 1228)]),
 ('Z_METHODE', '-->', [(1.0, 14808), (2.0, 9796), (3.0, 3846), (4.0, 1550)]),
 ('Z_CARD_ART', '-->', [(1.0, 23750), (2.0, 3927), (3.0, 1550), (4.0, 773)]),
 ('Z_LAST_NAME', '-->', [(1.0, 29245), (0.0, 755)]),
 ('CHK_LADR', '-->', [(0.0, 28865), (1.0, 1135)]),
 ('CHK_RADR', '-->', [(0.0, 29889), (1.0, 111)]),
 ('CHK_KTO', '-->', [(0.0, 29912), (1.0, 88)]),
 ('CHK_CARD', '-->', [(0.0, 29893), (1.0, 107)]),
 ('CHK_COOKIE', '-->', [(0.0, 29905), (1.0, 95)]),
 ('CHK_IP', '-->', [(0.0, 29879), (1.0, 121)]),
 ('FAIL_LPLZ', '-->', [(0.0, 29835), (1.0, 165)]),
 ('FAIL_LORT', '-->', [(0.0, 29849), (1.0, 151)]),
 ('FAIL_LPLZORTMATCH', '-->', [(0.0, 29772), (1.0, 228)]),
 ('FAIL_RPLZ', '-->', [(0.0, 29674), (1.0, 326)]),
 ('FAIL_RORT',

In [23]:
converted_df.shape

(30000, 23)

## Machine Learning

In [24]:
value_df = converted_df.values
np.random.seed(10)
np.random.shuffle(value_df)

In [25]:
# Rule of thumb: train-test: 80-20 or 70-30

train = value_df[0:int(0.8*len(value_df))]
test = value_df[int(0.2*len(value_df)):int(len(value_df))]

In [26]:
# test if having selected the correct column
train[:, 0]

array([0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0], dtype=object)

In [27]:
# Test with y = Class, x = all the remained columns

x_train = train[:, 1:(value_df.shape[1] - 1)]
y_train = train[:, 0]

x_test = test[:, 1:(value_df.shape[1] - 1)]
y_test = test[:, 0]

In [28]:
def perceptron_train(x, y, z, eta, t, changing_rate = False):
    '''
    Input Parameters:
        x: data set of input features
        y: actual outputs
        z: activation function threshold
        eta: learning rate
        t: number of iterations
    '''
    
    # initializing the weights
    w = np.zeros(len(x[0]))      
    n = 0

    yhat_vec = np.ones(len(y))     # vector for predictions
    
    c1 = 1
    c2 = 2
    
    while n < t: 
        for i in range(0, len(x)): 
            f = np.dot(x[i], w)    # dot product 
            
            # prediction function 
            if f >= z:                               
                yhat = 1.                               
            else:                                   
                yhat = 0.
            yhat_vec[i] = yhat

            # updating the weights
            for j in range(0, len(w)):             
                w[j] = w[j] + eta*(y[i]-yhat)*x[i][j]
        
        if changing_rate != False:
          eta = c1/(n + c2)  
        
        n += 1
    return w

z = 0.0
eta = 0.1
t = 50

In [29]:
w = perceptron_train(x_train, y_train, z, eta, t, changing_rate = False)

def perceptron_test(x, w, z, eta, t):
    y_pred = []
    for i in range(0, len(x)):
        f = np.dot(x[i], w)   #dot product
        
        # prediction function
        if f > z:                               
            yhat = 1                               
        else:                                   
            yhat = 0
        y_pred.append(yhat)
    return y_pred

y_pred = perceptron_test(x_test, w, z, eta, t)

In [30]:
# Check the accuracy of self-made Perceptron

print(accuracy_score(list(y_test), y_pred))

0.9415


In [31]:
# Check with the accuracy of API Perceptron

clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, fit_intercept=False)
clf.fit(list(x_train), list(y_train))
y_predict = clf.predict(list(x_test))
clf.score(list(x_train), list(y_train))



0.9417916666666667

In [32]:
# Accuracy with changing rate

w = perceptron_train(x_train, y_train, z, eta, t, changing_rate = True)
y_pred = perceptron_test(x_test, w, z, eta, t)
print(accuracy_score(list(y_test), y_pred))

0.9414583333333333
