In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff

In [2]:
data = arff.loadarff('datasets/risk-train.arff')
df = pd.DataFrame(data[0])
del df['Z_METHODE']
del df['Z_CARD_ART']
del df['WEEKDAY_ORDER']

In [3]:
def decision_classification(weight: np.array, value: np.array):
    answer = weight*value
    activation = np.sum(answer)
    
    return 1 if (activation > 0) else -1

In [4]:
def string_to_int(dataset: pd.DataFrame):
    df.CLASS = df.CLASS.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.B_EMAIL = df.B_EMAIL.replace({b'yes':1.0, b'no': -1.0, np.nan:0,b'?': 0.0})
    df.B_TELEFON = df.B_TELEFON.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FLAG_LRIDENTISCH = df.FLAG_LRIDENTISCH.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FLAG_NEWSLETTER = df.FLAG_NEWSLETTER.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.Z_CARD_VALID = df.Z_CARD_VALID.replace({np.nan: 0.0, b'?':0.0})
    df.Z_LAST_NAME = df.Z_LAST_NAME.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.VALUE_ORDER = df.VALUE_ORDER.replace({np.nan:0, b'?': 0.0})
    df.AMOUNT_ORDER = df.AMOUNT_ORDER.replace({np.nan:0, b'?': 0.0})
    df.CHK_LADR = df.CHK_LADR.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.CHK_RADR = df.CHK_RADR.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.CHK_KTO = df.CHK_KTO.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.CHK_CARD = df.CHK_CARD.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.CHK_COOKIE = df.CHK_COOKIE.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.CHK_IP = df.CHK_IP.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_LPLZ = df.FAIL_LPLZ.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_LORT = df.FAIL_LORT.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_LPLZORTMATCH = df.FAIL_LPLZORTMATCH.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_RPLZ = df.FAIL_RPLZ.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_RORT = df.FAIL_RORT.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.FAIL_RPLZORTMATCH = df.FAIL_RPLZORTMATCH.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.SESSION_TIME = df.SESSION_TIME.replace({np.nan:0, b'?': 0.0})
    df.NEUKUNDE = df.NEUKUNDE.replace({b'yes':1.0, b'no': -1.0, np.nan:0, b'?': 0.0})
    df.AMOUNT_ORDER_PRE = df.AMOUNT_ORDER_PRE.replace({np.nan:0, b'?': 0.0})
    df.VALUE_ORDER_PRE = df.VALUE_ORDER_PRE.replace({np.nan: 0.0, b'?': 0.0})
    df.MAHN_AKT = df.MAHN_AKT.replace({np.nan:0, b'?': 0.0})
    df.MAHN_HOECHST = df.MAHN_HOECHST.replace({np.nan:0, b'?': 0.0})

In [9]:
def perceptron(df:pd.DataFrame, learningRate: float, changing:bool, percentage: float):
    # preprocessing the data
    string_to_int(df)

    # intializing a numpy array full of 0's
    w = np.zeros((1,df.shape[1]-1))
    learning_rate = learningRate
    num_correct = 0
    num_wrong = 0

    # splitting the data into train and test
    train, test = np.split(df, [int(len(df)*percentage)])

    # creating a new file to save the results of the algorithm in a csv format
    output = open("Task6_results.csv", "w")
    output.write("actual, predicted\n")

    # looping through the data in train set and getting the weight vector
    for item in train.index:
        v = np.array([df.loc[item][1:]])

        # implemented changing learning rate
        if (changing):
            if (decision_classification(w,v) != df.loc[item]['CLASS']):
                learning_rate = learning_rate - 0.05
                w = w + learning_rate*df.loc[item]['CLASS']*v
            else:
                learning_rate = learning_rate + 0.05
        # for fixed learning rate
        else:
            if (decision_classification(w,v) != df.loc[item]['CLASS']):
                w = w + learning_rate*df.loc[item]['CLASS']*v

    # testing the weight vector with test data to get accuracy of the model.
    for item in test.index:
        actual = decision_classification(w,v)
        predicted = df.loc[item]['CLASS']
        if (actual != predicted):
            num_wrong += 1
        else:
            num_correct += 1

        output.write(str(actual) + "," + str(predicted) + "\n")

    output.close()
    
    return num_correct, num_wrong

In [10]:
correct, wrong = perceptron(df, 0.25, True, 0.7)

In [11]:
print(correct, wrong)

8489 511


In [12]:
print(correct/(correct+wrong)*100)

94.32222222222222
