In [1]:
import numpy as np
import csv
from sklearn.model_selection import train_test_split

def sigmoid(x):
    return 1/(1 + np.exp(-x))

class LogisticRegression():
    def __init__(self, rate=1e-5, epochs=20000):
        self.rate = rate
        self.epochs = epochs
        self.weights = None
        self.bias = 0
            
    def train(self, feats, ys):
        n_samples, n_feats = feats.shape
        self.weights = np.zeros(n_feats)
        self.bias = 0

        for _ in range(self.epochs):
            self.descent(feats, ys, n_samples)

    def descent(self, feats, ys, n_samples):
        # Check the outcome first
        linear_preds = np.dot(feats, self.weights) + self.bias
        # and then make on a scale from 0.0 to 1.0
        predictions = sigmoid(linear_preds)

        d_weights = (1/n_samples) * np.dot(feats.T, (predictions - ys))
        d_bias = (1/n_samples) * np.sum(predictions - ys)

        self.weights = self.weights - d_weights * self.rate
        self.bias = self.bias - d_bias * self.rate

    def predict(self, feats):
        linear_preds = np.dot(feats, self.weights) + self.bias
        predictions = sigmoid(linear_preds)
        class_preds = []

        # This may be an error
        for i in range(len(predictions)): 
            if predictions[i] <= 0.5:
                class_preds.append(0)
            else:
                class_preds.append(1)
                
        return class_preds

def accuracy(tests, preds):
    return np.sum(preds == tests) / len(tests)

In [36]:
def main():
    with open("/home/ln/src/ml/data/water_potability.csv") as file:
        next(file)
        reader = csv.reader(file)
        data_csv = list(reader)
        
        for row in data_csv:
            for i, x in enumerate(row):
                if len(x)< 1:
                    x = row[i] = "0.0"

    data = np.array(data_csv)
    data = data.astype(float)
    print(data)

    feats = data[0, :-1]
    ys = data[:, -1]

    features_train, features_test, ys_train, ys_test = train_test_split(feats, ys, test_size=0.2, random_state=1234)                                                                                                                                                   

    clf = LogisticRegression()
    clf.train(features_train, ys_train)

    ys_pred = clf.predict(features_test)
    print(accuracy(ys_test, ys_pred))

if __name__ == "__main__":
    main()

[[0.00000000e+00 2.04890455e+02 2.07913190e+04 ... 8.69909705e+01
  2.96313538e+00 0.00000000e+00]
 [3.71608008e+00 1.29422921e+02 1.86300579e+04 ... 5.63290763e+01
  4.50065627e+00 0.00000000e+00]
 [8.09912419e+00 2.24236259e+02 1.99095417e+04 ... 6.64200925e+01
  3.05593375e+00 0.00000000e+00]
 ...
 [9.41951032e+00 1.75762646e+02 3.31555782e+04 ... 6.98454003e+01
  3.29887550e+00 1.00000000e+00]
 [5.12676292e+00 2.30603758e+02 1.19838694e+04 ... 7.74882131e+01
  4.70865847e+00 1.00000000e+00]
 [7.87467136e+00 1.95102299e+02 1.74041771e+04 ... 7.86984463e+01
  2.30914906e+00 1.00000000e+00]]


ValueError: Found input variables with inconsistent numbers of samples: [9, 3276]