In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import max as max_, min as min_, mean as mean_

# Create the Spark entry points used by the rest of the notebook, both bound
# to the "local" master: `sc` for the RDD API, `spark` for the DataFrame API.
sc = SparkContext("local")
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [2]:
# Load the whitespace-separated spam data set: every non-blank line of the
# text file becomes one DataFrame row of floats (columns are auto-named _1, _2, ...).
# NOTE(review): column `_57` is dropped here and the last remaining column
# (`_58`) is used as the label further down — confirm that discarding `_57`
# is intentional.
spam_dataset = (
    sc.textFile("./dataset/spam.data.txt")
    # Skip blank lines (e.g. a trailing empty line) instead of failing on float('').
    .filter(lambda line: line.strip())
    # split() with no argument tolerates repeated or trailing whitespace,
    # whereas split(' ') would emit empty tokens that float() rejects.
    .map(lambda line: [float(val) for val in line.split()])
    .toDF()
    .drop("_57")
)

In [3]:
# Before normalization: summary statistics of the first three columns.
(
    spam_dataset
    .select(*spam_dataset.columns[:3])
    .describe()
    .show()
)

+-------+-------------------+-------------------+------------------+
|summary|                 _1|                 _2|                _3|
+-------+-------------------+-------------------+------------------+
|  count|               4601|               4601|              4601|
|   mean|0.10455335796565962|0.21301456205172783|0.2806563790480323|
| stddev| 0.3053575620234701| 1.2905751909453216|0.5041428838471845|
|    min|                0.0|                0.0|               0.0|
|    max|               4.54|              14.28|               5.1|
+-------+-------------------+-------------------+------------------+



In [4]:
# Min-Max Normalization
# Rescale every column except the last one to the [0, 1] range; the last
# column is passed through unchanged.
feature_cols = spam_dataset.columns[:-1]

if feature_cols:
    # Fetch the min and max of every feature in ONE aggregation job instead of
    # launching a separate Spark job per column.
    bound_exprs = (
        [min_(name).alias("min_" + name) for name in feature_cols]
        + [max_(name).alias("max_" + name) for name in feature_cols]
    )
    bounds = spam_dataset.select(*bound_exprs).collect()[0]

    projected = []
    for name in feature_cols:
        low, high = bounds["min_" + name], bounds["max_" + name]
        column = spam_dataset[name]
        # `low` is None when the column holds no values at all; leave it as is.
        if low is not None:
            span = high - low
            # A constant column has a zero range: divide by 1 so it maps to 0.0
            # rather than producing nulls (or an error) from a division by zero.
            column = ((column - low) / (span if span != 0 else 1.0)).alias(name)
        projected.append(column)

    # Apply all rescalings in a single projection; chaining withColumn() once
    # per feature would stack one projection per column in the query plan.
    projected.append(spam_dataset[spam_dataset.columns[-1]])
    spam_dataset = spam_dataset.select(*projected)

# After normalization
spam_dataset.select(spam_dataset.columns[:3]).describe().show()

+-------+--------------------+--------------------+--------------------+
|summary|                  _1|                  _2|                  _3|
+-------+--------------------+--------------------+--------------------+
|  count|                4601|                4601|                4601|
|   mean|0.023029374001246664|0.014916986138076233|0.055030662558437915|
| stddev| 0.06725937489503736| 0.09037641393174517| 0.09885154585238899|
|    min|                 0.0|                 0.0|                 0.0|
|    max|                 1.0|                 1.0|                 1.0|
+-------+--------------------+--------------------+--------------------+



In [5]:
# Seeded 80/20 random split of the normalized data into train and test sets.
train, test = spam_dataset.randomSplit(weights=[0.8, 0.2], seed=1)

# Report (row count, column count) for the full data set and for each split.
for template, frame in (
    ("Whole dataset: {}, {}", spam_dataset),
    ("Train dataset: {}, {}", train),
    ("Test dataset: {}, {}", test),
):
    print(template.format(frame.count(), len(frame.columns)))

Whole dataset: 4601, 57
Train dataset: 3642, 57
Test dataset: 959, 57


In [6]:
# Separate features from the label: the last column is the target, every
# column before it is a feature.
X_train = train.select(train.columns[:-1])
y_train = train.select(train.columns[-1])

X_test = test.select(test.columns[:-1])
y_test = test.select(test.columns[-1])

In [7]:
# Show summary statistics for the first four features and for the label,
# first for the training split and then for the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    features.select(features.columns[:4]).describe().show()
    labels.describe().show()

+-------+--------------------+--------------------+-------------------+--------------------+
|summary|                  _1|                  _2|                 _3|                  _4|
+-------+--------------------+--------------------+-------------------+--------------------+
|  count|                3642|                3642|               3642|                3642|
|   mean|0.023053968991235366|0.015323290216690745|0.05546941456428836|0.001574136822333...|
| stddev| 0.06749383039738928| 0.09124493712987924|0.09907445181542128| 0.03254631960542416|
|    min|                 0.0|                 0.0|                0.0|                 0.0|
|    max|                 1.0|                 1.0|                1.0|  0.9981312777388459|
+-------+--------------------+--------------------+-------------------+--------------------+

+-------+-------------------+
|summary|                _58|
+-------+-------------------+
|  count|               3642|
|   mean|0.39236683141131246|
| stddev|0.4

In [4]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to the gradient on every iteration.
    num_iter : int
        Number of gradient-descent iterations performed by ``fit``.
    fit_intercept : bool
        When true, a column of ones is prepended to ``X`` so that the first
        entry of ``theta`` acts as the bias term.
    verbose : bool
        When true, ``fit`` prints the current loss every 10000 iterations.
    """

    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose

    def __add_intercept(self, X):
        """Return ``X`` with a leading column of ones."""
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def __sigmoid(self, z):
        """Element-wise logistic function ``1 / (1 + exp(-z))``."""
        return 1 / (1 + np.exp(-z))

    def __loss(self, h, y):
        """Mean binary cross-entropy between probabilities ``h`` and labels ``y``."""
        # Keep probabilities strictly inside (0, 1) so that saturated
        # predictions do not turn the loss into nan/inf through log(0).
        eps = 1e-15
        h = np.clip(h, eps, 1 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def fit(self, X, y):
        """Learn ``self.theta`` from features ``X`` and 0/1 labels ``y``.

        ``X`` is indexed as a 2-D array (rows are samples) and ``y`` must
        expose ``.size``. Returns None; the weights are stored in
        ``self.theta``.
        """
        if self.fit_intercept:
            X = self.__add_intercept(X)

        # weights initialization
        self.theta = np.zeros(X.shape[1])

        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient

            if self.verbose and i % 10000 == 0:
                # Report the loss of the freshly updated weights. The value is
                # formatted explicitly: the previous plain string printed the
                # placeholder text itself instead of the number.
                loss = self.__loss(self.__sigmoid(np.dot(X, self.theta)), y)
                print('loss: {} \t'.format(loss))

    def predict_prob(self, X):
        """Return the predicted probability of the positive class per row of ``X``."""
        if self.fit_intercept:
            X = self.__add_intercept(X)

        return self.__sigmoid(np.dot(X, self.theta))

    def predict(self, X, threshold=0.5):
        """Return a boolean array: True where the probability is >= ``threshold``."""
        return self.predict_prob(X) >= threshold


In [11]:
# Train with a learning rate of 0.05 for 10000 gradient-descent iterations.
# NOTE(review): X and y are not defined in the visible cells — confirm they are
# created earlier in the notebook.
model = LogisticRegression(lr=0.05, num_iter=10000)
# %time is an IPython line magic that reports how long the statement took.
%time model.fit(X, y)
# Timing noted from an earlier run (the cell output holds the latest figures):
# CPU times: user 13.8 s, sys: 84 ms, total: 13.9 s
# Wall time: 13.8 s



CPU times: user 10.9 s, sys: 916 ms, total: 11.8 s
Wall time: 8.27 s


In [18]:
# Classify each row of X as positive when its predicted probability is >= 0.5.
preds = model.predict(X, 0.5)
# Accuracy: fraction of predictions that match y.
# NOTE(review): X and y are the same names passed to model.fit above, so this
# looks like training accuracy rather than a held-out estimate — confirm.
(preds == y).mean()



0.61312758096066078

In [None]:
# TODO
# 1) Normalize features
# 2) Implement cross validation
# 3) Add regularization
# 4) 