In [32]:
import numpy as np
import scipy
import scipy.special as spec
import oracles

In [33]:
from scipy import sparse

In [34]:
# np.random.seed(4181)
# l2_coef = np.random.randint(0, 10)
# l, d = 1000, 10
# my_oracle = BinaryLogistic(l2_coef=l2_coef)
# X = sparse.csr_matrix(np.random.random((l, d)))
# y = np.random.randint(0, 2, l) * 2 - 1
# w = np.random.random(d)
# res = my_oracle.func(X, y, w)
# print(res)

In [35]:
import oracles_mam

In [36]:
bn_log_mam = oracles_mam.BinaryLogistic(1)

In [37]:
bn_log = oracles.BinaryLogistic(1)

In [38]:
# Two-sample, three-feature toy problem, small enough to check by hand.
# NOTE(review): for this input the stored outputs of bn_log.func and
# bn_log_mam.func differ by 0.5 (19.5001... vs 20.0001...) while both
# gradients are identical. With l2_coef=1 that gap equals w[0]**2 / 2, so one
# implementation presumably leaves w[0] out of the L2 term in func only --
# confirm which behaviour is intended.
X = np.array([
    [0, 1, 2],
    [3, 4, 5],
])
y = np.array([1, -1])
w = np.array([1, 2, 3])
bn_log.func(X, y, w)

19.500167703189

In [39]:
bn_log_mam.func(X, y, w)

20.000167703189

In [40]:
bn_log.grad(X, y, w)

array([2.5       , 3.99983232, 5.49966465])

In [41]:
bn_log_mam.grad(X, y, w)

array([2.5       , 3.99983232, 5.49966465])

In [42]:
# Threshold the linear scores at zero: strictly positive -> +1, otherwise -1.
predictions = np.where(X.dot(w) > 0, 1, -1)


In [43]:
predictions

array([1, 1])

In [44]:
# Two-column table: sigmoid(<x, w>) in the first column, its complement in
# the second.
logits = X.dot(w)
proba = spec.expit(logits)
# expit(-z) equals 1 - expit(z) analytically, but evaluating it directly keeps
# full relative precision when the sigmoid saturates: `1 - proba` can only
# resolve differences down to the float spacing near 1.0 (~1e-16), which is a
# visible relative error for complements as small as 1e-12.
np.array([proba, spec.expit(-logits)]).T

array([[9.99664650e-01, 3.35350130e-04],
       [1.00000000e+00, 5.10902431e-12]])

In [45]:
1 - proba

array([3.35350130e-04, 5.10902431e-12])

In [46]:
(w + 1e-3 - w) / 1e-3

array([1., 1., 1.])

In [47]:
w

array([1, 2, 3])

In [48]:
import utils

In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [50]:
train_data = pd.read_csv('data_toxic/toxic_train.csv', index_col=0)
test_data = pd.read_csv('data_toxic/toxic_test.csv', index_col=0)

In [51]:
train_data_processed_1 = train_data.copy()
test_data_processed_1 = test_data.copy()

In [52]:
import re

def text_process_1(texts_series):
    """
    Lower-case texts and blank out every non-word character.

    Parameters
    ----------
    texts_series : pandas.Series
        Series of raw texts. Missing values are allowed.

    Returns
    -------
    pandas.Series
        New series with the same index. Each text is lower-cased and every
        character matched by the regex non-word class is replaced by a single
        space (one space per character, runs are not collapsed). Letters,
        digits and the underscore are word characters and are kept as they
        are. Missing values are passed through untouched instead of raising
        TypeError. The input series is not modified.
    """

    # \W on str patterns is Unicode-aware and does NOT match "_".
    non_word = re.compile(r'\W')
    lowered = texts_series.str.lower()
    # na_action='ignore' skips NaN/None entries, which re.sub cannot handle.
    return lowered.map(lambda text: non_word.sub(' ', text), na_action='ignore')

In [53]:
# Read the comments straight from the raw frames. The previous form indexed
# train_data_processed_1['comment_text'] and then rebound that same name to
# the resulting Series, so running the cell a second time failed on the
# column lookup. text_process_1 returns a new Series and leaves its input
# untouched, so the result is the same as before.
train_data_processed_1 = text_process_1(train_data['comment_text'])
test_data_processed_1 = text_process_1(test_data['comment_text'])

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [91]:
countVect = CountVectorizer(min_df=3e-5)

In [92]:
train_data_processed_2 = countVect.fit_transform(train_data_processed_1)
test_data_processed_2 = countVect.transform(test_data_processed_1)

In [93]:
train_data_processed_2

<52061x37832 sparse matrix of type '<class 'numpy.int64'>'
	with 2055136 stored elements in Compressed Sparse Row format>

In [94]:
import optimization
import optimization_mam

In [95]:
# Split the boolean `is_toxic` column off each frame as a +1 / -1 target.
# The column is removed from the frame afterwards, so each step is guarded on
# the column still being present: re-running the cell is then a no-op instead
# of a KeyError, and the targets from the first run are kept.
label_encoding = {True: 1, False: -1}

if 'is_toxic' in train_data.columns:
    target_train = train_data['is_toxic'].map(label_encoding)
    # Rebind instead of inplace=True; the name ends up without the column
    # either way.
    train_data = train_data.drop(columns='is_toxic')

if 'is_toxic' in test_data.columns:
    target_test = test_data['is_toxic'].map(label_encoding)
    test_data = test_data.drop(columns='is_toxic')

In [125]:
# Step-size hyper-parameters, reused by the classifiers built from this cell on.
# NOTE(review): presumably the step is alpha / k**beta, i.e. constant for
# beta = 0 -- confirm against the optimization module.
step_alpha, step_beta = 1, 0

# Start from the all-zero weight vector, one entry per vocabulary column.
w_0 = np.zeros(train_data_processed_2.shape[1])

clf = optimization.SGDClassifier(
    l2_coef=5e-4,
    batch_size=1000,
    max_iter=10000,
    step_alpha=step_alpha,
    step_beta=step_beta,
    experiment=True,
)

In [126]:
clf.fit(train_data_processed_2, target_train.values, w_0=w_0, trace=True)['accuracy']

[0.6919777194442666,
 0.814392726807094,
 0.8356488891734426,
 0.8339842499519816,
 0.8428196427428133,
 0.8445483065497151,
 0.850630642166592,
 0.8529995518279019,
 0.8560727319290607,
 0.8612587233497663,
 0.8569690761252321,
 0.8686855752609002,
 0.8592739612011012,
 0.8597861578846276,
 0.864395928036366,
 0.8725910749727895,
 0.8698380177988347,
 0.8701581407260388,
 0.8701581407260388,
 0.8694538702861899,
 0.8628593379857865,
 0.8720148537038223,
 0.8698380177988347,
 0.872783148729112,
 0.881170369421858,
 0.8731672962417568,
 0.8732313208271977,
 0.8807221973237723,
 0.8769447467827646,
 0.8765605992701198,
 0.8758563288302709,
 0.8748319354632179,
 0.876816697611883,
 0.8815545169345028,
 0.8742557141942506,
 0.8727191241436711,
 0.8790575581023113,
 0.8720788782892631,
 0.8732953454126384,
 0.8861002625008003,
 0.8812343940072987,
 0.8796978039567194,
 0.8712465586785325,
 0.8793136564440746,
 0.8859081887444779,
 0.8823228119597926,
 0.8764965746846789,
 0.8723990012164671

In [31]:
# Share of test comments whose predicted label equals the true label.
test_hits = clf.predict(test_data_processed_2) == target_test.values
test_hits.sum() / len(target_test)

0.839572451151093

In [None]:
# Full-batch gradient descent from the second implementation.
# It uses l2_coef=0 and max_iter=1000, whereas the SGD classifier above uses
# l2_coef=5e-4 and max_iter=10000, so the two histories are not like-for-like.
clf_mam = optimization_mam.GDClassifier(
    loss_function='binary_logistic',
    step_alpha=step_alpha,
    step_beta=step_beta,
    max_iter=1000,
    l2_coef=0,
)

In [None]:
# NOTE(review): w_0 is the same array object that was passed to clf.fit; if
# either fit updates its starting vector in place, this run would not start
# from zeros -- confirm, or pass a copy.
hstr = clf_mam.fit(
    train_data_processed_2,
    target_train.values,
    w_0=w_0,
    trace=True,
    calc_accuracy=True,
)

In [None]:
len(hstr['func'])

In [None]:
hstr['func']

In [None]:
len(clf.history['func'])

In [None]:
clf.history['func']

In [None]:
clf.history['accuracy']

In [None]:
np.array(hstr['accuracy'])[0] + np.array(clf.history['accuracy'])[0]

In [None]:
hstr['accuracy']

In [None]:
clf.history['accuracy']