## PU learning

In [1]:
from scipy import signal
from scipy import stats
from matplotlib import pyplot as plt
import numpy as np 
import pandas as pd

In [2]:
import sklearn as sk
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

### Data Creating

In [3]:
from sklearn import datasets, svm, metrics

# The digits dataset
digits = datasets.load_digits()

In [4]:
# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

In [5]:
# Let's say that 8 - is positive and all other is negative class
digits.target = np.array(digits.target==8, dtype=int)

### Test on properly defined data

In [6]:
# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# We learn the digits on the first half of the digits
classifier.fit(data[:3*n_samples // 4], digits.target[:3*n_samples // 4])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [7]:
# Now predict the value of the digit on the second half:
expected = digits.target[3*n_samples // 4:]
predicted = classifier.predict(data[3*n_samples // 4:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False):
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       409
           1       1.00      0.83      0.91        41

    accuracy                           0.98       450
   macro avg       0.99      0.91      0.95       450
weighted avg       0.98      0.98      0.98       450


Confusion matrix:
[[409   0]
 [  7  34]]


### Create unlabled(undefined) data

In [8]:
# Lets create labeled/unlabled data
digits["s"] = np.random.randint(0,2, len(digits.target))*digits.target

In [10]:
# Create a classifier: a support vector classifier
CL_g = svm.SVC(gamma=0.001, probability=True)

CL_g.fit(data[:3*n_samples // 4], digits.s[:3*n_samples // 4])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [11]:
# Now predict the value of the digit on the second half:
expected = digits.target[3*n_samples // 4:]
predicted = CL_g.predict(data[3*n_samples // 4:])

print("Classification report for classifier %s:\n%s\n"
      % (CL_g, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False):
              precision    recall  f1-score   support

           0       0.91      1.00      0.95       409
           1       0.00      0.00      0.00        41

    accuracy                           0.91       450
   macro avg       0.45      0.50      0.48       450
weighted avg       0.83      0.91      0.87       450


Confusion matrix:
[[409   0]
 [ 41   0]]


  'precision', 'predicted', average, warn_for)


Result on unlabled data with usual classifier is more than unsagnificant

### Create improvement with weighted train

In [12]:
x_columns = ["x"+str(i) for i in range(len(data[0]))]
df = pd.DataFrame(data, columns=x_columns)
df["y"] = digits.target
df["s"] = digits.s

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df[x_columns], df[["s", 'y']], test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=42)

In [14]:
def create_new_train(e, X_train, y_train, CL):
    X_train_new = pd.DataFrame(X_train.loc[y_train.index[y_train["s"] == 1]])
    X_train_new["w"] = 1
    y_train_new = pd.DataFrame([1]*len(X_train_new), columns=["s"])
    X_train_1 = pd.DataFrame(X_train.loc[y_train.index[y_train["s"] == 0]])
    g_x = CL.predict_proba(X_train_1[x_columns])[:,1]
    X_train_1["w"] = (1-e)/e * g_x/(1-g_x)
    y_train_1 = pd.DataFrame([1]*len(X_train_1), columns=["s"])
    X_train_2 = pd.DataFrame(X_train.loc[y_train.index[y_train["s"] == 0]])
    g_x = CL.predict_proba(X_train_2[x_columns])[:,1]
    X_train_2["w"] = 1 - (1-e)/e * g_x/ (1-g_x)
    y_train_2 = pd.DataFrame([0]*len(X_train_2), columns=["s"])
    X_train_new = X_train_new.append(X_train_1, ignore_index=True, sort=False)
    X_train_new = X_train_new.append(X_train_2, ignore_index=True, sort=False)
    y_train_new = y_train_new.append(y_train_1, ignore_index=True, sort=False)
    y_train_new = y_train_new.append(y_train_2, ignore_index=True, sort=False)
    return X_train_new, y_train_new

In [15]:
X_val_p = X_val.loc[X_val.index[y_val["s"] == 1]]
d_p = CL_g.predict_proba(X_val_p)[:,1]
e1 = d_p.sum()/len(X_val_p)
print(f"e1 = {e1}")
d_v = CL_g.predict_proba(X_val)[:,1]
e2 = d_p.sum()/d_v.sum()
print(f"e2 = {e2}")
e3 = d_v.max()
print(f"e3 = {e3}")

e1 = 0.4940622009601986
e2 = 0.5919204879175857
e3 = 0.99364475038768


In [26]:
CL_f =  svm.SVC(gamma=0.001, probability=True)
X_train_new, y_train_new = create_new_train(e2, X_train, y_train, CL_g)
CL_f.fit(X_train_new[x_columns], y_train_new["s"], sample_weight=X_train_new['w'])

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [27]:
expected = y_test["y"]
predicted = CL_f.predict(X_test)

print("Classification report for classifier %s:\n%s\n"
      % (CL_f, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False):
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       542
           1       1.00      0.37      0.54        52

    accuracy                           0.94       594
   macro avg       0.97      0.68      0.75       594
weighted avg       0.95      0.94      0.93       594


Confusion matrix:
[[542   0]
 [ 33  19]]
