In [None]:
import pickle
import pandas as pd
import urllib.request
import json
import time
import os
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
import sklearn
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, GRU
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

# Reproducibility: seed NumPy's global RNG and TensorFlow's global RNG
# before any model or split is created.
from numpy.random import seed
seed(7)
from tensorflow.random import set_seed
set_seed(77)


# Record the library versions used for this run.
print(tf.__version__)
print(sklearn.__version__)

# Load Data for Training

In [None]:
%%time
# Load the "mydata" evaluation set into `mydata`; later cells iterate
# `mydata.keys()` as user ids and read `mydata[uid]["label"]`.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/mydata.pkl", "rb") as f:
    mydata = pickle.load(f)

In [None]:
%%time
# Load the training set into `data`.
# NOTE(review): the name `data` is rebound to `humanizer_data` in the
# image-feature extraction cell further down, so its meaning depends on
# which cells have been executed.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/training_data.pkl", "rb") as f:
    data = pickle.load(f)

In [None]:
%%time
# Load the humanizer evaluation set into `humanizer_data`; the feature
# extraction cells read `['user']` and `['label']` from each of its entries.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open("data/humanizer_data.pkl", "rb") as f:
    humanizer_data = pickle.load(f)

# Util Functions

In [None]:
def get_image_features(model, user_json_str):
    """
    Download a user's profile image and turn it into a flat feature vector.

    Parameters:
    -------------------
    model: loaded CV model used to extract image features
    user_json_str: JSON string of the user object; it must contain the keys
        "id" and "profile_image_url_https"

    Return:
    -------------------
    (user id, 1-D feature vector). With the VGG16 extractor built later in
    this notebook the vector is 25088-dimensional.

    Side effects:
    -------------------
    Overwrites "profile.jpg" in the working directory on every call.
    Download or decoding errors propagate to the caller.
    """
    profile = json.loads(user_json_str)
    uid = profile["id"]
    image_url = profile["profile_image_url_https"]

    print("retrieving from {} for {}".format(image_url, uid))
    urllib.request.urlretrieve(image_url, "profile.jpg")

    # Resize to 224x224 and add a leading batch axis of size 1 for predict().
    pil_image = tf.keras.preprocessing.image.load_img("profile.jpg", target_size=(224,224))
    pixels = keras.preprocessing.image.img_to_array(pil_image)
    batch = np.array([pixels])

    return uid, model.predict(batch).flatten()

In [None]:
def get_stat_features(user_data):
    """
    Build the 13-dimensional statistical feature vector for one user.

    Parameters
    ---------------------
    user_data: loaded json (dict) for a user object. Every key is optional;
        a missing key falls back to the default noted below.

    Return
    ---------------------
    1-D numpy array of 13 floats, each transformed as log10(value + 1), in
    this order: url_exist, default_profile, verified, desc_exist,
    default_profile_image, listed_count, followers, followees, posts,
    favorites, len_screenname, ff_ratio, fff_ratio.
    """
    url_exist = 1 if 'url' in user_data else 0
    # A missing 'default_profile' is treated as 1 (default profile).
    default_profile = int(user_data['default_profile']) if 'default_profile' in user_data else 1
    verified = int(user_data['verified']) if 'verified' in user_data else 0
    desc_exist = 1 if 'description' in user_data else 0

    # The original read 'profile_image_url' unguarded and raised KeyError when
    # it was absent. Fall back to the https variant (the key that
    # get_image_features reads) and finally to an empty string, which yields 0.
    image_url = user_data.get('profile_image_url')
    if image_url is None:
        image_url = user_data.get('profile_image_url_https') or ''
    default_profile_image = 1 if 'default' in image_url else 0

    listed_count = int(user_data['listed_count']) if 'listed_count' in user_data else 0
    followers = user_data['followers_count'] if 'followers_count' in user_data else 0
    followees = user_data['friends_count'] if 'friends_count' in user_data else 0
    posts = user_data['statuses_count'] if 'statuses_count' in user_data else 0
    favorites = user_data['favourites_count'] if 'favourites_count' in user_data else 0
    # A missing screen name counts as length 0 instead of raising KeyError.
    len_screenname = len(user_data.get('screen_name') or '')

    # engineered (check needed); the +1 keeps the denominators non-zero
    ff_ratio = followers / (followees+1.)
    fff_ratio = followees / (followees+followers+1.)

    raw = np.array([url_exist,
                    default_profile,
                    verified,
                    desc_exist,
                    default_profile_image,
                    listed_count,
                    followers,
                    followees,
                    posts,
                    favorites,
                    len_screenname,
                    ff_ratio,
                    fff_ratio
                   ])
    # log10(x + 1) compresses the heavy-tailed counts and maps 0 to 0.
    return np.log10(raw+1.0)

In [None]:
def get_ml_img_data(data_pkl, img_features_pkl):
    """
    Load labels and image features from two pickles.

    Parameters
    ------------------
    data_pkl: path to data pickle (dict whose values carry a 'label' entry)
    img_features_pkl: path to image feature pickle (dict of feature vectors)

    Return
    ------------------
    X, y for fitting. Rows of X follow the iteration order of the feature
    dict and y follows the iteration order of the data dict; nothing here
    re-aligns them by key, so both pickles must share the same order.
    """
    t0 = time.time()

    # labels
    with open(data_pkl, "rb") as fh:
        records = pickle.load(fh)
    labels = [records[key]['label'] for key in records.keys()]
    print(len(labels))
    y_train = np.array(labels)

    # image features
    with open(img_features_pkl, "rb") as fh:
        features_by_uid = pickle.load(fh)
    X_train = np.array(list(features_by_uid.values()))
    print(X_train.shape)

    print("Used {} seconds to load data".format(time.time() - t0))

    return X_train, y_train


def get_ml_stat_data(data_pkl=None):
    """
    Load statistical features and labels.

    Parameters
    ------------------
    data_pkl: path to a data pickle (dict whose values carry 'label' and a
        'user' object exposing AsJsonString()). When None, the cached arrays
        "data/X_np_training_stat_features.npy" and "data/y_np_training.npy"
        are loaded instead.

    Return
    ------------------
    X_train, y_train, data
        `data` is the unpickled dict, or None when the cached arrays were used.
        Labels are cast to int only on the pickle path; cached labels are
        returned exactly as stored.

    Raises
    ------------------
    FileNotFoundError: data_pkl is None and a cached array is missing.
        (Previously this path fell through to an UnboundLocalError.)
    """
    start_time = time.time()

    data = None
    if data_pkl is not None:
        # NOTE(review): pickle.load can execute arbitrary code - trusted files only.
        with open(data_pkl, "rb") as f:
            data = pickle.load(f)

        X_train = list()
        y_train = list()
        for uid in data.keys():
            y_train.append(int(data[uid]['label']))
            json_data = json.loads(data[uid]['user'].AsJsonString())
            X_train.append(get_stat_features(json_data))

        y_train = np.array(y_train)
        X_train = np.array(X_train)
    else:
        x_path = "data/X_np_training_stat_features.npy"
        y_path = "data/y_np_training.npy"
        if not (os.path.exists(x_path) and os.path.exists(y_path)):
            print("Nothing to load...")
            raise FileNotFoundError(
                "Cached training arrays not found: {} and/or {}".format(x_path, y_path))
        print("Loading training data from existing files...")
        X_train = np.load(x_path)
        y_train = np.load(y_path)

    print(X_train.shape, y_train.shape)

    elapsed_time = time.time() - start_time

    print("Used {} seconds to load data".format(elapsed_time))

    return X_train, y_train, data

def get_ml_screennames(data_pkl, seq_len=50, vocab=None):
    """
    Load screen names and encode them as fixed-length integer sequences.

    Parameters
    ------------------
    data_pkl: path to a pickle holding a list of screen-name strings
    seq_len: length every sequence is pre-padded / pre-truncated to
    vocab: optional iterable of characters defining the char -> index
        mapping (index = position). When None (default, original behavior)
        the vocabulary is the sorted set of characters found in this file.
        NOTE: a derived vocabulary depends on the file, so two datasets
        with different character sets get different indices for the same
        character. Pass the training vocabulary to keep encodings aligned.

    Return
    ------------------
    2-D integer array of shape (n_names, seq_len). The padding value is
    len(vocab); characters not present in an explicitly supplied `vocab`
    are mapped to that same index.
    """
    # NOTE(review): pickle.load can execute arbitrary code - trusted files only.
    with open(data_pkl, "rb") as f:
        screennames = pickle.load(f)

    # lowercase
    screennames = [name.lower() for name in screennames]

    if vocab is None:
        vocab = sorted({ch for name in screennames for ch in name})
    else:
        vocab = list(vocab)
    print("lenth of vocab: {}".format(len(vocab)))

    # The index just past the vocabulary doubles as padding / unknown marker.
    pad_idx = len(vocab)
    char2idx = {ch: i for i, ch in enumerate(vocab)}

    # encoded screennames
    encoded = [[char2idx.get(ch, pad_idx) for ch in name] for name in screennames]

    return tf.keras.preprocessing.sequence.pad_sequences(
        encoded, padding="pre", maxlen=seq_len, value=pad_idx)

# Define Models

In [None]:
# image model
def image_classifier():
    """
    Build the image-only binary classifier.

    Takes a 25088-d float feature vector, applies Dense(1024, relu) and
    Dense(256, relu), and ends in a single sigmoid unit. The Keras session
    is cleared first and the model summary is printed. The returned model
    is not compiled.
    """
    tf.keras.backend.set_learning_phase(1)
    tf.keras.backend.clear_session()

    features_in = keras.Input(shape=(25088,), dtype="float32")
    hidden = features_in
    for units in (1024, 256):
        hidden = Dense(units, activation="relu")(hidden)

    # output
    prob = Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(features_in, prob)
    print(model.summary())
    return model

# screen name model
def screenname_classifier():
    """
    Build the screen-name binary classifier.

    Takes variable-length int8 character indices, applies Embedding(50 -> 100),
    a bidirectional GRU(128) and Dense(128, relu), and ends in a single
    sigmoid unit. The Keras session is cleared first and the model summary
    is printed. The returned model is not compiled.
    """
    tf.keras.backend.set_learning_phase(1)
    tf.keras.backend.clear_session()

    char_ids = keras.Input(shape=(None,), dtype="int8")
    embedded = tf.keras.layers.Embedding(input_dim=50, output_dim=100)(char_ids)
    encoded = Bidirectional(GRU(128))(embedded)
    hidden = Dense(128, activation="relu")(encoded)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(char_ids, prob)
    print(model.summary())
    return model

# stat model
def mlp_classifier():
    """
    Build the statistical-feature binary classifier.

    Takes a 13-d float vector, applies Dense layers of 128, 64, 32 and 16
    relu units, and ends in a single sigmoid unit. The Keras session is
    cleared first and the model summary is printed. The returned model is
    not compiled.
    """
    tf.keras.backend.set_learning_phase(1)
    tf.keras.backend.clear_session()

    stats_in = keras.Input(shape=(13,), dtype="float32")
    hidden = stats_in
    for units in (128, 64, 32, 16):
        hidden = Dense(units, activation="relu")(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = keras.Model(stats_in, prob)
    print(model.summary())
    return model

def sn_stat_classifier():
    """
    Build the two-input (statistics + screen name) binary classifier.

    Inputs, in this order: [13-d float stat vector, int8 character indices].
    The stat branch is Dense(128, relu). The screen-name branch is
    Embedding(50 -> 100), a bidirectional GRU(128) and Dense(128, relu).
    Both branches are concatenated (screen name first) and fed to a single
    sigmoid unit. The Keras session is cleared first and the model summary
    is printed. The returned model is not compiled.
    """
    tf.keras.backend.set_learning_phase(1)
    tf.keras.backend.clear_session()

    # statistics branch
    stat_in = keras.Input(shape=(13,), dtype="float32")
    stat_branch = Dense(128, activation="relu")(stat_in)

    # screen-name branch
    name_in = keras.Input(shape=(None,), dtype="int8")
    name_branch = tf.keras.layers.Embedding(input_dim=50, output_dim=100)(name_in)
    name_branch = Bidirectional(GRU(128))(name_branch)
    name_branch = Dense(128, activation="relu")(name_branch)

    # concat
    merged = tf.keras.layers.concatenate([name_branch, stat_branch], axis=1)

    # output
    prob = Dense(1, activation="sigmoid")(merged)

    model = keras.Model([stat_in, name_in], prob)
    print(model.summary())
    return model

def img_sn_stat_classifier():
    """
    Build the proposed three-input classifier (statistics + screen name + image).

    Inputs, in this order: [13-d float stat vector, int8 character indices,
    25088-d float image features]. The stat and screen-name branches (128
    units each) are concatenated into a 256-d vector; the image branch is
    reduced to 256-d as well. A per-sample scalar gate `alpha` then mixes
    the two 256-d vectors before the final sigmoid unit.

    The Keras session is cleared first and the model summary is printed.
    The returned model is not compiled.
    """
    tf.keras.backend.set_learning_phase(1)
    tf.keras.backend.clear_session()
    
    # statistics branch
    stat_inputs = tf.keras.Input(shape=(13,), dtype="float32")
    stat_x = tf.keras.layers.Dense(128, activation="relu")(stat_inputs)
    
    # screen-name branch
    sn_inputs = tf.keras.Input(shape=(None,), dtype="int8")
    sn_x = tf.keras.layers.Embedding(input_dim=50, output_dim=100)(sn_inputs)
    sn_x = tf.keras.layers.Bidirectional(GRU(128))(sn_x)
    sn_x = tf.keras.layers.Dense(128, activation="relu")(sn_x)
    
    # concat the two non-image branches: 128 + 128 = 256 features
    non_img_x_ = tf.keras.layers.concatenate([sn_x, stat_x], axis=1)
    
    # image part
    img_inputs = tf.keras.Input(shape=(25088,), dtype="float32")
    img_x_ = tf.keras.layers.Dense(1024, activation="relu")(img_inputs)
    img_x_ = tf.keras.layers.Dense(256, activation="relu")(img_x_)
    
    # manual attention part
    # img_inputs_retrived is 1.0 when the image feature vector sums to more
    # than 0 and 0.0 otherwise (the extraction cell stores an all-zero
    # vector when the image could not be fetched).
    img_inputs_sum = tf.math.reduce_sum(img_inputs, axis=1, keepdims=True)
    print("img_inputs_sum", img_inputs_sum.shape)
    img_inputs_retrived = tf.math.greater(img_inputs_sum, tf.constant([0.]))
    img_inputs_retrived = tf.cast(img_inputs_retrived, tf.float32)
    print("img_inputs_retrived", img_inputs_retrived.shape)
    
    # stat_inputs[:, 4] is the 5th stat feature. Assuming the vector comes
    # from get_stat_features, that is log10(default_profile_image + 1), i.e.
    # 0 or log10(2) - TODO confirm against the caller. alpha is then
    # log10(2) for a retrieved, non-default image and 0 otherwise.
    alpha = tf.math.multiply(tf.reshape(np.log10(2.)-stat_inputs[:, 4], [-1,1]), img_inputs_retrived)
    # Weight the image branch by alpha and the non-image branch by
    # log10(2) - alpha, then sum the two 256-d vectors.
    img_x = tf.math.multiply(alpha, img_x_)
    non_img_x = tf.math.multiply(tf.reshape(np.log10(2.)-alpha, [-1,1]), non_img_x_)
    fused_x = tf.math.add(img_x, non_img_x)
    
    # output
    outputs = tf.keras.layers.Dense(1, activation="sigmoid")(fused_x)
    model = tf.keras.Model([stat_inputs, sn_inputs, img_inputs], outputs)
    print(model.summary())
    
    return model

# Get and Store Image Features

In [None]:
%%time
#############################
# get image features
# Extract one image feature vector per humanizer user and save them as one array.

# NOTE(review): this rebinds the global `data` (previously the training set)
# to the humanizer set; later cells that read `data` depend on this cell.
data = humanizer_data

# load VGG16 without its classification head (include_top=False)
model = tf.keras.applications.VGG16(
    include_top=False,
    weights="imagenet",
    input_tensor=None,
    input_shape=None,
    pooling=None,
    classes=1000
)

# feature dict
img_feature_dict = dict()
feature_shape = 25088
for idx, uid in enumerate(data.keys()):
    # progress log every 100 users
    if idx % 100 == 0:
        print(idx, uid)
    try:
        # get_image_features returns the id found in the user JSON, which
        # replaces the loop key as the dictionary key on success.
        uid, features = get_image_features(model, data[uid]['user'].AsJsonString())
        img_feature_dict[uid] = features
    except Exception as e:
        # Best effort: any failure is logged and an all-zero vector is stored
        # under the loop key so the user keeps a row in the saved array.
        print(uid, e)
        img_feature_dict[uid] = [0]*feature_shape


# Rows follow dictionary insertion order; np.save appends ".npy" to the path.
X_train = np.array(list(img_feature_dict.values()))
np.save("data/X_np_humanizer_image_features", X_train)

# Get and Store Statistical Features

In [None]:
%%time
# store statistical features
# Build and save the statistical features and labels of the humanizer set.
start_time = time.time()

# Read `humanizer_data` directly instead of the global `data`: `data` only
# refers to the humanizer set after the image-feature cell has run, and is
# the training set otherwise.
X_train = list()
y_train = list()
for uid in humanizer_data.keys():
    # Cast labels to int, as get_ml_stat_data does, so the saved array can be
    # compared directly against integer predictions.
    y_train.append(int(humanizer_data[uid]['label']))
    json_data = json.loads(humanizer_data[uid]['user'].AsJsonString())
    X_train.append(get_stat_features(json_data))

y_train = np.array(y_train)
X_train = np.array(X_train)

print(X_train.shape, y_train.shape)

elapsed_time = time.time() - start_time

print("Used {} seconds to load data".format(elapsed_time))

# np.save appends ".npy" to both paths.
np.save("data/X_np_humanizer_stat_features", X_train)
np.save("data/y_np_humanizer", y_train)

# Get and Store Twitter Screennames

In [None]:
%%time
# get all screennames
# Read `humanizer_data` directly instead of the global `data`: `data` only
# refers to the humanizer set after the image-feature cell has run, so this
# cell could otherwise write training screen names into the humanizer file.
screennames = [
    json.loads(record['user'].AsJsonString())["screen_name"]
    for record in humanizer_data.values()
]

with open("data/humanizer_screennames.pkl", "wb") as f:
    pickle.dump(screennames, f)

# Data Preparation for training

In [None]:
%%time
# train my proposed architecture

# load data
X_train_img = np.load("data/X_np_training_image_features.npy")
X_train_screennames = get_ml_screennames("data/training_data_screennames.pkl")
# get_ml_stat_data returns (X, y, data); the third value is None when the
# cached arrays are used, so it is discarded. Unpacking only two names here
# raised "ValueError: too many values to unpack".
X_train_stat, y_train, _ = get_ml_stat_data()

print(X_train_screennames.shape, X_train_stat.shape, X_train_img.shape)

In [None]:
# get val
# Hold out 33% of each modality for validation. All three calls use the same
# labels, test_size, random_state and stratify argument, so they should pick
# the same rows as long as the three arrays have the same length and row
# order - that is what keeps the modalities aligned in the fused models.
X_train_stat_, X_val_stat_, y_train_stat_, y_val_stat_ = train_test_split(
    X_train_stat, y_train, test_size=0.33, random_state=42, stratify=y_train)

X_train_sn_, X_val_sn_, y_train_sn_, y_val_sn_ = train_test_split(
    X_train_screennames, y_train, test_size=0.33, random_state=42, stratify=y_train)

X_train_img_, X_val_img_, y_train_img_, y_val_img_ = train_test_split(
    X_train_img, y_train, test_size=0.33, random_state=42, stratify=y_train)

print("Validation split finished...")

# Model training and eval - Image Classifier
## Training

In [None]:
%%time
# train img classifier
# The train/validation split comes from the shared split cell above; the
# commented-out code below is the earlier per-cell split, kept for reference.
# X_train_, X_val, y_train_, y_val = train_test_split(
#     X_train, y_train, test_size=0.33, random_state=42, stratify=y_train)

# print("Split completed...")

# input_shape = X_train.shape[1:]
model = image_classifier()

# compile
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# early stopping: stop after 10 epochs without a better val_accuracy and
# roll back to the best weights seen
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, 
                   patience=10,
                   restore_best_weights=True)

# fit
# The labels passed here are the ones returned by the stat split
# (y_train_stat_ / y_val_stat_), not y_train_img_ / y_val_img_; all splits
# share the same settings, so they are expected to be identical.
model.fit(X_train_img_, y_train_stat_, 
          validation_data=(X_val_img_, y_val_stat_), 
          epochs=500, 
          batch_size=1024,
          verbose=1, 
          callbacks=[es])

# store the model (weights only; rebuild with image_classifier() to reload)
model.save_weights("./model/img_clf")

## Load Trained Model and Testing

In [None]:
%%time
# evaluate on mydata
# Load precomputed image features and labels for the mydata set.
X_test_mydata = np.load("data/X_np_mydata_image_features.npy")
y_test_mydata = np.load("data/y_np_mydata.npy")

# Rebuild the architecture and load the weights saved by the training cell.
model = image_classifier()
model.load_weights("./model/img_clf")

# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_test_mydata)
pred = np.array(pred>=.5).astype(int)
# Cast labels to int so they are comparable with the integer predictions.
y_test_mydata = np.array([int(x) for x in y_test_mydata])

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_test_mydata, 
                            pred,
                            labels = [1,0],
                            target_names=["ind","org"]))

# One "<uid>\t<prediction>" line per user; this assumes the rows of the
# feature file follow the order of mydata.keys() - TODO confirm.
with open("output/mydata_img_classifier.tsv", "w") as f:
    for idx, uid in enumerate(mydata.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

In [None]:
%%time
# evaluate on humanizer data
X_test_humanizer = np.load("data/X_np_humanizer_image_features.npy")
y_test_humanizer = np.load("data/y_np_humanizer.npy")

model = image_classifier()
# NOTE(review): the training cell saves to "./model/img_clf" and the mydata
# evaluation loads that path, while this cell loads "./model/img_01/img_clf".
# Confirm which checkpoint is intended.
model.load_weights("./model/img_01/img_clf")

# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_test_humanizer)
pred = np.array(pred>=.5).astype(int)
# Cast labels to int so they are comparable with the integer predictions.
y_test_humanizer = np.array([int(x) for x in y_test_humanizer])

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_test_humanizer,
                            pred,
                            labels = [1,0],
                            target_names=["ind","org"]))

# Iterate `humanizer_data` rather than the global `data`, which only refers
# to the humanizer set after the image-feature cell has run in this kernel.
with open("output/humanizer_img_classifier.tsv", "w") as f:
    for idx, uid in enumerate(humanizer_data.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

# Model training and eval - Screenname-based Classifier
## Training

In [None]:
%%time
#######################################
# GRU screenname based classifier
model = screenname_classifier()

# compile (this model uses Adam with an explicit learning rate of 0.01)
opt = tf.keras.optimizers.Adam(learning_rate=.01)
model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])
# early stopping: stop after 10 epochs without a better val_accuracy and
# roll back to the best weights seen
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, 
                   patience=10,
                   restore_best_weights=True)

# The train/validation split comes from the shared split cell; the
# commented-out code below is the earlier per-cell split, kept for reference.
# # get val
# X_train_, X_val, y_train_, y_val = train_test_split(
#     screennames_encoded_padded, y_train, test_size=0.2, random_state=42)

# fit
model.fit(X_train_sn_, y_train_sn_, 
          validation_data=(X_val_sn_, y_val_sn_), 
          epochs=500, 
          batch_size=1024,
          verbose=1, 
          callbacks=[es])

# store the model (weights only; rebuild with screenname_classifier() to reload)
model.save_weights("./model/screenname_clf")

## Load Trained Model and Testing

In [None]:
%%time
# test with my data
model = screenname_classifier()
model.load_weights("./model/screenname_clf")

X_mydata_screennames_encoded_padded = get_ml_screennames("data/mydata_screennames.pkl")
# Cast labels to int, as the image-classifier evaluation of this same file
# does, so they are comparable with the integer predictions.
y_mydata = np.array([int(x) for x in np.load("data/y_np_mydata.npy")])

print(len(X_mydata_screennames_encoded_padded), len(y_mydata))

# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_mydata_screennames_encoded_padded)
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_mydata,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# One "<uid>\t<prediction>" line per user, in mydata key order.
with open("output/mydata_screenname_classifier.tsv", "w") as f:
    for idx, uid in enumerate(mydata.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

In [None]:
%%time
# test with humanizer data
X_humanizer_screennames_encoded_padded = get_ml_screennames("data/humanizer_screennames.pkl")
# Cast labels to int, as the image-classifier evaluation of this same file
# does, so they are comparable with the integer predictions.
y_humanizer = np.array([int(x) for x in np.load("data/y_np_humanizer.npy")])

print(len(X_humanizer_screennames_encoded_padded), len(y_humanizer))

model = screenname_classifier()
model.load_weights("model/screenname_clf")

# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_humanizer_screennames_encoded_padded)
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_humanizer,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# Iterate `humanizer_data` rather than the global `data`, which only refers
# to the humanizer set after the image-feature cell has run in this kernel.
with open("output/humanizer_screenname_classifier.tsv", "w") as f:
    for idx, uid in enumerate(humanizer_data.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

# Model training and eval - MLP with Statistical Features Classifier
## Training

In [None]:
%%time
##################################
# mlp classifier
model = mlp_classifier()

# compile (default Adam settings; the explicit-learning-rate variant below is disabled)
# opt = tf.keras.optimizers.Adam(learning_rate=.01)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# early stopping: stop after 10 epochs without a better val_accuracy and
# roll back to the best weights seen
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, 
                   patience=10,
                   restore_best_weights=True)

# The train/validation split comes from the shared split cell; the
# commented-out code below is the earlier per-cell split, kept for reference.
# get val
# X_train_, X_val, y_train_, y_val = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42)

# fit
model.fit(X_train_stat_, y_train_stat_, 
          validation_data=(X_val_stat_, y_val_stat_), 
          epochs=500, 
          batch_size=1024,
          verbose=1, 
          callbacks=[es])

# store the model (weights only; rebuild with mlp_classifier() to reload)
model.save_weights("./model/mlp_clf")

## Load Trained Model and Testing

In [None]:
%%time
# test with my data
# Rebuild the architecture and load the weights saved by the training cell.
model = mlp_classifier()
model.load_weights("./model/mlp_clf")

# get_ml_stat_data re-reads the pickle, computes the stat features, casts the
# labels to int and returns the unpickled dict, which rebinds `mydata`.
X_mydata_stat, y_mydata_stat, mydata = get_ml_stat_data("data/mydata.pkl")
# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_mydata_stat)
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_mydata_stat, 
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# One "<uid>\t<prediction>" line per user; features were built in
# mydata.keys() order, so row idx belongs to the idx-th key.
with open("output/mydata_stat_classifier.tsv", "w") as f:
    for idx, uid in enumerate(mydata.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

In [None]:
%%time
# test with humanizer data
model = mlp_classifier()
model.load_weights("./model/mlp_clf")

X_humanizer_stat = np.load("data/X_np_humanizer_stat_features.npy")
# Cast labels to int, as the image-classifier evaluation of this same file
# does, so they are comparable with the integer predictions.
y_humanizer_stat = np.array([int(x) for x in np.load("data/y_np_humanizer.npy")])

# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict(X_humanizer_stat)
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_humanizer_stat,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# Iterate `humanizer_data` rather than the global `data`, which only refers
# to the humanizer set after the image-feature cell has run in this kernel.
with open("output/humanizer_stat_classifier.tsv", "w") as f:
    for idx, uid in enumerate(humanizer_data.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

# Model training and eval - Statistical Features & Screennames Classifier
## Training

In [None]:
%%time
########################
# stat & screennames

model = sn_stat_classifier()
# compile (default Adam settings; the explicit-learning-rate variant below is disabled)
# opt = tf.keras.optimizers.Adam(learning_rate=.01)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# early stopping: stop after 10 epochs without a better val_accuracy and
# roll back to the best weights seen
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, 
                   patience=10,
                   restore_best_weights=True)

# The train/validation split comes from the shared split cell; the
# commented-out code below is the earlier per-cell split, kept for reference.
# # get val
# X_train_stat, X_val_stat, y_train_stat, y_val_stat = train_test_split(
#     X_train, y_train, test_size=0.2, random_state=42)
# # get val
# X_train_sn, X_val_sn, y_train_sn, y_val_sn = train_test_split(
#     screennames_encoded_padded, y_train, test_size=0.2, random_state=42)

# fit
# Input order [stat, screenname] matches the model's declared inputs.
model.fit([X_train_stat_, X_train_sn_], y_train_stat_, 
          validation_data=([X_val_stat_, X_val_sn_], y_val_stat_), 
          epochs=500, 
          batch_size=1024,
          verbose=1, 
          callbacks=[es])

# store the model (weights only; rebuild with sn_stat_classifier() to reload)
model.save_weights("./model/stat_sn_clf")

## Load Trained Model and Testing

In [None]:
%%time
# test with mydata
X_mydata_screennames = get_ml_screennames("data/mydata_screennames.pkl")
X_mydata_stat, y_mydata_stat, mydata = get_ml_stat_data("data/mydata.pkl")

# load model
model = sn_stat_classifier()
model.load_weights("./model/stat_sn_clf")

# Predict with the screen names loaded in this cell. The original passed
# `X_mydata_screennames_encoded_padded`, a name only defined by other cells,
# so this cell failed (or used stale data) unless those had been run first.
# Input order [stat, screenname] matches the model's declared inputs.
pred = model.predict([X_mydata_stat, X_mydata_screennames])
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_mydata_stat,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# One "<uid>\t<prediction>" line per user, in mydata key order.
with open("output/mydata_stat_sn_classifier.tsv", "w") as f:
    for idx, uid in enumerate(mydata.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

In [None]:
%%time
# test with humanizer data
X_humanizer_screennames = get_ml_screennames("data/humanizer_screennames.pkl")
X_humanizer_stat = np.load("data/X_np_humanizer_stat_features.npy")
# Cast labels to int, as the image-classifier evaluation of this same file
# does, so they are comparable with the integer predictions.
y_humanizer_stat = np.array([int(x) for x in np.load("data/y_np_humanizer.npy")])

# load model
model = sn_stat_classifier()
model.load_weights("./model/stat_sn_clf")

# Input order [stat, screenname] matches the model's declared inputs.
pred = model.predict([X_humanizer_stat, X_humanizer_screennames])
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_humanizer_stat,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"]))

# Iterate `humanizer_data` rather than the global `data`, which only refers
# to the humanizer set after the image-feature cell has run in this kernel.
with open("output/humanizer_stat_sn_classifier.tsv", "w") as f:
    for idx, uid in enumerate(humanizer_data.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

# Model training and eval - Proposed Classifier
## Training

In [None]:
%%time
#############################
# proposed model
model = img_sn_stat_classifier()
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
# early stopping
# NOTE(review): patience=0 stops at the first epoch whose val_accuracy does
# not improve; every other training cell uses patience=10. Confirm this is
# intentional.
es = EarlyStopping(monitor="val_accuracy", mode="max", verbose=1, 
                   patience=0,
                   restore_best_weights=True)

# fit
# Input order [stat, screenname, image] matches the model's declared inputs.
model.fit([X_train_stat_, X_train_sn_, X_train_img_], y_train_stat_, 
          validation_data=([X_val_stat_, X_val_sn_, X_val_img_], y_val_stat_), 
          epochs=500, 
          batch_size=1024,
          verbose=1, 
          callbacks=[es])

# store the model (weights only; rebuild with img_sn_stat_classifier() to reload)
model.save_weights("./model/img_stat_sn_clf")

## Load Trained Model and Testing

In [None]:
%%time
# test with mydata
# Load all three modalities for the mydata set; get_ml_stat_data also
# returns the unpickled dict, which rebinds `mydata`.
X_mydata_screennames_encoded_padded = get_ml_screennames("data/mydata_screennames.pkl")
X_mydata_stat, y_mydata_stat, mydata = get_ml_stat_data("data/mydata.pkl")
X_mydata_img = np.load("data/X_np_mydata_image_features.npy")

print("Loaing data finished...")

model = img_sn_stat_classifier()
# NOTE(review): the training cell saves to "./model/img_stat_sn_clf" but this
# cell loads "model/img_stat_sn_01/img_stat_sn_clf". Confirm which
# checkpoint is intended.
model.load_weights("model/img_stat_sn_01/img_stat_sn_clf")

# Input order [stat, screenname, image] matches the model's declared inputs.
# Threshold the sigmoid output at 0.5 to get 0/1 predictions of shape (n, 1).
pred = model.predict([X_mydata_stat, X_mydata_screennames_encoded_padded, X_mydata_img])
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_mydata_stat, 
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"], 
                            digits=2
                           ))

# One "<uid>\t<prediction>" line per user, in mydata key order.
with open("output/mydata_img_stat_sn_classifier.tsv", "w") as f:
    for idx, uid in enumerate(mydata.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

In [None]:
%%time
# test with humanizer data
X_humanizer_screennames_encoded_padded = get_ml_screennames("data/humanizer_screennames.pkl")
X_humanizer_stat = np.load("data/X_np_humanizer_stat_features.npy")
# Cast labels to int, as the image-classifier evaluation of this same file
# does, so they are comparable with the integer predictions.
y_humanizer_stat = np.array([int(x) for x in np.load("data/y_np_humanizer.npy")])
X_humanizer_img = np.load("data/X_np_humanizer_image_features.npy")

print("Loaing data finished...")

model = img_sn_stat_classifier()
# NOTE(review): the training cell saves to "./model/img_stat_sn_clf" but this
# cell loads "model/img_stat_sn_01/img_stat_sn_clf". Confirm which
# checkpoint is intended.
model.load_weights("model/img_stat_sn_01/img_stat_sn_clf")

# Input order [stat, screenname, image] matches the model's declared inputs.
pred = model.predict([X_humanizer_stat, X_humanizer_screennames_encoded_padded, X_humanizer_img])
pred = np.array(pred>=.5).astype(int)

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(y_humanizer_stat,
                            pred,
                            labels = [1, 0],
                            target_names=["ind","org"],
                            digits=2
                           ))

# Iterate `humanizer_data` rather than the global `data`, which only refers
# to the humanizer set after the image-feature cell has run in this kernel.
with open("output/humanizer_img_stat_sn_classifier.tsv", "w") as f:
    for idx, uid in enumerate(humanizer_data.keys()):
        f.write("{}\t{}\n".format(uid, pred[idx][0]))

# Inlined test for mydata

In [None]:
# Score one prediction file on the subset of mydata users that also appear
# in the humanizer output, so both systems are compared on the same users.
# NOTE(review): no cell in this notebook writes this file; it is presumably
# produced elsewhere - confirm the path.
pred_file = "output/mydata_RF_stat_classifier.tsv"

# NOTE(review): pickle.load can execute arbitrary code - trusted files only.
with open("data/mydata.pkl", "rb") as f:
    mydata = pickle.load(f)

# mydata inlined evaluation
# get humanizer output
houtput = pd.read_csv("output/mydata_humanizer.tsv",
                      delimiter="\t",
                      header=None,
                      names=["user_id", "label"])
mydata_uids = houtput["user_id"].values

sn_classifier_output = pd.read_csv(pred_file,
                      delimiter="\t",
                      header=None,
                      names=["user_id", "label"])
# .copy() makes the filtered frame independent, so adding a column below
# does not write into a slice of the original (SettingWithCopyWarning).
sn_classifier_output = sn_classifier_output[
    sn_classifier_output["user_id"].isin(mydata_uids)].copy()

# Look up the ground-truth label of every remaining user; this assumes the
# user ids parsed from the TSV have the same type as the mydata keys.
sn_classifier_output["ground_truth"] = \
    [int(mydata[x]["label"]) for x in sn_classifier_output["user_id"].values]

from sklearn.metrics import classification_report
# Label 1 is reported as "ind" and label 0 as "org".
print(classification_report(sn_classifier_output["ground_truth"].values,
                            sn_classifier_output["label"].values,
                            labels = [1, 0],
                            target_names=["ind","org"]))