In [7]:
# This script generates deep learning training data
# Author: Nikita Mishra
# run: python Deep_learning_training_data.py

# TODO: remove repeated solutions and change start and end position accordingly
import sys
import json
import pandas as pd
import numpy as np
import os
from pathlib import Path
from importlib.machinery import SourceFileLoader
import api_client
from api_client import *
import random
from gensim import models


#api_client = SourceFileLoader("module.name", "../api_client.py").load_module()

DATA_DIR = 'data/'
SOLUTIONS_FILE = ''.join([DATA_DIR, 'solutions_org'])
QUERIES_FILE   = ''.join([DATA_DIR, 'queries_org'])
CAUCUSES_FILE  = ''.join([DATA_DIR, 'caucuses_org'])

MAX_OUTPUT = 1000000

MAX_LENGTH = 500
orgs_api = ApiClient('http://localhost:3002', '/v1/orgs')
def create_dir(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)
    return

def read(filename):
    print('Loading from file ', filename, '...')
    with open(filename) as data_file:
        return data_file.read().splitlines()
    
def read_padded(filename, MAX_LENGTH):
    print('Loading from file ', filename, '... and adding padding.')     
    return [pad(item, MAX_LENGTH) for item in read(filename) ]

def pad(sentence,maxlength):
    LEN = len(sentence.split(' '))
    if LEN>=maxlength:
        return sentence[:maxlength]
    else:
        return ' '.join([sentence]+['BUFFER_PAD']*(maxlength-LEN))
    
def length_distribution( solutions_caucus):
    return [len(u.split(' ')) for u in solutions_caucus]


def print_dist(x):
    print('min:',min(x),'mean:',sum(x)/len(x),'max:',max(x))
    
def classify(LOGITS_positive, LOGITS_negative):
    left_point = min(LOGITS_positive+ LOGITS_negative)
    right_point = max(LOGITS_positive+ LOGITS_negative)
    errors=[]
    num_steps = 100
    gap = int(np.ceil(left_point-right_point/(num_steps - 1)))
    if gap <=0:
        print('Left should be smaller than right')
    for threshold in np.arange(left_point, right_point, gap):
        error_positive = sum(i > threshold for i in LOGITS_positive) 
        error_negative = sum(i < threshold for i in LOGITS_negative) 
        error = error_positive + error_negative
        errors.append(error)
        #print(threshold, error, error_positive, error_negative)
    print('--Summary')
    print('Positive scores_distribution:')
    print_dist(LOGITS_positive)
    print('Negative scores_distribution:')
    print_dist(LOGITS_negative)
    print('Num errors:', min(errors), ', Percent errors:',min(errors)/(len(LOGITS_negative)+len(LOGITS_positive)))
    return errors

In [8]:
# read caucus positive negative

caucus_positive = read('/Users/nikita/nmt_data/Upwork/scoring_results/chunked_caucus_positive_sources.txt')
caucus_negative = read('/Users/nikita/nmt_data/Upwork/scoring_results/chunked_caucus_negative_sources.txt')

Loading from file  /Users/nikita/nmt_data/Upwork/scoring_results/chunked_caucus_positive_sources.txt ...
Loading from file  /Users/nikita/nmt_data/Upwork/scoring_results/chunked_caucus_negative_sources.txt ...


In [9]:
caucus_positive_scores = [float(item.split(" ")[2]) for item in caucus_positive]
caucus_negative_scores = [float(item.split(" ")[2]) for item in caucus_negative]

In [14]:
errors = classify(caucus_positive_scores, caucus_negative_scores)

--Summary
Positive scores_distribution:
min: 50.9601 mean: 416.2241589001449 max: 2514.77
Negative scores_distribution:
min: 50.3164 mean: 434.5955019536907 max: 4764.79
Num errors: 1287 , Percent errors: 0.465629522431


In [14]:
# Bagging
from joblib import Parallel, delayed
import multiprocessing
from statistics import mean
from statistics import stdev
import math
def classify_sample(LOGITS_positive, LOGITS_negative):    
        ids = random.sample(range(len(LOGITS_positive)),  math.ceil(len(LOGITS_positive)*0.7) ) 
        LOGITS_positive = [LOGITS_positive[item] for item in ids ]
        LOGITS_negative = [LOGITS_negative[item] for item in ids ]

        left_point = min(LOGITS_positive+ LOGITS_negative)
        right_point = max(LOGITS_positive+ LOGITS_negative)
        errors=[]
        num_steps = 100
        gap = int(np.ceil(abs(left_point-right_point)/(num_steps - 1)))
        if gap <=0:
            print('Left should be smaller than right')
            print('Left:', left_point, ', Right:', right_point)
            return
        for threshold in np.arange(left_point, right_point, gap):
            error_positive = sum(i > threshold for i in LOGITS_positive) 
            error_negative = sum(i < threshold for i in LOGITS_negative) 
            error = error_positive + error_negative
            errors.append(error)
            #print(threshold, error, error_positive, error_negative)
        return min(errors)/(len(LOGITS_negative)+len(LOGITS_positive))
def classify_bagged(LOGITS_positive, LOGITS_negative):
    
    num_cores = multiprocessing.cpu_count()
    results = Parallel(n_jobs=num_cores)(delayed(classify_sample)(LOGITS_positive, LOGITS_negative) for i in range(10))
    print(mean(results),stdev(results))

In [15]:
errors = classify_bagged(caucus_positive_scores, caucus_negative_scores)

0.471332644628 0.003920557003008166
