In [3]:
import sys
sys.path.insert(0, '/home/ubuntu/praxi')

from multiprocessing import Lock

import logging
import logging.config

import os
from os import listdir
from os.path import isfile, join

from pathlib import Path
import random
import time
import yaml
import pickle
import copy
import argparse

from sklearn.base import BaseEstimator
from tqdm import tqdm

import numpy as np
from numpy import savetxt
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

from hybrid import Hybrid

# Directory that get_changeset() globs for changeset files; the leading '~'
# is expanded to the current user's home directory.
CHANGESET_ROOT = Path('~/praxi/caches/changesets/').expanduser()

In [4]:
def get_free_filename(stub, directory, suffix=''):
    """Reserve and return the first unused ``<directory>/<stub>-<n><suffix>`` path.

    Candidates are tried with ``n`` counting up from 0 until one does not
    exist on disk.  What gets created depends on ``suffix``:

    * ``'.p'``      -- nothing is created (only a message is printed);
    * other suffix  -- an empty file is created;
    * empty suffix  -- a directory is created.

    Args:
        stub: Base name placed before the counter.
        directory: Parent directory (converted with ``str``).
        suffix: Optional extension appended after the counter.

    Returns:
        The chosen path as a string.
    """
    def _candidate(index):
        # Build the path string for a given counter value.
        return '{}/{}-{}{}'.format(str(directory), stub, index, suffix)

    # Phase 1: skip over every name that is already taken.
    index = 0
    while Path(_candidate(index)).exists():
        print("file exists")
        index += 1

    # Phase 2: claim the free name according to the suffix.
    free_name = _candidate(index)
    print("no file")
    if suffix == '.p':
        print("will create pickle file")
    elif suffix:
        Path(free_name).touch()
    else:
        Path(free_name).mkdir()
    return free_name

def parse_csids(csids, multilabel=False, iterative=False):
    """Load every changeset in ``csids`` and split it into features and labels.

    Args:
        csids: Iterable of changeset ids, each passed to ``get_changeset``.
        multilabel: When true, each label entry is the changeset's 'labels'
            value if present, otherwise its single 'label' wrapped in a list.
            When false, each entry is the changeset's 'label' value.
        iterative: Forwarded unchanged to ``get_changeset``.

    Returns:
        ``(features, labels)`` -- two lists in the same order as ``csids``;
        ``features`` holds each changeset's 'changes' value.
    """
    features, labels = [], []
    for csid in tqdm(csids):
        record = get_changeset(csid, iterative=iterative)
        if not multilabel:
            label = record['label']
        elif 'labels' in record:
            label = record['labels']
        else:
            # Single-label changeset in multilabel mode: wrap it in a list.
            label = [record['label']]
        labels.append(label)
        features.append(record['changes'])
    return features, labels

def get_changeset(csid, iterative=False):
    """Load the single changeset file under ``CHANGESET_ROOT`` matching ``csid``.

    The file is located with a glob built from ``csid``: ``*.<csid>.yaml``
    when ``iterative`` is true, otherwise ``*.<csid>.[!y]*`` (an extension
    whose first character is not ``y``).

    Args:
        csid: Changeset id (int or str); it is only ever string-formatted.
        iterative: Select the ``.yaml`` variant of the changeset file.

    Returns:
        The parsed changeset mapping, guaranteed to contain 'changes' and at
        least one of 'label' / 'labels'.

    Raises:
        IOError: if no file yields a changeset, if more than one file
            matches, or if the parsed changeset lacks the required keys.
    """
    changeset = None
    if str(csid) in {'5', '6', '7'}:
        # Dirty fix for finger, autotrace
        # NOTE(review): ids 5, 6 and 7 all use the same '.5' pattern (any
        # name whose character before '.5' is not '1' or '6') -- confirm
        # this is intended for ids 6 and 7.
        globstr = '*[!16].5'
    else:
        globstr = '*.{}'.format(csid)
    globstr += '.yaml' if iterative else '.[!y]*'

    for csfile in CHANGESET_ROOT.glob(globstr):
        if changeset is not None:
            raise IOError(
                "Too many changesets match the csid {}, globstr {}".format(
                    csid, globstr))
        with csfile.open('r') as f:
            # yaml.load() without an explicit Loader is deprecated and is a
            # TypeError on PyYAML >= 6.  safe_load parses plain YAML only and
            # will reject files carrying Python-object tags.
            changeset = yaml.safe_load(f)
    if changeset is None:
        raise IOError("No changesets match the csid {}".format(csid))
    # A non-mapping document (list, scalar, ...) is malformed as well.
    if (not isinstance(changeset, dict)
            or 'changes' not in changeset
            or ('label' not in changeset and 'labels' not in changeset)):
        # %s rather than %d: csid may be passed as a string.
        logging.error("Malformed changeset, id: %s, changeset: %s",
                      csid, csfile)
        raise IOError("Couldn't read changeset")
    return changeset



In [9]:
# GET ITERATIVE CHUNKS
# Unpickle the iterative chunk list from disk.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on
# trusted files.
with Path('/home/ubuntu/praxi/changeset_sets/').expanduser().joinpath(
        'iterative_chunks.p').open('rb') as f:
    it_chunks = pickle.load(f)

# Size and type of the outer container, one value per line.
print(len(it_chunks), type(it_chunks), sep='\n')

# Size, type and full contents of the first chunk, one value per line.
print(len(it_chunks[0]), type(it_chunks[0]), it_chunks[0], sep='\n')

4
<class 'list'>
3
<class 'list'>
[[20006, 30922, 23869, 20095, 28710, 37927, 21525, 18464, 18274, 33860, 28580, 36011, 37491, 40988, 27809, 35662, 41007, 33390, 20548, 26362, 34346, 19923, 41851, 28098, 29673, 29236, 19635, 19358, 35675, 41618, 24317, 36559, 33557, 29704, 31830, 35476, 32828, 22543, 33389, 38928, 27538, 29805, 35811, 29607, 31108, 26101, 38581, 41341, 22290, 38214, 35094, 19666, 35513, 22492, 41758, 27635, 28785, 40088, 30843, 25298, 38500, 30602, 21724, 27103, 107206, 19385, 32760, 26674, 28165, 35816, 31852, 37459, 35524, 22166, 19433, 22967, 22478, 32823, 39022, 40788, 21059, 40250, 27638, 25277, 33280, 27155, 37592, 107046, 22279, 18020, 38463, 24577, 23434, 19314, 21667, 40280, 25437, 27323, 30246, 18111, 24555, 30739, 18886, 107140, 24739, 19519, 35422, 30063, 34526, 20804, 37635, 20998, 38290, 19624, 29826, 39216, 107360, 107364, 27893, 39686, 106901, 33946, 29632, 31770, 18617, 24402, 26655, 38032, 17958, 31781, 27151, 106994, 23136, 23132, 39397, 39018, 21212

In [8]:
# Show the container type of the unpickled `it_chunks` object.
print(type(it_chunks))

<class 'list'>
