In [1]:
%load_ext autoreload
%autoreload 2

# Dataset Name

In [2]:
DATASET_NAME = "Criminal Forum" 

# [All] Load Basic Libraries

In [3]:
import os
import sys
import json
import logging
import numpy as np
import csv
from texas.store import DataStore
from texas.api.embeddings import learn_embedding
from texas.api.projection import learn_projection
from texas.clients import get_instance
from shutil import copyfile
csv.field_size_limit(sys.maxsize)
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
DATA_FOLDER = "/data"

# [All] Load Dataset Specification

In [8]:
dataset_scripts = "{data_folder}/{dataset}/scripts".format(dataset=DATASET_NAME, data_folder=DATA_FOLDER)
if dataset_scripts not in sys.path:
    sys.path.append(dataset_scripts)

from import_info import get_spec
SPEC, data_dunctions = get_spec()

### Check

In [9]:
print(json.dumps(SPEC, indent=4))

{
    "data_source": {
        "connection": {
            "type": "Folder",
            "connection": {
                "path": "/data/Criminal Forum/raw/",
                "file_extension": "txt",
                "id_pattern": "0-initiator\\K[0-9]+",
                "folder_field": "Forum"
            }
        },
        "idField": "id"
    },
    "data_destination": {
        "connection": {
            "type": "ElasticSearch",
            "name": "Criminal Forum",
            "connection": {
                "client": {
                    "hosts": [
                        {
                            "host": "elasticsearch",
                            "port": "9200",
                            "path": ""
                        }
                    ]
                },
                "index": "criminal-forum",
                "type": "document"
            }
        },
        "skip_creation": false,
        "delete_if_exists": true,
        "text_fields": [
            "tex

# [Import] Load Origin DataSource

In [10]:
data_source_info = SPEC["data_source"]
data_source_connection = data_source_info["connection"]
data_source = get_instance(data_source_connection["type"],data_source_connection["connection"])

In [11]:
def get_source_data():
    data_source_gen = data_source.scan(idField=data_source_info.get("idField", None))    
    if "transform" in data_dunctions:
        data_source_gen = data_dunctions["transform"](data_source_gen)
    return data_source_gen

### Check

In [12]:
for _id, d in get_source_data():
    print("ID: ", _id)
    print(json.dumps(d, indent=4))
    break

ID:  4087116
{
    "id": "4087116",
    "Forum": "Server Stress Testing",
    "text": "Buying DSTAT SERVICE\n\n\n\t\t\t\t\tI need this one done quick. PM me so we can get it going.\n\t\t\t\t"
}


# [All] Instantiate Dataset Object

In [13]:
dataset_info = SPEC["data_destination"]
dataset_connection = dataset_info["connection"]
dataset = get_instance(dataset_connection["type"], dataset_connection["connection"])

# [Import] (Re)Create Index

In [14]:
if dataset_info.get("delete_if_exists", False):
    dataset.delete_repository()



NotFoundError: TransportError(404, 'index_not_found_exception', 'no such index')

In [15]:
if not dataset_info.get("skip_creation", False):
    text_fields = dataset_info["text_fields"]
    other_fields = dataset_info.get("other_fields")
    text_fields = [{"ID": f, "Type": "TEXT"} for f in text_fields]
    dataset.create_repository({"Fields":text_fields + other_fields}, settings=dataset_info.get("settings", None))

2018-02-27 22:44:04,319 INFO PUT http://elasticsearch:9200/criminal-forum [status:200 request:1.314s]


# [Import]

In [16]:
dataset.write(get_source_data())
print("done")

2018-02-27 22:44:11,027 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.922s]
2018-02-27 22:44:14,303 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.394s]
2018-02-27 22:44:17,022 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.223s]
2018-02-27 22:44:19,594 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.222s]
2018-02-27 22:44:22,594 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.230s]
2018-02-27 22:44:27,327 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.450s]
2018-02-27 22:44:30,615 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.269s]
2018-02-27 22:44:33,609 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.328s]
2018-02-27 22:44:36,580 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.355s]
2018-02-27 22:44:39,066 INFO POST http://elasticsearch:9200/_bulk [status:200 request:0.309s]
2018-02-27 22:44:42,726 INFO POST http://elasticsearch:9200/

## Check Import

In [17]:
for d in dataset.scan():
    print(d)
    break

2018-02-27 22:46:28,225 INFO GET http://elasticsearch:9200/criminal-forum/document/_search?scroll=5m&size=1000 [status:200 request:0.255s]
2018-02-27 22:46:28,253 INFO DELETE http://elasticsearch:9200/_search/scroll [status:200 request:0.015s]


('4164713', {'id': '4164713', 'Forum': 'Premium Sellers Section', 'text': "Skype API's $5.00\n\n\n\t\t\t\t\tOkay so I have a Skype system coming along and I was about to sell these to friends but got out of hand so yeah :/\n\nLike the title says:\n\n$5.00 per lifetime \n\n99.9% downtime.\n\n\n\nMy resolver uses it.\n\n http://www.eResolve.info \n\n\n\nContact me on skype for business.\n\nSkype: Live:Mrflippers\n\t\t\t\t"})


# [Import] Add dataset to TexasAPI (Only First Time)

In [18]:
texas_connection = SPEC["texas"]["connection"]
texas_client = get_instance(texas_connection["type"], texas_connection["connection"])

texas_query = """
    mutation createDataset($config:JSON!, $name: String!) {
      System {
        createDataset(dataset: {
          Name: $name,
          Provider: "ElasticSearch"
          Config: $config
        }){
          ID
          Name
        }   
      }
    }"""
print({
    "name": dataset_connection["name"],
    "config": dataset_connection["connection"]
})

http://texas-api:4200/api
{'name': 'Criminal Forum', 'config': {'client': {'hosts': [{'host': 'elasticsearch', 'port': '9200', 'path': ''}]}, 'index': 'criminal-forum', 'type': 'document'}}


In [19]:
texas_dataset = texas_client.query({"query":texas_query, "variables": {
    "name": dataset_connection["name"],
    "config": dataset_connection["connection"]
}})
texa_ID = texas_dataset["data"]["System"]["createDataset"]["ID"]
print("ID: ",texa_ID)

ID:  1
