## Import libraries

In [None]:
import os
import sys

# Location of the sem-covid project checkout that the project imports rely on.
PROJECT_ROOT = "/home/jovyan/work/sem-covid/"

sys.path.append(PROJECT_ROOT)
# Drop duplicate entries while keeping the first occurrence of each in place.
# Going through a set would scramble the lookup order, which can let a later
# entry shadow an earlier one when two locations provide the same module name.
sys.path = list(dict.fromkeys(sys.path))

# Run from the project root so that relative paths used by the project resolve.
os.chdir(PROJECT_ROOT)
from sem_covid.services.store_registry import store_registry
from sem_covid import config
from sem_covid.services.model_registry import embedding_registry

import pandas as pd
import sklearn.metrics
import sklearn.model_selection
import autosklearn.classification

from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier

## Define constants

In [None]:
# Name of the index that holds the unified datasets.
UNIFIED_DATASET: str = "ds_unified_datasets"

In [None]:
# Index store obtained from the project's store registry (presumably
# Elasticsearch-backed, going by its name); the datasets are loaded through it.
es_store = store_registry.es_index_store()

## Load data

In [None]:
# PWDB dataset, read from the index named in the project config.
pwdb_df = es_store.get_dataframe(index_name=config.PWDB_ELASTIC_SEARCH_INDEX_NAME)
# Unified dataset, read from the index named by UNIFIED_DATASET.
unified_df = es_store.get_dataframe(index_name=UNIFIED_DATASET)
# Sentence-embedding model from the project's model registry
# (presumably the Universal Sentence Encoder, going by its name).
emb_model = embedding_registry.sent2vec_universal_sent_encoding()

## Compute embeddings based on textual fields

In [None]:
# Restrict the unified dataset to the documents that came from PWDB, as an
# independent frame that the new columns can be added to.
unified_df = unified_df.loc[unified_df['Document_source'] == 'pwdb'].copy()
# A document's text is its title and its content joined by a single space.
unified_df['text'] = unified_df[['Title', 'Content']].apply(' '.join, axis=1)
# Store the encoder output for every document next to the text it came from.
unified_df['emb'] = emb_model.encode(unified_df['text'].values)


In [None]:
# Target columns of the PWDB dataset, grouped by how they have to be encoded:
# BINARY_COLUMNS are used as they are, STRING_COLUMNS hold comma-separated
# labels, and LIST_COLUMNS hold the labels of a row as a list.
BINARY_COLUMNS = ['businesses', 'citizens', 'workers']
STRING_COLUMNS = ['category', 'subcategory', 'type_of_measure']
LIST_COLUMNS = ['target_groups', 'actors', 'funding']
NON_BINARY_COLUMNS = STRING_COLUMNS + LIST_COLUMNS
TRAIN_COLUMNS = BINARY_COLUMNS + NON_BINARY_COLUMNS


def _strip_items(items):
    """Strip whitespace from each element of a list; pass anything else through.

    Non-list values (for example the NaN that `.str.split` leaves for a
    missing cell) are returned unchanged instead of raising.
    """
    if isinstance(items, list):
        return [item.strip() for item in items]
    return items


def _multi_hot(label_lists, prefix):
    """Return one indicator column per distinct label found in *label_lists*.

    Each row of *label_lists* holds the labels of one document. The labels are
    exploded to one per row, one-hot encoded, and then collapsed back to one
    row per document with `max`, so a label is flagged no matter at which
    position of the list it appears. Encoding the list positions separately
    would instead create several columns with the same name for one label.
    """
    dummies = pd.get_dummies(label_lists.explode(), prefix=prefix)
    # sort=False keeps the documents in their original order.
    return dummies.groupby(level=0, sort=False).max()


# Bring every non-binary column into the same "list of labels per row" shape.
transformed_df = pd.DataFrame()
for column in STRING_COLUMNS:
    transformed_df[column] = pwdb_df[column].str.split(',').apply(_strip_items)

for column in LIST_COLUMNS:
    transformed_df[column] = pwdb_df[column]

# Encode each column on its own and join the indicator blocks side by side.
tmp_transformed_df = pd.concat(
    [_multi_hot(transformed_df[column], column) for column in NON_BINARY_COLUMNS],
    axis=1,
)

# Final target matrix: label indicators followed by the binary columns.
transformed_df = pd.concat([tmp_transformed_df, pwdb_df[BINARY_COLUMNS]], axis=1)

In [None]:
# Single-column frame holding a copy of the embeddings, indexed like unified_df.
tmp_df = unified_df['emb'].copy().to_frame(name='emb')

In [None]:
# The encoded labels are the prediction targets.
y_df = transformed_df
# Spread every embedding over its own columns to obtain the feature matrix.
x_df = tmp_df['emb'].apply(pd.Series)

In [None]:
z1 = set(x_df.index)
z2 = set(y_df.index)
# Index labels present in both frames, each listed once, in the order in which
# they appear in x_df. Iterating a set instead gives an order that depends on
# hashing (and varies between interpreter runs for string labels), which would
# reshuffle the rows before the split and defeat its fixed random_state.
z3 = [label for label in dict.fromkeys(x_df.index) if label in z2]

In [None]:
# Keep only the rows shared by features and targets, in the same order in both.
x_df, y_df = x_df.loc[z3], y_df.loc[z3]

In [None]:
# Hold out part of the aligned rows for evaluation; random_state=1 fixes the shuffle seed.
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x_df,y_df,random_state=1)

In [None]:
# Base estimator for the multi-output wrapper; random_state=1 fixes its seed.
forest = RandomForestClassifier(random_state=1)

In [None]:
# Wrap the forest so that one classifier is fitted per target column;
# n_jobs=-1 asks scikit-learn to use all available processors.
multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

In [None]:
# Train on the training split, then predict the targets of the held-out rows.
multi_target_forest.fit(x_train, y_train)
predictions = multi_target_forest.predict(x_test)

In [None]:
# Score the random-forest predictions against the held-out targets.
# NOTE(review): with several target columns scikit-learn documents this as
# subset accuracy (every column of a row must match) — confirm that is intended.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))

In [None]:
automl = autosklearn.classification.AutoSklearnClassifier(
    # Time budgets (seconds, per the auto-sklearn API): whole search first,
    # then the cap for a single model evaluation.
    time_left_for_this_task=600,
    per_run_time_limit=300,
    # Resources: worker count, and no memory cap.
    n_jobs=16,
    memory_limit=None,
    ensemble_size=10,
    # Flags for fast test
    smac_scenario_args={'runcount_limit': 1},
    initial_configurations_via_metalearning=0,
)
# Run the search on the training split.
automl.fit(x_train, y_train)

In [None]:
# Print the leaderboard reported by the fitted auto-sklearn run.
print(automl.leaderboard())

In [None]:
# Print the model description returned by show_models().
print(automl.show_models())

In [None]:
# Print the run summary returned by sprint_statistics().
print(automl.sprint_statistics())

In [None]:
# Predict the held-out rows with the fitted auto-sklearn model
# (this rebinds `predictions`, replacing the random-forest output).
predictions = automl.predict(x_test)
# Score those predictions against the held-out targets.
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, predictions))
