In [1]:
from __future__ import division
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from pgmpy.estimators import BdeuScore, K2Score, BicScore
from pgmpy.models import BayesianModel
from sklearn import preprocessing
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MatrixFactorization, IterativeSVD, BiScaler
from sklearn import mixture
from sklearn.cluster import DBSCAN
from sklearn import metrics
from catboost import CatBoostClassifier, CatBoost, CatBoostRegressor
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
import warnings
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from pomegranate import *

# Install a blanket 'ignore' filter: from this cell on, warnings of every
# category are suppressed (including deprecation notices from the libraries
# imported above).
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Create simulation datasets

In [2]:
def create_sim_dataset(test, validation, n_samples, n_rows,  n_sets):
    """Build simulated (problem, answer) pairs by masking validation rows.

    The non-missing pattern of ``test`` is used as a boolean mask: every
    chunk of validation rows is re-indexed from 0 and then masked with that
    pattern, so cells that are missing in ``test`` become NaN in the
    "problem" frame while the "answer" frame keeps the original values.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame whose NaN layout defines which cells are hidden.
    validation : pandas.DataFrame
        Complete rows to sample from; its columns are reindexed to the
        column order of ``test``.
    n_samples : int
        Number of validation rows drawn (without replacement) per set.
    n_rows : int
        Divisor used to derive the number of chunks
        (``n_samples / n_rows`` is handed to ``np.array_split``).
    n_sets : int
        Number of independent simulated sets to generate.

    Returns
    -------
    list of dict
        One dict per set with keys ``'problem'`` (masked frame) and
        ``'answer'`` (unmasked frame); both are concatenations of the
        per-chunk frames, each chunk carrying its own 0-based index.
    """
    observed_mask = test.notna()
    validation = validation.reindex(observed_mask.columns, axis=1)

    def _simulate_one_set():
        # Draw the rows for this set, then cut their labels into chunks.
        sampled = validation.sample(n=n_samples, replace=False)
        n_chunks = n_samples / n_rows
        label_groups = np.array_split(sampled.index, n_chunks)

        # Each chunk restarts its index at 0 so it lines up with the mask.
        answers = [
            validation.loc[labels].reset_index(drop=True)
            for labels in label_groups
        ]
        problems = [chunk[observed_mask] for chunk in answers]

        return {
            'problem': pd.concat(problems),
            'answer': pd.concat(answers),
        }

    return [_simulate_one_set() for _ in range(n_sets)]
        

## Define the scoring metrics


In [3]:
def calculate_numerical_score(pred_df, true_df, B,  sj=1):
    """Score numeric predictions with a Gaussian-shaped kernel.

    For every cell of the ``numeric_predictor`` columns whose difference
    ``pred - true`` is not NaN, the contribution is
    ``exp(-((pred - true) / sj) ** 2)``; the contributions are summed and
    multiplied by ``B``.

    Parameters
    ----------
    pred_df, true_df : pandas.DataFrame
        Frames containing the ``numeric_predictor`` columns (a module-level
        name defined elsewhere in the notebook). Their selected values must
        have the same shape.
    B : float
        Weight applied to the summed kernel values.
    sj : scalar or array-like, default 1
        Scale of the kernel. A scalar applies to every cell; a vector with
        one entry per numeric column is broadcast across the rows.

    Returns
    -------
    float
        ``B`` times the sum of the kernel values; ``B * 0.0`` when there is
        no comparable cell (including zero-row input).
    """
    diff = pred_df[numeric_predictor].values - true_df[numeric_predictor].values

    # Scale while the array is still 2-D so that a per-column ``sj`` lines up
    # with its column; the original flattened first, which made any vector
    # ``sj`` fail to broadcast.
    scaled = diff / sj

    # Cells where either side is missing yield NaN and are left out.
    comparable = ~np.isnan(diff)

    # Boolean indexing flattens in row-major order, and also works for a
    # zero-row input (np.concatenate on an empty 2-D array raises).
    return B * np.sum(np.exp(-(scaled[comparable] ** 2)))

def calculate_categorical_score(pred_df, true_df, C):
    """Score categorical predictions by counting exact matches.

    Compares the ``categorical_predictor`` columns (a module-level name
    defined elsewhere in the notebook) of both frames cell by cell and
    returns ``C`` times the number of equal cells.
    """
    predicted = pred_df[categorical_predictor].values
    expected = true_df[categorical_predictor].values
    n_matches = np.sum(predicted == expected)
    return C * n_matches

def calculate_total_score(pred_df, true_df, sj, B, C):
    """Return the numeric score plus the categorical score.

    ``B`` and ``sj`` are forwarded to ``calculate_numerical_score`` and
    ``C`` to ``calculate_categorical_score``; the two results are added.
    """
    numeric_part = calculate_numerical_score(pred_df, true_df, B=B, sj=sj)
    categorical_part = calculate_categorical_score(pred_df, true_df, C=C)
    return numeric_part + categorical_part

In [4]:
def reverse_dummy(df_dummies):
    """Collapse one-hot ("dummy") columns back into categorical columns.

    A column whose label is a string containing ``"_"`` is treated as a
    dummy column named ``<prefix>_<level>`` (split on the first underscore).
    For each prefix the level with the largest value in a row becomes that
    row's category. All other columns are copied through unchanged and
    placed after the reconstructed categorical columns.

    Parameters
    ----------
    df_dummies : pandas.DataFrame
        Frame holding dummy columns and, optionally, pass-through columns.

    Returns
    -------
    pandas.DataFrame
        One categorical column per prefix (in order of first appearance)
        followed by the pass-through columns, on a fresh 0..n-1 index.

    Notes
    -----
    * Any string label containing an underscore is interpreted as a dummy
      column, so pass-through columns must not contain ``"_"``.
    * Because ``argmax`` is used, a row whose dummy values are all equal
      (e.g. all zeros) is decoded as the first level of that prefix.
    """
    # Plain dicts with setdefault keep this block free of names that the
    # notebook never imports (the original relied on ``defaultdict``).
    positions_by_prefix = {}
    levels_by_prefix = {}
    passthrough_positions = []

    for position, label in enumerate(df_dummies.columns):
        # Non-string labels cannot carry a prefix; keep them as they are.
        if isinstance(label, str) and "_" in label:
            prefix, level = label.split("_", 1)
            positions_by_prefix.setdefault(prefix, []).append(position)
            levels_by_prefix.setdefault(prefix, []).append(level)
        else:
            passthrough_positions.append(position)

    df = pd.DataFrame({
        prefix: pd.Categorical.from_codes(
            np.argmax(df_dummies.iloc[:, positions].values, axis=1),
            levels_by_prefix[prefix])
        for prefix, positions in positions_by_prefix.items()
    })

    # Copy by position (``.values``): assigning the Series directly would
    # align on the index and produce NaNs whenever ``df_dummies`` does not
    # use the default 0..n-1 index.
    for position in passthrough_positions:
        df[df_dummies.columns[position]] = df_dummies.iloc[:, position].values

    return df

In [5]:
def get_cat_mapping_set(df):
    """Fit one ``LabelEncoder`` per column of ``df``.

    Returns a dict that maps each column name to the encoder that was
    fitted on that column's values.
    """
    encoders = {name: preprocessing.LabelEncoder() for name in df.columns}
    for name, encoder in encoders.items():
        encoder.fit(df[name])
    return encoders

def encode_df(df, cat_mapping_set):
    """Encode every column of ``df`` with its fitted encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all encoded.
    cat_mapping_set : dict
        Maps each column name of ``df`` to an object exposing
        ``transform(values)``.

    Returns
    -------
    pandas.DataFrame
        The encoded values with the same column labels as ``df`` and a
        fresh 0..n-1 index (the index of ``df`` is not carried over).
    """
    col_names = df.columns
    encoded_columns = [
        cat_mapping_set[col_name].transform(df[col_name])
        for col_name in col_names
    ]

    if not encoded_columns:
        # Nothing to encode: keep the row count instead of the one-row
        # frame the matrix-based construction used to produce.
        return pd.DataFrame(index=pd.RangeIndex(len(df)), columns=col_names)

    # column_stack puts each encoded vector in its own column; this replaces
    # the transpose + np.matrix round trip (np.matrix is discouraged by
    # NumPy and emits a PendingDeprecationWarning).
    return pd.DataFrame(np.column_stack(encoded_columns), columns=col_names)

def decode_df(df, cat_col_names, cat_mapping_set):
    """Decode encoded columns back to their original labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the encoded values.
    cat_col_names : sequence
        Names of the columns of ``df`` to decode, in output order.
    cat_mapping_set : dict
        Maps each name in ``cat_col_names`` to an object exposing
        ``inverse_transform(values)``.

    Returns
    -------
    pandas.DataFrame
        The decoded values, one column per entry of ``cat_col_names``, on
        a fresh 0..n-1 index.

    Notes
    -----
    The decoded columns are stacked into a single array, so columns of
    different dtypes are promoted to a common dtype by NumPy, as before.
    """
    decoded_columns = [
        cat_mapping_set[col_name].inverse_transform(df[col_name])
        for col_name in cat_col_names
    ]

    if not decoded_columns:
        # Nothing to decode: return an empty frame with the same row count.
        return pd.DataFrame(index=pd.RangeIndex(len(df)), columns=cat_col_names)

    # The previous version nested each column in an extra list, built a 3-D
    # array and relied on np.matrix to squeeze it back to 2-D. With a single
    # column that squeeze produced a (1, n) matrix and the DataFrame
    # constructor raised; stacking columns directly always gives (n, k).
    return pd.DataFrame(np.column_stack(decoded_columns), columns=cat_col_names)

In [6]:
def get_problem(type_q, row):
    """Return a copy of ``row`` as an array with selected positions blanked.

    Every position listed in ``type_q`` is replaced by ``None``; all other
    positions keep ``row[position]``. The result is built with ``np.array``.
    """
    masked = [
        None if position in type_q else row[position]
        for position in range(len(row))
    ]
    return np.array(masked)

## Download files from S3

In [7]:
def download_files(filenames, bucket_name='kaj011', key_prefix='samsung_challenge',
                   dest_dir='../data'):
    """Download a list of objects from an S3 bucket to a local directory.

    Each ``filename`` is fetched from ``<key_prefix>/<filename>`` in
    ``bucket_name`` and written to ``<dest_dir>/<filename>``.

    Parameters
    ----------
    filenames : iterable of str
        Object names (relative to ``key_prefix``) to download.
    bucket_name : str, default 'kaj011'
        Name of the S3 bucket.
    key_prefix : str, default 'samsung_challenge'
        Key prefix (folder) inside the bucket.
    dest_dir : str, default '../data'
        Local directory the files are written to.

    Raises
    ------
    botocore.exceptions.ClientError
        For any client error other than a 404; a 404 is reported with a
        printed message and the remaining files are still attempted.
    """
    # Imported here so the notebook can be used without boto3 installed
    # as long as this function is not called.
    import boto3
    import botocore

    s3 = boto3.resource('s3')

    for filename in filenames:
        key = '%s/%s' % (key_prefix, filename)
        destination = '%s/%s' % (dest_dir, filename)

        try:
            s3.Bucket(bucket_name).download_file(key, destination)
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] != "404":
                raise
            print("The object does not exist.")
    
    