In [None]:
# Install custom packages under /home/ec2-user/SageMaker so that they do not
# have to be reinstalled after every reboot.

import os
import site
import sys

user_libs_path = "/home/ec2-user/SageMaker/.local"

# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(user_libs_path, exist_ok=True)

# pip's --user scheme installs into <base>/lib/pythonX.Y/site-packages, where
# X.Y is the version of the running interpreter; derive it from the kernel
# instead of hard-coding 3.6 so the path stays valid on other kernels.
user_site_packages = "{}/lib/python{}.{}/site-packages".format(
    user_libs_path, *sys.version_info[:2])

# Give the custom packages priority on the import path. The membership check
# prevents duplicate entries when the cell is executed more than once.
if user_site_packages not in sys.path:
    sys.path.insert(0, user_site_packages)

site.USER_BASE = user_libs_path

In [None]:
import os
import subprocess
import sys

# Copy the current environment and point pip's --user scheme at the custom
# library location.
my_env = os.environ.copy()
my_env["PYTHONUSERBASE"] = user_libs_path

# Run pip through the kernel's own interpreter so the packages are installed
# for the Python that will import them. stderr has to be piped explicitly,
# otherwise `process.stderr` is None and installation errors are invisible.
process = subprocess.run(
    [sys.executable, "-m", "pip", "install", "--user", "-r", "./app-requirements.txt"],
    env=my_env,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    universal_newlines=True,  # decode output to str (Python 3.6-compatible form of text=True)
)

# Display whatever pip wrote to stderr (warnings / errors); empty on a clean run.
process.stderr

In [None]:
import umap
import pickle
import joblib
import random
import string
import logging
import warnings

# Suppress every Python warning for the rest of the session.
warnings.simplefilter('ignore')

# Raise the threshold of the chatty `param` loggers so that only CRITICAL
# records get through.
for noisy_logger in ("param.Dimension",
                     "param.ParameterizedMetaclass",
                     "param.PointPlot"):
    logging.getLogger(noisy_logger).setLevel(logging.CRITICAL)
import numpy as np
import pandas as pd

In [None]:
import holoviews as hv
hv.extension('bokeh')
import awswrangler as wr
import boto3

# Local working directories (relative paths).
model_dir = "../model/"
data_dir = "../data/"

# S3 location of the shared data. The bucket is pinned by name; the
# commented-out lines are the alternative that derives it from the SageMaker
# session.
# role = get_execution_role()
# sess = sagemaker.Session()
# bucket = sess.default_bucket()
bucket = "project1-lz"
s3_master_dir = "master"

In [None]:
# Earlier experiments, kept for reference:
#sentiments = wr.s3.read_parquet(os.path.join("s3://",bucket,s3_raw_dir, raw_data_file), dataset=True)
#client.get_object(Body=pickle.dumps(le_cluster), Bucket=bucket, Key='preprocessing/le_cluster.pkl');

frontend_file = 'frontend_data.pkl'

# S3 object keys always use "/" as the separator, so the key is joined
# explicitly rather than with os.path.join (which follows the local OS rules).
frontend_key = "/".join((s3_master_dir, frontend_file))

# Download the pickled payload as raw bytes.
response = boto3.client('s3').get_object(Bucket=bucket, Key=frontend_key)["Body"].read()

# SECURITY: unpickling can execute arbitrary code. Only load objects from a
# bucket whose contents are fully trusted.
data = pickle.loads(response)

In [None]:
# Show the top-level keys of the unpickled payload.
data.keys()

In [None]:
# Number of entries stored under the 'prediction' key.
len(data['prediction'])

In [None]:
# Build a DataFrame from the loaded payload.
# NOTE(review): presumably a dict of equal-length sequences, one column per key — confirm.
data_df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the frame.
data_df.head()

In [None]:
#data_df.airline.value_counts()

In [None]:
# Shape of the first entry stored under 'embedding'.
np.shape(data['embedding'][0])

In [None]:
# Stack the per-row embedding entries vertically into a single array, then
# display its dimensions.
X = np.vstack(data_df['embedding'].values)

X.shape

In [None]:
%%time 

embedding = umap.UMAP(n_components=2,
                      n_neighbors=15,
                      min_dist=0.1,
                      metric='cosine').fit_transform(X)

In [None]:
# Store the two UMAP coordinates as separate columns of the frame.
data_df['umap_x'] = embedding[:, 0]
data_df['umap_y'] = embedding[:, 1]

In [None]:
%%opts Scatter [width=500 height=500] (color='prediction')
# Scatter plot of the UMAP coordinates with 'prediction' as the value dimension.
# The %%opts line above (which must stay first in the cell) applies a 500x500
# size and color='prediction' to Scatter elements.
hv.Scatter(data_df, kdims=['umap_x', 'umap_y'], vdims=['prediction'])

In [None]:
#data_df['tweet'] = [''.join(random.choices(string.ascii_uppercase + string.digits, k=10))
#                    for i in range(0, len(data_df))]

In [None]:
# Preview the frame again, now including the umap_x / umap_y columns.
data_df.head()

In [None]:
# Collapse each tweet's tokens into one space-separated string.
# NOTE(review): assumes the 'tweet' column initially holds token sequences — confirm.
# Values that are already strings are left untouched, so re-running the cell
# does not insert a space between every character of an already-joined tweet.
data_df['tweet'] = data_df['tweet'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else " ".join(tokens))

In [None]:
embedding_file = 'umap_embedding.joblib'

# Make sure the local output directory exists so the dump does not fail when
# the directory has not been created yet.
os.makedirs(data_dir, exist_ok=True)

# Persist the frame as a compressed joblib file; the returned list of written
# file names is shown as the cell output.
joblib.dump(data_df, os.path.join(data_dir, embedding_file), compress=True)

In [None]:
# Upload the locally dumped embedding file to S3.
# The file is opened in a `with` block so the handle is closed once the upload
# finishes, and the object key is joined with "/" because S3 keys are not
# local filesystem paths.
with open(os.path.join(data_dir, embedding_file), 'rb') as embedding_fh:
    put_response = boto3.client('s3').put_object(
        Bucket=bucket,
        Key="/".join((s3_master_dir, embedding_file)),
        Body=embedding_fh)

# Show the S3 response as the cell output.
put_response