In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


import util
from util import df_with_selected_features, split_df, split_df_to_x_y, train_model_func
# from sklearn.ensemble import RandomForestClassifier
import spacy

import sagemaker
# Let displayed frames show up to 160 characters per cell.
pd.set_option('display.max_colwidth', 160)

# S3 bucket and key prefix used by the upload / training cells below.
bucket = 'nafis-sagemaker'
prefix = 'datasets'
# NOTE(review): region_name is not read by the cells visible here (they use
# boto3.Session().region_name instead) — confirm it is still needed.
region_name = 'us-west-2'

# SageMaker session and the execution role handed to the estimator.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

In [1]:
# spaCy pipeline handed to the feature builder.
nlp = spacy.load('en_core_web_md')

msg = util.read_df('data/messages.csv')

# Record each inbound message's length before any cleaning happens.
msg['InBoundLength'] = msg['InBound'].apply(len)

# create_vec=False: the vectors are already created, so the builder is pointed
# at the existing file given by file_path_vec.
df = df_with_selected_features(
    df=msg,
    col_to_vec='InBound',
    nlp=nlp,
    target=['TemplateID'],
    cat_feats='ConversationType',
    cat_feats_name=[
        'cart',
        'live_text',
        'campaigns',
        'platform',
        'optin_disc',
        'optin_conf',
    ],
    num_feats=['InBoundLength', 'ConversationLength'],
    file_path=None,
    create_vec=False,
    file_path_vec='out/vectors_modifystopwords.csv',
    save=True,
)

print(df.shape)
df.head()

  if self.run_code(code, result):


209.43013882637024


In [8]:
# Hold out 20% of the rows for testing; duplicate rows are dropped from the
# training part only (drop_duplicate_train=True).
train, test = split_df(df, test_size=0.2, random_state=42, drop_duplicate_train=True)

# Separate features from the TemplateID target for both partitions.
(X_train, y_train), (X_test, y_test) = (
    split_df_to_x_y(part, target='TemplateID', specify_features=None)
    for part in (train, test)
)

---
# Train models

## XGBoost


In [15]:
# from sklearn.ensemble import GradientBoostingClassifier


# # Gradient boosting (scikit-learn's GradientBoostingClassifier, not the XGBoost library):
# xg_clf = GradientBoostingClassifier(random_state=42)
# param_grid_rf = [{'n_estimators': [100], 'learning_rate': [0.1]}]

# xg_model = train_model_func(X=X_train, y=np.ravel(y_train),
#                             estimator=xg_clf, param_grid=param_grid_rf, cv=5, scoring="neg_log_loss")

# xg_metrics, xg_report = util.metrics(xg_model, X_test, y_test)

In [12]:
# Number of distinct (non-null) templates present in the held-out split.
test['TemplateID'].nunique(dropna=True)

23

In [16]:
# Peek at the first two feature rows.
df.head(n=2)

Unnamed: 0,TemplateID,InBound_length,ConversationLength,0,1,2,3,4,5,6,...,296,297,298,299,cart,live_text,campaigns,platform,optin_disc,optin_conf
0,1150.0,19,9.0,0.32993,-0.27363,-0.18124,-0.51183,0.53935,0.15882,0.006595,...,-0.19233,0.17293,0.12617,0.59792,1,0,0,0,0,0
1,321.0,110,27.0,-0.019328,0.156222,-0.382688,0.198034,-0.093976,0.051917,-0.14402,...,0.126053,-0.1025,-0.048436,0.144246,1,0,0,0,0,0
2,365.0,34,2.0,-0.332885,-0.019312,-0.252119,-0.129854,-0.133659,0.068219,0.043203,...,-0.137383,0.050422,-0.051024,0.220546,0,0,1,0,0,0
3,1150.0,135,3.0,-0.05888,0.021511,-0.126566,-0.093623,-0.030444,0.06487,0.079254,...,-0.115857,0.117376,0.031562,0.105599,0,1,0,0,0,0
4,6650.0,68,2.0,-0.20412,0.24711,-0.245464,-0.146155,-0.070671,-0.044716,0.120357,...,-0.106557,-0.049423,0.012471,0.097085,0,0,1,0,0,0


## SageMaker XGBoost

In [20]:
# Map every distinct TemplateID to a consecutive integer label, numbered in
# order of first appearance in df. reset_index(drop=True) yields a fresh frame,
# so adding the LABEL_OUT column does not write into a slice of df.
classes = df[['TemplateID']].drop_duplicates(subset='TemplateID').reset_index(drop=True)
classes['LABEL_OUT'] = classes.index

other_features = df.drop('TemplateID', axis=1)

# Look the label up row by row and keep df's own index. The previous
# join -> reset_index -> concat(axis=1) sequence re-aligned on index and would
# silently mis-pair labels and features if df did not carry a default
# 0..n-1 index.
label_by_template = classes.set_index('TemplateID')['LABEL_OUT']
dataset = other_features.copy()
# The label goes in the first column: the CSVs written from this frame are
# loaded by the training job with label_column=0.
dataset.insert(0, 'LABEL_OUT', df['TemplateID'].map(label_by_template))

train, test = split_df(dataset, test_size=0.2, random_state=42, drop_duplicate_train=True)

train.head(2)

Unnamed: 0,LABEL_OUT,InBound_length,ConversationLength,0,1,2,3,4,5,6,...,296,297,298,299,cart,live_text,campaigns,platform,optin_disc,optin_conf
63231,1,6,5.0,-0.2352,0.247,-0.35703,0.048294,-0.42617,0.40876,0.21463,...,0.32283,-0.11009,-0.56499,0.47109,1,0,0,0,0,0
204231,1,12,6.0,-0.290245,0.090233,-0.435185,0.005746,-0.319095,0.304225,0.016425,...,0.156875,-0.150045,-0.257073,0.275747,1,0,0,0,0,0
9941,1,10,5.0,-0.26158,0.110378,-0.521967,-0.326108,-0.113388,0.121047,0.189598,...,0.040974,0.018056,-0.185767,0.392143,1,0,0,0,0,0
36931,1,49,5.0,-0.191801,0.156824,-0.194281,-0.225072,-0.062128,0.127113,0.063202,...,0.069561,-0.133332,-0.187938,0.325828,1,0,0,0,0,0
141348,1,29,2.0,-0.29793,0.234814,-0.464023,-0.101866,-0.127221,0.107389,0.181273,...,0.184845,-0.221204,-0.198965,0.204163,1,0,0,0,0,0


In [None]:
#source = pd.read_csv('s3://{}/{}/datasetneame.csv'.format(bucket, prefix), low_memory=False)

In [28]:
import boto3
import os

# Region -> ECR image URI of the XGBoost training container.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

# Headerless, index-free CSV files; the label is the first column of train/test.
train.to_csv('xgtrain.csv', header=False, index=False)
test.to_csv('xgtest.csv', header=False, index=False)

# S3 keys always use '/' as separator, so build them with string formatting
# rather than os.path.join (which follows the local OS convention). Defining
# them once keeps the upload targets and the training channels in sync.
train_key = '{}/train/train.csv'.format(prefix)
validation_key = '{}/validation/validation.csv'.format(prefix)

s3_bucket = boto3.Session().resource('s3').Bucket(bucket)

input_train = s3_bucket.Object(train_key)
input_train.upload_file('xgtrain.csv', ExtraArgs={"ContentType": "text/csv"})
input_test = s3_bucket.Object(validation_key)
input_test.upload_file('xgtest.csv', ExtraArgs={"ContentType": "text/csv"})

# Training channels pointing at the objects uploaded above.
# NOTE(review): the held-out test split doubles as the validation channel.
train_input = sagemaker.inputs.s3_input('s3://{}/{}'.format(bucket, train_key), content_type='text/csv')
test_input = sagemaker.inputs.s3_input('s3://{}/{}'.format(bucket, validation_key), content_type='text/csv')

In [29]:
# Estimator for the XGBoost container of the current boto3 region.
# NOTE(review): raises KeyError if that region is not listed in `containers`.
xgb = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket, prefix),
                                    sagemaker_session=sess)

# multi:softmax needs every label to lie in [0, num_class). Counting the
# distinct labels of `train` alone undercounts when a class only occurs in the
# validation data or when the labels present are not contiguous, so derive the
# class count from the largest label across both splits instead.
num_class = int(max(train['LABEL_OUT'].max(), test['LABEL_OUT'].max())) + 1

xgb.set_hyperparameters(eta=0.1, objective='multi:softmax', num_round=23, num_class=num_class)
xgb.fit({'train': train_input, 'validation': test_input})

2020-02-13 01:50:47 Starting - Starting the training job...
2020-02-13 01:50:48 Starting - Launching requested ML instances......
2020-02-13 01:51:49 Starting - Preparing the instances for training......
2020-02-13 01:53:00 Downloading - Downloading input data...
2020-02-13 01:53:43 Training - Downloading the training image...
2020-02-13 01:54:03 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2020-02-13:01:54:04:INFO] Running standalone xgboost training.[0m
[34m[2020-02-13:01:54:04:INFO] File size need to be processed in the node: 1034.13mb. Available memory size in the node: 8512.64mb[0m
[34m[2020-02-13:01:54:04:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:54:04] S3DistributionType set as FullyReplicated[0m
[34m[01:54:08] 125341x308 matrix with 38605028 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2020-02-13:01:54:08:INFO] Determined delimiter of CSV input is ','[0

In [30]:
# Host the trained model on a single ml.m5.large instance; the returned
# predictor is what the prediction cells call.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
)

-----------------!

In [34]:
def predict(data, rows=500):
    """Score ``data`` with the module-level ``predictor`` in batches.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        One row per sample. It is split along the first axis.
    rows : int, default 500
        Maximum number of rows sent to the endpoint in one request.

    Returns
    -------
    numpy.ndarray
        1-D float array built by parsing the comma-separated responses of all
        batches, in input order. Empty when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``rows`` is smaller than 1.
    """
    if rows < 1:
        raise ValueError("rows must be at least 1, got {!r}".format(rows))

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score: do not send an empty payload to the endpoint.
        return np.array([])

    # Ceiling division: never more batches than rows, so np.array_split cannot
    # hand back an empty batch (the old `n / rows + 1` did for rows=1).
    n_batches = int(np.ceil(n_rows / float(rows)))

    responses = [
        predictor.predict(batch).decode('utf-8')
        for batch in np.array_split(data, n_batches)
    ]
    return np.fromstring(','.join(responses), sep=',')

from sagemaker.predictor import csv_serializer

# Send requests as CSV and leave the response undecoded (predict() decodes the
# bytes itself). The attribute is `content_type`; the earlier misspelling only
# created an unused attribute and left the real content type unset.
predictor.content_type = 'text/csv'
predictor.serializer = csv_serializer
predictor.deserializer = None


# Score every test row; the label column is removed before sending.
predictions = predict(test.drop('LABEL_OUT', axis=1).values)

In [87]:
# Score the test set in slices of `chunk_size` rows and collect the results in
# one flat list. Stepping with range(0, n, chunk_size) never yields an empty
# slice; the previous `n // 3000 + 1` loop sent an empty final slice to the
# endpoint whenever the row count was an exact multiple of 3000.
chunk_size = 3000
features = test.drop('LABEL_OUT', axis=1).values  # drop the label once, not per slice

predictions = []
for start in range(0, features.shape[0], chunk_size):
    predictions.extend(predict(features[start:start + chunk_size]))

In [89]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, cohen_kappa_score

y_test = test['LABEL_OUT'].values

# Per-class precision / recall / F1 (printed once).
print(classification_report(y_test, predictions))

# Use the class ids that actually occur in y_test or predictions as matrix
# labels, so the heatmap axes show real label values instead of positions.
class_labels = np.union1d(y_test, predictions)
cm = confusion_matrix(y_test, predictions, labels=class_labels)
print("Confusion Matrix:")

# assumes the predicted labels are integral floats (class indices) — TODO confirm
tick_labels = class_labels.astype(int)
df_cm = pd.DataFrame(cm, index=tick_labels, columns=tick_labels)
plt.figure(figsize=(15, 15))
ax = sns.heatmap(df_cm, annot=True, fmt='d', cmap='PuRd')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()
print("-----" * 5)

# Same report as a DataFrame for programmatic use.
report = classification_report(y_test, predictions, output_dict=True)
df_report = pd.DataFrame(report).transpose()

acc = accuracy_score(y_test, predictions)
print(f"Accuracy: {acc:.3f}")

kappa = cohen_kappa_score(y_test, predictions)
print(f"Kappa: {kappa:.3f}")

              precision    recall  f1-score   support

           0       0.51      0.20      0.29       495
           1       0.84      0.97      0.90     31635
           2       0.65      0.41      0.50       330
           3       0.60      0.43      0.50       513
           4       0.58      0.34      0.43       623
           5       0.63      0.58      0.60      1307
           6       0.45      0.30      0.36      1150
           7       0.90      0.89      0.89      6401
           8       0.23      0.01      0.02       297
           9       0.79      0.46      0.58      1234
          10       0.57      0.21      0.30       217
          11       0.31      0.08      0.13       436
          12       0.50      0.20      0.28       531
          13       0.43      0.03      0.06       187
          14       0.65      0.39      0.49       832
          15       0.55      0.42      0.48       505
          16       0.58      0.52      0.55       593
          17       0.72    