In [25]:
#!pip install sagemaker

In [26]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

In [27]:
# SageMaker session reused below when the estimator is constructed.
sess = sagemaker.Session()
# IAM execution role passed to the estimator for the training job.
role = get_execution_role()

In [28]:
# Show the resolved execution role ARN.
# NOTE(review): the rendered ARN includes the AWS account ID — consider clearing
# this output before sharing the notebook.
role

'arn:aws:iam::251110592681:role/service-role/AmazonSageMaker-ExecutionRole-20191217T163918'

In [29]:
# S3 bucket holding the data channels, and the key prefix under which
# the training output location is built.
bucket, prefix = "the-pollsters", "pollsters"

In [30]:
# Read dems.txt line by line and tag every line with class 0, producing
# "__label__0 <text>" strings (trailing newline removed).
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# kernel's locale default encoding.
# NOTE(review): assumes dems.txt is UTF-8 encoded — confirm against the source data.
with open('dems.txt', 'r', encoding='utf-8') as file:
    dem_txt = ["__label__0 " + line.strip('\n') for line in file]

In [31]:
# Read gop.txt line by line and tag every line with class 1, producing
# "__label__1 <text>" strings (trailing newline removed).
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# kernel's locale default encoding.
# NOTE(review): assumes gop.txt is UTF-8 encoded — confirm against the source data.
with open('gop.txt', 'r', encoding='utf-8') as file:
    gop_txt = ["__label__1 " + line.strip('\n') for line in file]

In [32]:
# Single list of labelled lines: all class-0 lines followed by all class-1 lines.
corpus = [*dem_txt, *gop_txt]

In [33]:
# Total number of labelled lines across both input files.
len(corpus)

38351

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled lines for evaluation; the fixed random_state
# makes the split repeatable across runs.
corpus_train, corpus_test = train_test_split(corpus, test_size=0.3, random_state=45)

# One labelled example per line (no trailing newline after the last one).
corpus_train_text = '\n'.join(corpus_train)
corpus_test_text = '\n'.join(corpus_test)

# Write both splits to local files. The encoding is set explicitly to UTF-8 so
# that non-ASCII characters are written the same way regardless of the kernel's
# locale default encoding.
with open('tweets.train', 'w', encoding='utf-8') as file:
    file.write(corpus_train_text)

with open('tweets.test', 'w', encoding='utf-8') as file:
    file.write(corpus_test_text)

In [60]:
# Key prefixes from which the S3 URIs of the train and validation channels are built.
train_path, test_path = 'train', 'test'

In [61]:
# One-time upload of the two local files to S3 (kept commented out so that
# re-running the notebook does not upload again). key_prefix is given as the
# train_path / test_path variables so the objects land under the same prefixes
# the channel URIs are built from.
# NOTE(review): the training log of the recorded run reads the files from
# .../train/train_path/tweets.train and .../validation/test_path/tweets.test,
# which suggests they were uploaded under the literal prefixes 'train_path' and
# 'test_path'. With s3_data_type='S3Prefix' the URI is a key-name prefix, so
# 's3://<bucket>/train' also matches 'train_path/...'. Presumably the old
# objects should be removed before uploading again, otherwise both copies would
# match the channel prefix — confirm against the bucket contents.
#sess.upload_data(path = 'tweets.train', bucket = bucket, key_prefix = train_path)
#sess.upload_data(path = 'tweets.test', bucket = bucket, key_prefix = test_path)


In [62]:
# S3 URIs (bucket + key prefix) handed to the train and validation channels.
s3_train_data = 's3://%s/%s' % (bucket, train_path)
s3_test_data = 's3://%s/%s' % (bucket, test_path)

In [63]:
# Look up the BlazingText training image for the region the SageMaker session
# is bound to, instead of a hard-coded region name, so the image and the
# training job are always resolved for the same region.
container = sagemaker.amazon.amazon_estimator.get_image_uri(sess.boto_region_name, 'blazingtext', 'latest')

In [64]:
# S3 location passed to the estimator as its output_path.
s3_output_location = 's3://%s/%s/output' % (bucket, prefix)

In [65]:
# Generic estimator around the BlazingText image: a single ml.m4.xlarge
# training instance, File input mode, results written to s3_output_location.
bt_model = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    train_volume_size=5,
    # NOTE(review): if train_max_run is in seconds this is 100 hours — confirm it is intended.
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

In [66]:
# Hyperparameters for BlazingText text classification (supervised mode).
# 'patience' was previously misspelled, so the intended setting was never
# passed under the name the algorithm recognises.
# NOTE(review): early_stopping is False, so patience / min_epochs presumably
# have no effect on this run — confirm whether early stopping was intended.
bt_model.set_hyperparameters(
    mode='supervised',
    epochs=10,
    min_count=3,
    learning_rate=0.05,
    vector_dim=10,
    early_stopping=False,
    patience=5,
    min_epochs=5,
    word_ngrams=2,
)

# Both channels are plain-text inputs addressed by S3 key prefix.
train_data = sagemaker.session.s3_input(
    s3_train_data,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)
validation_data = sagemaker.session.s3_input(
    s3_test_data,
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

# Channel name -> input definition, as passed to fit().
data_channels = {'train': train_data, 'validation': validation_data}

In [67]:
# Launch the training job on the train/validation channels and stream its logs into the cell output.
bt_model.fit(inputs=data_channels,logs= True)

2020-04-09 19:58:42 Starting - Starting the training job...
2020-04-09 19:58:43 Starting - Launching requested ML instances......
2020-04-09 19:59:46 Starting - Preparing the instances for training......
2020-04-09 20:01:08 Downloading - Downloading input data
2020-04-09 20:01:08 Training - Downloading the training image...
2020-04-09 20:01:38 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[04/09/2020 20:01:39 INFO 140555001694016] nvidia-smi took: 0.0251741409302 secs to identify 0 gpus[0m
[34m[04/09/2020 20:01:39 INFO 140555001694016] Running single machine CPU BlazingText training using supervised mode.[0m
[34m[04/09/2020 20:01:39 INFO 140555001694016] Processing /opt/ml/input/data/train/train_path/tweets.train . File size: 4 MB[0m
[34m[04/09/2020 20:01:39 INFO 140555001694016] Processing /opt/ml/input/data/validation/test_path/tweets.test . File size: 2 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  17192[0m
[34m####

In [69]:
# Deploy the trained model behind an endpoint backed by one ml.m4.xlarge instance.
# NOTE(review): no visible cell deletes this endpoint; presumably it should be
# removed (e.g. predictor.delete_endpoint()) once evaluation is finished — confirm.
predictor = bt_model.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

-------------!

In [70]:
# Show the predictor object returned by deploy().
predictor

<sagemaker.predictor.RealTimePredictor at 0x7feb92f120b8>

In [72]:
# Drop the leading "__label__N " marker (11 characters) from every held-out
# line so only the text is sent to the endpoint.
corpus_test_no_labels = [tweet[len('__label__0 '):] for tweet in corpus_test]

# The whole held-out set goes out in a single JSON request; the JSON reply is
# parsed into Python objects.
payload = {'instances': corpus_test_no_labels}
request_body = json.dumps(payload)
response = predictor.predict(request_body)
predictions = json.loads(response)
# To inspect the raw reply: print(json.dumps(predictions, indent=2))

In [73]:
# Keep only the first entry of each prediction's 'label' list.
predicted_labels = [entry['label'][0] for entry in predictions]

In [74]:
# Peek at the first four predicted labels.
predicted_labels[:4]

['__label__0', '__label__1', '__label__1', '__label__1']

In [75]:
# The true label is the leading "__label__N" marker (10 characters) of each held-out line.
actual_labels = [tweet[:len('__label__0')] for tweet in corpus_test]
actual_labels[:4]

['__label__0', '__label__1', '__label__1', '__label__0']

In [76]:
# Element-wise comparison of true vs. predicted label, one boolean per held-out line.
matches = [truth == guess for truth, guess in zip(actual_labels, predicted_labels)]

In [77]:
# Accuracy on the held-out set: share of lines whose predicted label equals the true one.
sum(matches) / len(matches)

0.9135233791065531