In [2]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3
import pandas as pd

# --- SageMaker session and IAM role ---------------------------------------
sess = sagemaker.Session()

# The execution role is what SageMaker uses to reach AWS resources
# (S3, CloudWatch) on your behalf.
role = get_execution_role()

# --- S3 locations ----------------------------------------------------------
# `prefix` is the key prefix for the training CSV upload; `prefix_test` is the
# key prefix for the small hand-written test set uploaded later.
bucket = 'roymark-aws-ml'
prefix = 'comprehend/dbpedia'
prefix_test = 'comprehend/dbpedia_test'

# Echo the resolved role and bucket so the cell output records what was used.
print(role)
print(bucket)

arn:aws:iam::355151823911:role/service-role/AmazonSageMaker-ExecutionRole-20180515T132694
roymark-aws-ml


In [2]:
# Download the DBpedia text-classification archive (train/test CSVs plus the
# class list) into the notebook's working directory.
!wget https://github.com/saurabh3949/Text-Classification-Datasets/raw/master/dbpedia_csv.tar.gz

--2019-04-04 19:26:33--  https://github.com/saurabh3949/Text-Classification-Datasets/raw/master/dbpedia_csv.tar.gz
Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/saurabh3949/Text-Classification-Datasets/master/dbpedia_csv.tar.gz [following]
--2019-04-04 19:26:33--  https://raw.githubusercontent.com/saurabh3949/Text-Classification-Datasets/master/dbpedia_csv.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.248.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.248.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 68431223 (65M) [application/octet-stream]
Saving to: ‘dbpedia_csv.tar.gz’


2019-04-04 19:26:34 (192 MB/s) - ‘dbpedia_csv.tar.gz’ saved [68431223/68431223]



In [3]:
# Unpack the archive into ./dbpedia_csv/ (-v lists each extracted file).
!tar -xzvf dbpedia_csv.tar.gz

dbpedia_csv/
dbpedia_csv/test.csv
dbpedia_csv/classes.txt
dbpedia_csv/train.csv
dbpedia_csv/readme.txt


In [4]:
!head dbpedia_csv/train.csv -n 3

1,"E. D. Abbott Ltd"," Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972."
1,"Schwan-Stabilo"," Schwan-STABILO is a German maker of pens for writing colouring and cosmetics as well as markers and highlighters for office use. It is the world's largest manufacturer of highlighter pens Stabilo Boss."
1,"Q-workshop"," Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice and dice accessories for use in various games (role-playing gamesboard games and tabletop wargames). They also run an online retail store and maintainan active forum community.Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initiallythe company sold its products via online auction services but in 2005 a website and online store wereestablis

In [5]:
# classes.txt holds one class name per line; the 1-based line number is
# treated below as the numeric class id found in the CSV's first column.
!cat dbpedia_csv/classes.txt

Company
EducationalInstitution
Artist
Athlete
OfficeHolder
MeanOfTransportation
Building
NaturalPlace
Village
Animal
Plant
Album
Film
WrittenWork


In [6]:
# Lookup from class id to class name. Keys are the 1-based line numbers of
# classes.txt rendered as strings, because the class_id column is converted
# to str before the labels are substituted in.
index_to_label = {}
with open("dbpedia_csv/classes.txt") as f:
    index_to_label.update(
        (str(line_no), raw_line.strip())
        for line_no, raw_line in enumerate(f, start=1)
    )
print(index_to_label)

{'1': 'Company', '2': 'EducationalInstitution', '3': 'Artist', '4': 'Athlete', '5': 'OfficeHolder', '6': 'MeanOfTransportation', '7': 'Building', '8': 'NaturalPlace', '9': 'Village', '10': 'Animal', '11': 'Plant', '12': 'Album', '13': 'Film', '14': 'WrittenWork'}


In [10]:
# pandas is already imported in the setup cell; repeating the import is harmless.
import pandas as pd
# train.csv has no header row, so header=None gives the columns the integer
# labels 0, 1, 2 (renamed in a later cell).
db_df = pd.read_csv('dbpedia_csv/train.csv', sep=',', header=None)

In [11]:
# Preview the first rows; columns still carry the default integer labels.
db_df.head()

Unnamed: 0,0,1,2
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [12]:
# Name the three CSV columns in file order: numeric class id, article title,
# abstract text.
db_df.columns=['class_id', 'title', 'abstract']

In [13]:
# Confirm the new column names were applied.
db_df.head()

Unnamed: 0,class_id,title,abstract
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...


In [17]:
# Only the class label and the abstract text are written to the training CSV,
# so the title column is discarded here.
# The result is reassigned instead of mutating in place, and a missing column
# is tolerated, so re-running this cell after 'title' has already been removed
# no longer raises KeyError.
db_df = db_df.drop(columns=['title'], errors='ignore')

In [18]:
# Confirm that only class_id and abstract remain.
db_df.head()

Unnamed: 0,class_id,abstract
0,1,Abbott of Farnham E D Abbott Limited was a Br...
1,1,Schwan-STABILO is a German maker of pens for ...
2,1,Q-workshop is a Polish company located in Poz...
3,1,Marvell Software Solutions Israel known as RA...
4,1,Bergan Mercy Medical Center is a hospital loc...


In [21]:
# index_to_label is keyed by strings, so the numeric ids are converted to str
# first; the column-scoped replace then swaps each id for its class name and
# leaves any value without a matching key untouched.
db_df['class_id'] = db_df['class_id'].astype(str)
db_df = db_df.replace({'class_id': index_to_label})
db_df.head()

Unnamed: 0,class_id,abstract
0,Company,Abbott of Farnham E D Abbott Limited was a Br...
1,Company,Schwan-STABILO is a German maker of pens for ...
2,Company,Q-workshop is a Polish company located in Poz...
3,Company,Marvell Software Solutions Israel known as RA...
4,Company,Bergan Mercy Medical Center is a hospital loc...


In [25]:
# Write the two-column (label, text) training file with no header row and no
# index column. `header` is documented as a bool or a list of column names, so
# the header is switched off with an explicit False rather than relying on
# None being falsy.
db_df.to_csv('dbpedia_comprehend_input.csv', sep=',', header=False, index=False)

In [26]:
# Sanity check: (rows, columns) of the frame that was just written out.
db_df.shape

(560000, 2)

In [27]:
!head dbpedia_comprehend_input.csv -n 3

Company, Abbott of Farnham E D Abbott Limited was a British coachbuilding business based in Farnham Surrey trading under that name from 1929. A major part of their output was under sub-contract to motor vehicle manufacturers. Their business closed in 1972.
Company, Schwan-STABILO is a German maker of pens for writing colouring and cosmetics as well as markers and highlighters for office use. It is the world's largest manufacturer of highlighter pens Stabilo Boss.
Company, Q-workshop is a Polish company located in Poznań that specializes in designand production of polyhedral dice and dice accessories for use in various games (role-playing gamesboard games and tabletop wargames). They also run an online retail store and maintainan active forum community.Q-workshop was established in 2001 by Patryk Strzelewicz – a student from Poznań. Initiallythe company sold its products via online auction services but in 2005 a website and online store wereestablished.


In [28]:
# Upload the training CSV under s3://<bucket>/<prefix>/; the returned S3 URI
# is shown as the cell output.
sess.upload_data(path='dbpedia_comprehend_input.csv', bucket=bucket, key_prefix=prefix)

's3://roymark-aws-ml/comprehend/dbpedia/dbpedia_comprehend_input.csv'

In [1]:
# Hand-written, unlabeled sample texts; they are written to
# dbpedia_comprehend_test_set.csv below, one sentence per line.
sentences = ['Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.',
            'Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick .',
            'Kevin Durant is one of the best NBA players. He has multiple championship rings and was voted MVP. He is 6 foot 11 inches and can shoot 3 pointers effectively.',
            'This novel is about a family in Ireland that deals with potato famine.',
            'This stars Al Pacino and is about a corrupt police officer in New York City.']

In [3]:
# One row per sentence in a single column with the default integer label 0.
test_df = pd.DataFrame(sentences)
test_df.head()

Unnamed: 0,0
0,Convair was an american aircraft manufacturing...
1,Berwick secondary college is situated in the o...
2,Kevin Durant is one of the best NBA players. H...
3,This novel is about a family in Ireland that d...
4,This stars Al Pacino and is about a corrupt po...


In [4]:
# Write the test sentences one per line, without a header row or index column.
# `header` is documented as a bool or a list of column names, so an explicit
# False is used instead of relying on None being falsy.
test_df.to_csv('dbpedia_comprehend_test_set.csv', header=False, index=False)

In [5]:
# Upload the test sentences under s3://<bucket>/<prefix_test>/; the returned
# S3 URI is shown as the cell output.
sess.upload_data(path='dbpedia_comprehend_test_set.csv', bucket=bucket, key_prefix=prefix_test)

's3://roymark-aws-ml/comprehend/dbpedia_test/dbpedia_comprehend_test_set.csv'

In [8]:
!head dbpedia_comprehend_test_set.csv -n 5

Convair was an american aircraft manufacturing company which later expanded into rockets and spacecraft.
Berwick secondary college is situated in the outer melbourne metropolitan suburb of berwick .
Kevin Durant is one of the best NBA players. He has multiple championship rings and was voted MVP. He is 6 foot 11 inches and can shoot 3 pointers effectively.
This novel is about a family in Ireland that deals with potato famine.
This stars Al Pacino and is about a corrupt police officer in New York City.


In [38]:
# Expected classes for the five test sentences, in order:
# 1 (Company), 2 (EducationalInstitution), 4 (Athlete), 14 (WrittenWork), 13 (Film)