In [1]:
%config IPCompleted.greedy = True

## Import Libraries

In [2]:
import os
import re
import nltk
import numpy as np
from azureml.core import Workspace, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer 

In [3]:
# download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\luisf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Get Workspace

In [4]:
ws = Workspace.from_config()
ws.get_details()

{'id': '/subscriptions/1f3811a0-1fb9-4d43-974c-8c7bdc025d92/resourceGroups/disasterLocator/providers/Microsoft.MachineLearningServices/workspaces/disasterLocatorML',
 'name': 'disasterLocatorML',
 'location': 'eastus',
 'type': 'Microsoft.MachineLearningServices/workspaces',
 'tags': {},
 'sku': 'Basic',
 'workspaceid': '90beb71e-3575-4cbd-af2a-cb2c303edc7b',
 'description': '',
 'friendlyName': 'disasterLocatorML',
 'creationTime': '2020-11-11T14:39:30.8601986+00:00',
 'keyVault': '/subscriptions/1f3811a0-1fb9-4d43-974c-8c7bdc025d92/resourcegroups/disasterlocator/providers/microsoft.keyvault/vaults/disasterlocato9806975694',
 'applicationInsights': '/subscriptions/1f3811a0-1fb9-4d43-974c-8c7bdc025d92/resourcegroups/disasterlocator/providers/microsoft.insights/components/disasterlocato5776021600',
 'identityPrincipalId': 'ffd7eac5-c771-422b-8080-675c5d58b978',
 'identityTenantId': 'b2cc71a7-45cf-49fe-b78c-a7d1913afbfb',
 'identityType': 'SystemAssigned',
 'storageAccount': '/subscripti

# Get Key Vault Secrets

In [5]:
keyvault = ws.get_default_keyvault()
subscription_id = keyvault.get_secret("subscription-id")

# Attach Compute Target

In [6]:
# compute cluster configuration
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "nlp-cpu-cluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4)
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target: ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)
    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target: nlp-cpu-cluster


# Consume Dataset

In [7]:
dataset = Dataset.get_by_name(ws, name='disaster_tweets_train')
tweets_pd = dataset.to_pandas_dataframe()
tweets_pd.head()

Unnamed: 0,id,keyword,location,text,target
0,1.0,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4.0,,,Forest fire near La Ronge Sask. Canada,1.0
2,5.0,,,All residents asked to 'shelter in place' are ...,1.0
3,6.0,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7.0,,,Just got sent this photo from Ruby #Alaska as ...,1.0


In [8]:
tweets_pd.drop(columns=['id', 'keyword', 'location'], inplace=True)
tweets_pd.dropna(inplace=True)
tweets_pd.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1.0
1,Forest fire near La Ronge Sask. Canada,1.0
2,All residents asked to 'shelter in place' are ...,1.0
3,"13,000 people receive #wildfires evacuation or...",1.0
4,Just got sent this photo from Ruby #Alaska as ...,1.0


In [9]:
tweets_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7176 entries, 0 to 8392
Data columns (total 2 columns):
text      7176 non-null object
target    7176 non-null float64
dtypes: float64(1), object(1)
memory usage: 168.2+ KB


In [10]:
tweets_pd.describe()

Unnamed: 0,target
count,7176.0
mean,0.429348
std,0.495018
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Split Traning Data

In [11]:
x = tweets_pd['text'].values
y = tweets_pd['target'].values
print("x sample: ", x[0])
print("y sample: ", y[0])

x sample:  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
y sample:  1.0


In [12]:
stemmer = PorterStemmer() 
stopwords_english = stopwords.words('english') 

def process_tweet(text):
    # remove special characters
    text = re.sub('[^A-Za-z0-9 ]+', '', text)
    # remove stopwords
    cleaned_tweet = []
    for word in text.split():
        stem_word = stemmer.stem(word)
        if stem_word not in stopwords_english:
            cleaned_tweet.append(stem_word)
    return cleaned_tweet

In [13]:
X_train = [process_tweet(tweet) for tweet in x]

In [14]:
X_train[0]

['deed', 'reason', 'thi', 'earthquak', 'may', 'allah', 'forgiv', 'us']

# Building the Vocabulary

In [15]:
def build_vocab(tweets_array):
    vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 
    for tweet in tweets_array:
        for word in tweet:
            if word not in vocab:
                vocab[word] = len(vocab)
    return vocab

In [16]:
vocab = build_vocab(X_train)
vocab

{'__PAD__': 0,
 '__</e>__': 1,
 '__UNK__': 2,
 'deed': 3,
 'reason': 4,
 'thi': 5,
 'earthquak': 6,
 'may': 7,
 'allah': 8,
 'forgiv': 9,
 'us': 10,
 'forest': 11,
 'fire': 12,
 'near': 13,
 'La': 14,
 'rong': 15,
 'sask': 16,
 'canada': 17,
 'resid': 18,
 'ask': 19,
 'shelter': 20,
 'place': 21,
 'notifi': 22,
 'offic': 23,
 'No': 24,
 'evacu': 25,
 'order': 26,
 'expect': 27,
 '13000': 28,
 'peopl': 29,
 'receiv': 30,
 'wildfir': 31,
 'california': 32,
 'got': 33,
 'sent': 34,
 'photo': 35,
 'rubi': 36,
 'alaska': 37,
 'smoke': 38,
 'pour': 39,
 'school': 40,
 'rockyfir': 41,
 'updat': 42,
 'hwi': 43,
 '20': 44,
 'close': 45,
 'direct': 46,
 'due': 47,
 'lake': 48,
 'counti': 49,
 'cafir': 50,
 'flood': 51,
 'disast': 52,
 'heavi': 53,
 'rain': 54,
 'caus': 55,
 'flash': 56,
 'street': 57,
 'manit': 58,
 'colorado': 59,
 'spring': 60,
 'area': 61,
 'Im': 62,
 'top': 63,
 'hill': 64,
 'I': 65,
 'see': 66,
 'wood': 67,
 'emerg': 68,
 'happen': 69,
 'build': 70,
 'across': 71,
 'afraid'

# Transform Tweets 

In [17]:
def transform_tweets(sentence_tweets, vocab, unk_tag='__UNK__'):
    transformed_tweets = []
    for tweet in sentence_tweets:
        processed_tweet = []
        for word in tweet:
            if word in vocab:
                processed_tweet.append(vocab[word])
            else:
                processed_tweet.append(vocab[unk_tag])
        transformed_tweets.append(np.array(processed_tweet, dtype=np.int64))
    return transformed_tweets

In [18]:
X_train = transform_tweets(X_train, vocab, unk_tag='__UNK__')
X_train

[array([ 3,  4,  5,  6,  7,  8,  9, 10], dtype=int64),
 array([11, 12, 13, 14, 15, 16, 17], dtype=int64),
 array([18, 19, 20, 21, 22, 23, 24, 25, 20, 21, 26, 27], dtype=int64),
 array([28, 29, 30, 31, 25, 26, 32], dtype=int64),
 array([33, 34,  5, 35, 36, 37, 38, 31, 39, 40], dtype=int64),
 array([41, 42, 32, 43, 44, 45, 46, 47, 48, 49, 12, 50, 31], dtype=int64),
 array([51, 52, 53, 54, 55, 56, 51, 57, 58, 59, 60, 61], dtype=int64),
 array([62, 63, 64, 65, 66, 12, 67], dtype=int64),
 array([68, 25, 69, 70, 71, 57], dtype=int64),
 array([62, 72, 73, 74, 61], dtype=int64),
 array([75, 29, 76, 77, 78, 79], dtype=int64),
 array([80, 81, 82, 83, 51, 84, 85, 86, 87, 65, 88, 89, 81, 82, 90, 65, 91,
        92, 90, 65, 91, 92, 93, 51], dtype=int64),
 array([ 54,  51,  94,  95,  82,  96,  97,  98,  99, 100, 101], dtype=int64),
 array([ 51, 102, 103, 104, 105, 102], dtype=int64),
 array([106,  40, 107, 108, 109, 110, 111, 112], dtype=int64),
 array([113], dtype=int64),
 array([ 65, 114, 115], dt

In [19]:
max_len = max([len(tweet) for tweet in X_train])+1 #+1 for end of sentence tag 
max_len

27

In [20]:
def pad_tweets(all_tweets, vocab, max_len, end_tag='__</e>__', pad_tag='__PAD__'):
    padded_sequences = []
    for tweet in all_tweets:
        padded_tweet = np.array(list(tweet) + [vocab[end_tag]] + [vocab[pad_tag]]*(max_len-len(tweet)-1))
        if (len(padded_tweet) == max_len):
            padded_sequences.append(padded_tweet)
    return np.array(padded_sequences)

In [21]:
X_train = pad_tweets(X_train, vocab, max_len, end_tag='__</e>__', pad_tag='__PAD__')

In [22]:
X_train.shape

(7176, 27)

In [23]:
len(X_train[0])==max_len

True