## Using K-Means Clustering (Unsupervised Learning)

In [1]:
# for Python 2: use print only as a function
from __future__ import print_function

In [2]:
# load the raw card-transaction descriptions from a path relative to the notebook
import pandas as pd
path = 'data/card-data.csv'
card = pd.read_csv(
    path,
    sep='|',  # presumably chosen so each whole line lands in one column -- TODO confirm
    encoding='ISO-8859-1',
    header=None,  # treat the first line as data, not as column names
    names=['desc'],  # the single column holds the free-text description
)

In [3]:
# preview the first rows to confirm each line was read into the 'desc' column
card.head()

Unnamed: 0,desc
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA
2,COSTCO WHSE #0626 PRINCE WILLIAVA
3,CHEVRON XXXXXXX DICKINSON TX
4,WEGMANS #007 STERLING VA


In [4]:
# (number of rows, number of columns) of the loaded frame
card.shape

(9999, 1)

In [5]:
# CountVectorizer is fed the 'desc' column as a one-dimensional Series of text;
# no y is defined here because clustering is unsupervised
X = card['desc']
print(X.shape, type(X))

(9999,) <class 'pandas.core.series.Series'>


In [6]:
# import and instantiate CountVectorizer (with the default parameters)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# the bare name on the last line displays the vectorizer's configured parameters
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [7]:
# learn the vocabulary of every description in X, then use it to create a
# document-term matrix (one row per description, one column per token)
X_dtm = vect.fit_transform(X)
# display the sparse-matrix summary (shape and number of stored elements)
X_dtm

<9999x10091 sparse matrix of type '<type 'numpy.int64'>'
	with 49182 stored elements in Compressed Sparse Row format>

In [8]:
# store the vocabulary of X as a plain list of tokens
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the replacement and fall back to the legacy
# method only on installs that predate it
try:
    X_tokens = list(vect.get_feature_names_out())
except AttributeError:
    X_tokens = vect.get_feature_names()
# number of distinct tokens (equals the column count of X_dtm)
len(X_tokens)

10091

In [9]:
# peek at the 50 tokens that sort first in the vocabulary
print(X_tokens[:50])

[u'00', u'000', u'0000', u'0000ashburn', u'0000milledgevillega', u'0000santa', u'0001', u'0003', u'0003key', u'0006', u'0007', u'0009', u'000andrews', u'000jacksonville', u'000xxx', u'0011', u'0014', u'0016', u'0018', u'0019', u'002', u'0020', u'0022', u'0023', u'0029', u'003', u'0033', u'0034', u'0035', u'0036ellicott', u'0039', u'004', u'0045', u'0047', u'0048', u'005', u'0051', u'0053', u'0057', u'006', u'0063', u'0067', u'0068', u'007', u'0070', u'0071', u'0073', u'0075', u'0077', u'0078']


In [10]:
# examine the last 50 tokens (the end of the vocabulary list)
print(X_tokens[-50:])

[u'yorknj', u'yorktown', u'yorkville', u'yoshinoya', u'you', u'young', u'youngstown', u'youpayment', u'your', u'youreference', u'youth', u'yp', u'ysidro', u'yucaipa', u'yukon', u'yume', u'yunibasu', u'yuofz', u'yz762', u'z102ke7l5rwz102ke7l5rw1', u'z3xnw', u'zagg', u'zaika', u'zap', u'zappos', u'zara', u'zaxby', u'zen', u'zephyrhills', u'zero', u'zerorez', u'zh', u'zhouxiaoyan', u'zingerman', u'zio', u'zipcar', u'zmhgk', u'zoes', u'zone', u'zoo', u'zoomawayinc', u'zpass', u'zumiez', u'zupas', u'zurich', u'zushi', u'zva866', u'zynga', u'zzz7tx4vvvv', u'\xfdj']


In [11]:
# list every vocabulary token that contains 'amazon'
# a list comprehension prints the matches on both Python 2 and Python 3, whereas
# print(filter(...)) shows only a lazy "<filter object ...>" on Python 3
print([token for token in X_tokens if 'amazon' in token])

[u'amazon', u'amazonprime', u'tipamazon', u'usamazon', u'waamazon', u'waamazonprime']


In [12]:
# examine the vocabulary and document-term matrix together
# only the first 10 rows are densified: calling .toarray() on all of X_dtm
# allocates a full (documents x tokens) dense array just to render a preview
# X_tokens (built from this same fitted vectorizer) supplies the column labels
pd.DataFrame(X_dtm[:10].toarray(), columns=X_tokens)

Unnamed: 0,00,000,0000,0000ashburn,0000milledgevillega,0000santa,0001,0003,0003key,0006,...,zoomawayinc,zpass,zumiez,zupas,zurich,zushi,zva866,zynga,zzz7tx4vvvv,ýj
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# partition the descriptions into 100 clusters; random_state is fixed so the
# resulting labels are reproducible between runs
from sklearn.cluster import KMeans
km = KMeans(random_state=1, n_clusters=100).fit(X_dtm)
# show the fitted estimator and its parameters
km

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [14]:
# cluster index assigned to each description, in the same row order as X_dtm
km.labels_

array([19, 58, 94, ..., 74, 44, 16], dtype=int32)

In [15]:
# attach each description's cluster label as a new column (modifies `card` in
# place) so clusters can be inspected next to the text
card['cluster'] = km.labels_

In [16]:
# rows whose description contains "uber" in any letter case, shown with the
# cluster each one was assigned to
card.loc[card['desc'].str.contains('[Uu][Bb][Ee][Rr]')]

Unnamed: 0,desc,cluster
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
34,UBER *US JUN29 6SX63,69
42,UBER US JUN30 FQLEV HELP.UBER.COMCA,21
202,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
502,UBER US JUL04 3XCA7 XXXXXXXXXX CA,69
613,UBER US JUL02 6T237 XXXXXXXXXX CA,69
623,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
722,UBER US JUL02 6GG45 XXXXXXXXXX CA,69
736,UBER US JUL02 YZ762 HELP.UBER.COMCA,21
825,UBER US JUL01 EWJ56 XXXXXXXXXX CA,69


## K-Means on a Smaller, Labeled Dataset

In [17]:
import pandas as pd
# replace `card` with the smaller tab-separated file, which carries a merchant
# label next to each description
card = pd.read_table(
    'data/card-class.tsv',
    encoding='ISO-8859-1',
    header=None,  # treat the first line as data, not as column names
    names=['desc', 'merchant'],
)

In [19]:
# preview the first rows: description text plus its merchant label
card.head()

Unnamed: 0,desc,merchant
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,UBER
2,Amazon.com AMZN.COM/BILLWA,AMAZON
3,AmazonPrime Membership amzn.com/prmeWA,AMAZON
4,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON


In [20]:
# (number of rows, number of columns) of the labeled frame
card.shape

(405, 2)

In [21]:
# only the description text is vectorized; the merchant label is not used for clustering
X = card['desc']
X.shape

(405,)

In [22]:
# learn this dataset's vocabulary, then use it to create a document-term matrix
# NOTE: this re-fits the `vect` instance created earlier in the notebook, so the
# vocabulary learned from the previous dataset is replaced
X_dtm = vect.fit_transform(X)
# display the sparse-matrix summary (shape and number of stored elements)
X_dtm

<405x246 sparse matrix of type '<type 'numpy.int64'>'
	with 2194 stored elements in Compressed Sparse Row format>

In [23]:
# store the vocabulary of X as a plain list of tokens
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the replacement and fall back to the legacy
# method only on installs that predate it
try:
    X_tokens = list(vect.get_feature_names_out())
except AttributeError:
    X_tokens = vect.get_feature_names()
# number of distinct tokens (equals the column count of X_dtm)
len(X_tokens)

246

In [24]:
# peek at the 50 tokens that sort first in the vocabulary
print(X_tokens[:50])

[u'103', u'34mbl', u'3xca7', u'423', u'46', u'4jb3h', u'4th6x', u'4uk866', u'4vp3z', u'576', u'5lrel', u'5slob', u'6gg45', u'6k346', u'6oisv', u'6sx63', u'6t237', u'6try4', u'706', u'7b47s', u'7tsud', u'8381', u'866', u'86xxx', u'97', u'a18t8t7s', u'ab3gy', u'albuquerque', u'alexandria', u'alon', u'am', u'amazon', u'amazonprime', u'amxxx', u'amzn', u'angeles', u'arlington', u'auburn', u'austin', u'auth', u'aws', u'awsla', u'awtts', u'ax6z4', u'b7vse', u'bayside', u'bd2hx', u'beacva', u'berlin', u'bggev']


In [25]:
# partition the descriptions into 3 clusters; random_state is fixed so the
# resulting labels are reproducible between runs
from sklearn.cluster import KMeans
km = KMeans(random_state=1, n_clusters=3).fit(X_dtm)
# show the fitted estimator and its parameters
km

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [26]:
# cluster index (0, 1 or 2) assigned to each description, in row order
km.labels_

array([0, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 1, 1, 1, 0, 0, 0, 1, 0, 2, 1,
       0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 0,
       0, 2, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0, 2, 1, 0, 2, 1, 0, 1, 2, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       2, 0, 1, 0, 0, 2, 0, 0, 0, 0, 2, 0, 1, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1,
       1, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 2, 0, 0, 1, 2, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 2,
       0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 2, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 2, 2, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 2,
       2, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0,
       0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 0, 1, 0, 2, 0, 1, 0, 1,
       2, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 2, 2, 0, 1, 1, 1, 0, 2, 0, 1, 2, 0,
       0, 0,

In [27]:
# attach each description's cluster label as a new column (modifies `card` in
# place) so it can be compared with the known merchant
card['cluster'] = km.labels_

In [29]:
# compare the known merchant with the assigned cluster on a bounded preview
# rather than rendering every row of the frame
card.head(10)

Unnamed: 0,desc,merchant,cluster
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,0
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,UBER,2
2,Amazon.com AMZN.COM/BILLWA,AMAZON,0
3,AmazonPrime Membership amzn.com/prmeWA,AMAZON,0
4,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,0
5,UBER *US JUN29 6SX63,UBER,1
6,UBER US JUN30 FQLEV HELP.UBER.COMCA,UBER,1
7,AMAZON.COM AMZN.COM/BI,AMAZON,0
8,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,0
9,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,0


## Using Naive Bayes Classification (Supervised Learning)


In [31]:
import pandas as pd
# reload the labeled file so `card` starts without the 'cluster' column that was
# added during the clustering section
card = pd.read_table(
    'data/card-class.tsv',
    encoding='ISO-8859-1',
    header=None,  # treat the first line as data, not as column names
    names=['desc', 'merchant'],
)
card.head()

Unnamed: 0,desc,merchant
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,UBER
2,Amazon.com AMZN.COM/BILLWA,AMAZON
3,AmazonPrime Membership amzn.com/prmeWA,AMAZON
4,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON


In [32]:
# convert label to a numerical variable
merchant_codes = {'AMAZON': 1, 'UBER': 2, '7-ELEVEN': 3}
card['merchant_num'] = card.merchant.map(merchant_codes)

# Series.map() silently yields NaN for any merchant missing from the mapping;
# fail here with the offending names instead of letting NaN labels reach the model
unmapped = card.loc[card['merchant_num'].isnull(), 'merchant'].unique()
assert len(unmapped) == 0, 'merchants without a numeric code: {}'.format(list(unmapped))

In [35]:
# check that the conversion worked: each merchant name should sit next to its
# numeric code (AMAZON -> 1, UBER -> 2, 7-ELEVEN -> 3)
card.head(10)

Unnamed: 0,desc,merchant,merchant_num
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,1
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,UBER,2
2,Amazon.com AMZN.COM/BILLWA,AMAZON,1
3,AmazonPrime Membership amzn.com/prmeWA,AMAZON,1
4,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,1
5,UBER *US JUN29 6SX63,UBER,2
6,UBER US JUN30 FQLEV HELP.UBER.COMCA,UBER,2
7,AMAZON.COM AMZN.COM/BI,AMAZON,1
8,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,1
9,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA,AMAZON,1


In [43]:
# CountVectorizer is fed the 'desc' column as a one-dimensional Series of text;
# y is the numeric merchant label the classifier will learn to predict
X, y = card['desc'], card['merchant_num']
print(X.shape)
print(y.shape)

(405,)
(405,)


In [44]:
# split X and y into training and testing sets
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and
# the old sklearn.cross_validation module was removed in 0.20, so try the
# current location first and fall back only on legacy installs
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# random_state is fixed so the same rows land in each split on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)

(303,)
(102,)


In [45]:
# instantiate the vectorizer
# NOTE: relies on CountVectorizer having been imported by an earlier cell
vect = CountVectorizer()

# learn the vocabulary from the training split only, then use it to create a
# document-term matrix
X_train_dtm = vect.fit_transform(X_train)

# examine the document-term matrix
X_train_dtm

<303x208 sparse matrix of type '<type 'numpy.int64'>'
	with 1663 stored elements in Compressed Sparse Row format>

In [52]:
# store the vocabulary learned from X_train as a plain list of tokens
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the replacement and fall back to the legacy
# method only on installs that predate it
try:
    X_tokens = list(vect.get_feature_names_out())
except AttributeError:
    X_tokens = vect.get_feature_names()
# show a small sample of the tokens (positions 1 through 9)
X_tokens[1:10]

[u'3xca7',
 u'423',
 u'46',
 u'4jb3h',
 u'4th6x',
 u'4vp3z',
 u'5lrel',
 u'5slob',
 u'6gg45']

In [57]:
# transform testing data (using fitted vocabulary) into a document-term matrix
# transform() does not re-fit, so the columns stay those learned from X_train
X_test_dtm = vect.transform(X_test)
# display the sparse-matrix summary (shape and number of stored elements)
X_test_dtm

<102x208 sparse matrix of type '<type 'numpy.int64'>'
	with 491 stored elements in Compressed Sparse Row format>

In [58]:
# import and instantiate a Multinomial Naive Bayes model (default parameters);
# it will be trained on the token-count features produced by CountVectorizer
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [59]:
# train the model using X_train_dtm (token counts) and y_train (numeric merchant labels)
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [60]:
# make class predictions for X_test_dtm (one predicted merchant code per test row)
y_pred = nb.predict(X_test_dtm)

In [62]:
# calculate accuracy of class predictions: the fraction of test rows whose
# predicted label equals the true label
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

1.0

In [64]:
# print the confusion matrix: rows are true classes and columns are predicted
# classes, so off-diagonal entries count misclassified test rows
metrics.confusion_matrix(y_test, y_pred)

array([[67,  0,  0],
       [ 0, 19,  0],
       [ 0,  0, 16]])