In [1]:
# for Python 2: use print only as a function
from __future__ import print_function

In [100]:
# load the card descriptions into a one-column DataFrame via a relative path;
# header=None together with names=['desc'] reads the first line as data and labels the column 'desc'
# NOTE(review): sep='|' presumably keeps each line as a single field - confirm against the file
import pandas as pd
path = 'data/card-data.csv'
card = pd.read_csv(
    path,
    sep='|',
    encoding='ISO-8859-1',
    header=None,
    names=['desc'],
)

In [101]:
# preview the first rows of the loaded descriptions
card.head()

Unnamed: 0,desc
0,AMAZON MKTPLACE PMTS AMZN.COM/BILLWA
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA
2,COSTCO WHSE #0626 PRINCE WILLIAVA
3,CHEVRON XXXXXXX DICKINSON TX
4,WEGMANS #007 STERLING VA


In [102]:
# (rows, columns) of the loaded frame
card.shape

(9999, 1)

In [104]:
# take the description column as X, the text input for CountVectorizer
# (only X is defined here; this cell does not create a target y)
X = card['desc']
print(X.shape, type(X))

(9999,) <class 'pandas.core.series.Series'>


In [65]:
# import and instantiate CountVectorizer with no arguments, so every option keeps its default;
# the bare `vect` on the last line displays the resulting configuration
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [67]:
# learn the vocabulary of X, then use it to build the document-term matrix in a single call;
# the bare `X_dtm` on the last line displays a summary of the resulting sparse matrix
X_dtm = vect.fit_transform(X)
X_dtm

<9999x10091 sparse matrix of type '<type 'numpy.int64'>'
	with 49182 stored elements in Compressed Sparse Row format>

In [79]:
# keep the vocabulary learned from X as X_tokens, then report how many tokens it holds
# NOTE(review): newer scikit-learn releases provide get_feature_names_out() in place of
# this method - confirm against the installed version before re-running
X_tokens = vect.get_feature_names()
len(X_tokens)

10091

In [80]:
# show the 50 tokens at the start of the vocabulary list
print(X_tokens[:50])

[u'00', u'000', u'0000', u'0000ashburn', u'0000milledgevillega', u'0000santa', u'0001', u'0003', u'0003key', u'0006', u'0007', u'0009', u'000andrews', u'000jacksonville', u'000xxx', u'0011', u'0014', u'0016', u'0018', u'0019', u'002', u'0020', u'0022', u'0023', u'0029', u'003', u'0033', u'0034', u'0035', u'0036ellicott', u'0039', u'004', u'0045', u'0047', u'0048', u'005', u'0051', u'0053', u'0057', u'006', u'0063', u'0067', u'0068', u'007', u'0070', u'0071', u'0073', u'0075', u'0077', u'0078']


In [83]:
# examine the last 50 tokens (the negative start index counts back from the end of the list)
print(X_tokens[-50:])

[u'yorknj', u'yorktown', u'yorkville', u'yoshinoya', u'you', u'young', u'youngstown', u'youpayment', u'your', u'youreference', u'youth', u'yp', u'ysidro', u'yucaipa', u'yukon', u'yume', u'yunibasu', u'yuofz', u'yz762', u'z102ke7l5rwz102ke7l5rw1', u'z3xnw', u'zagg', u'zaika', u'zap', u'zappos', u'zara', u'zaxby', u'zen', u'zephyrhills', u'zero', u'zerorez', u'zh', u'zhouxiaoyan', u'zingerman', u'zio', u'zipcar', u'zmhgk', u'zoes', u'zone', u'zoo', u'zoomawayinc', u'zpass', u'zumiez', u'zupas', u'zurich', u'zushi', u'zva866', u'zynga', u'zzz7tx4vvvv', u'\xfdj']


In [156]:
# list every vocabulary token that contains the substring 'amazon'.
# A list comprehension is used instead of filter(): on Python 3 filter() returns a lazy
# iterator, so printing it would show only the iterator's repr rather than the matches.
print([token for token in X_tokens if 'amazon' in token])

[u'amazon', u'amazonprime', u'tipamazon', u'usamazon', u'waamazon', u'waamazonprime']


In [64]:
# examine the vocabulary and document-term matrix together.
# Only the first rows are converted to a dense array: densifying the entire sparse matrix
# would allocate a full rows x vocabulary array of mostly zeros just to render a preview.
pd.DataFrame(X_dtm[:10].toarray(), columns=vect.get_feature_names())

Unnamed: 0,00,000,0000,0000ashburn,0000milledgevillega,0000santa,0001,0003,0003key,0006,...,zoomawayinc,zpass,zumiez,zupas,zurich,zushi,zva866,zynga,zzz7tx4vvvv,ýj
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [181]:
# fit k-means on the document-term matrix, asking for 100 clusters with random_state pinned to 1
from sklearn.cluster import KMeans
km = KMeans(
    random_state=1,
    n_clusters=100,
)
km.fit(X_dtm)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=100, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=1, tol=0.0001,
    verbose=0)

In [182]:
# cluster labels produced by the fitted model for the rows it was fitted on
km.labels_

array([19, 58, 94, ..., 74, 44, 16], dtype=int32)

In [183]:
# attach the cluster labels to the descriptions as a new 'cluster' column (modifies card in place)
card['cluster'] = km.labels_

In [184]:
# rows whose description contains "uber" in any mix of upper/lower case, shown with their cluster labels
card.loc[card['desc'].str.contains('[Uu][Bb][Ee][Rr]')]

Unnamed: 0,desc,cluster
1,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
34,UBER *US JUN29 6SX63,69
42,UBER US JUN30 FQLEV HELP.UBER.COMCA,21
202,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
502,UBER US JUL04 3XCA7 XXXXXXXXXX CA,69
613,UBER US JUL02 6T237 XXXXXXXXXX CA,69
623,UBER TECHNOLOGIES INC XXX-XXX-XXXX CA,58
722,UBER US JUL02 6GG45 XXXXXXXXXX CA,69
736,UBER US JUL02 YZ762 HELP.UBER.COMCA,21
825,UBER US JUL01 EWJ56 XXXXXXXXXX CA,69
