In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import DataFrame as df

import sklearn
from pandas import pivot_table, read_clipboard
import os, sys, traceback

from sklearn.linear_model import LinearRegression

In [2]:
from sklearn.feature_extraction import DictVectorizer

# One-hot encoding of a categorical feature: every distinct 'city' value
# becomes its own indicator column in the encoded matrix.
instances = [
    {'city': 'New York'},
    {'city': 'San Francisco'},
    {'city': 'Chapel Hill'},
]
onehot_encoder = DictVectorizer()
encoded_instances = onehot_encoder.fit_transform(instances)
# fit_transform() returns a sparse matrix; densify it for display.
print(encoded_instances.toarray())

[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# corpus is a list of text documents
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich',
]

# Bag-of-words: the default tokenizer lowercases the text, keeps tokens of
# at least 2 characters (so 'I' and 'a' are dropped) and counts each of them.
vectorizer = CountVectorizer()
term_counts = vectorizer.fit_transform(corpus)
dense_corp = term_counts.todense()

print(dense_corp)
# vocabulary_ maps each token to its column index in dense_corp
print(vectorizer.vocabulary_)

# Show every document next to its own row of counts.
for doc_index in range(len(corpus)):
    print(corpus[doc_index] + " = " + str(dense_corp[doc_index]))

[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
{u'duke': 2, u'basketball': 1, u'lost': 5, u'played': 6, u'in': 4, u'game': 3, u'sandwich': 7, u'unc': 9, u'ate': 0, u'the': 8}
UNC played Duke in basketball = [[0 1 1 0 1 0 1 0 0 1]]
Duke lost the basketball game = [[0 1 1 1 0 1 0 0 1 0]]
I ate a sandwich = [[1 0 0 0 0 0 0 1 0 0]]


In [44]:
from sklearn.metrics.pairwise import euclidean_distances

# Euclidean distance measures how far apart two documents' count vectors are:
# the smaller the distance, the more similar the texts.
# Relies on `corpus` and `dense_corp` from the CountVectorizer cell above.
counts = dense_corp.astype(int).tolist()

for ordinal, document in zip(('1st', '2nd', '3rd'), corpus):
    print('%s document: %s' % (ordinal, document))
print(counts)

# euclidean_distances() works on 2-D (n_samples, n_features) inputs, so each
# count vector is wrapped in a list to form a one-row matrix. Passing the bare
# 1-D vectors only worked on old scikit-learn releases (with a deprecation
# warning) and is rejected by newer ones.
document_pairs = [
    ('1st', '2nd', 0, 1),
    ('1st', '3rd', 0, 2),
    ('2nd', '3rd', 1, 2),
]
for left_name, right_name, left, right in document_pairs:
    distance = euclidean_distances([counts[left]], [counts[right]])
    print('Distance between %s and %s documents: %s' % (left_name, right_name, distance))

1st document: UNC played Duke in basketball
2nd document: Duke lost the basketball game
3rd document: I ate a sandwich
[[0, 1, 1, 0, 1, 0, 1, 0, 0, 1], [0, 1, 1, 1, 0, 1, 0, 0, 1, 0], [1, 0, 0, 0, 0, 0, 0, 1, 0, 0]]
Distance between 1st and 2nd documents: [[ 2.44948974]]
Distance between 1st and 3rd documents: [[ 2.64575131]]
Distance between 2nd and 3rd documents: [[ 2.64575131]]


In [45]:
# STOP WORDS
from sklearn.feature_extraction.text import CountVectorizer

# corpus is a list of text documents
corpus = [
    'UNC played Duke in basketball',
    'Duke lost the basketball game',
    'I ate a sandwich',
]

# Same tokenisation as before (lowercased tokens of at least 2 characters), but:
#   * stop_words='english' drops common English words such as 'in' and 'the'
#   * binary=True records only whether a token occurs (0/1), not how often
vectorizer = CountVectorizer(binary=True, stop_words='english')
term_presence = vectorizer.fit_transform(corpus)
dense_corp = term_presence.todense()

print(dense_corp)
# vocabulary_ maps each remaining token to its column index in dense_corp
print(vectorizer.vocabulary_)

# Show every document next to its own row of the matrix.
for doc_index in range(len(corpus)):
    print(corpus[doc_index] + " = " + str(dense_corp[doc_index]))

[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}
UNC played Duke in basketball = [[0 1 1 0 0 1 0 1]]
Duke lost the basketball game = [[0 1 1 1 1 0 0 0]]
I ate a sandwich = [[1 0 0 0 0 0 1 0]]


In [47]:
#Stemming/Lemmatization
import nltk
# One-time setup: uncomment the call below to launch the NLTK downloader.
# NOTE(review): the later cells presumably need the WordNet corpus plus the
# tokenizer and POS-tagger models from it -- confirm which packages are required.
#nltk.download()

showing info http://nltk.github.com/nltk_data/


True

In [48]:
from nltk import pos_tag, word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatization depends on the part of speech passed as the second argument:
# the same word is looked up as a verb ('v') and then as a noun ('n').
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('gathering', 'v'))
print(lemmatizer.lemmatize('gathering', 'n'))

# Stemming takes only the word itself; no part of speech is supplied.
stemmer = PorterStemmer()
print(stemmer.stem('gathering'))

wordnet_tags = ['n', 'v']
corpus = ['He ate the sandwiches', 'Every sandwich was eaten by him']

# Stem every token of every document, keeping one token list per document.
stemmed_corpus = []
for document in corpus:
    stemmed_corpus.append([stemmer.stem(token) for token in word_tokenize(document)])
print('Stemmed: %s' % (stemmed_corpus,))

def lemmatize(token, tag):
    """Lemmatize `token` when its part-of-speech tag marks a noun or a verb.

    Args:
        token: the word to normalise.
        tag: part-of-speech tag for `token`. Only its first character is
            inspected, case-insensitively. An empty or missing tag is
            treated as "neither noun nor verb".

    Returns:
        The result of the module-level `lemmatizer.lemmatize(token, pos)` when
        the tag starts with 'n' or 'v'; otherwise `token` unchanged.
    """
    # Slice instead of indexing so an empty tag gives '' rather than IndexError;
    # the truthiness guard also covers a tag of None.
    pos = tag[:1].lower() if tag else ''
    if pos in ('n', 'v'):
        return lemmatizer.lemmatize(token, pos)
    return token

# lemmatize() looks up the module-level `lemmatizer` at call time, so it will
# use the instance created here.
lemmatizer = WordNetLemmatizer()

# Tokenise and POS-tag each document; every entry is a list of (token, tag) pairs.
tagged_corpus = []
for document in corpus:
    tagged_corpus.append(pos_tag(word_tokenize(document)))

lemmatized_corpus = [
    [lemmatize(token, tag) for token, tag in tagged_document]
    for tagged_document in tagged_corpus
]
print('Lemmatized: %s' % (lemmatized_corpus,))


gather
gathering
gather
Stemmed: [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']]
Lemmatized: [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']]


In [59]:
#TRANSFORMED
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'The dog ate a sandwich and I ate a sandwich',
    'The wizard transfigured a sandwich',
    'sandwich is the capital of germany',
]

# Weight terms by tf-idf instead of raw counts, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)
# Densified copy, reused by the distance cell that follows.
sandwich_copr_vectors = tfidf_matrix.todense()
print(sandwich_copr_vectors)

[[ 0.79085927  0.          0.39542964  0.          0.46709423  0.          0.        ]
 [ 0.          0.          0.          0.          0.38537163  0.65249088
   0.65249088]
 [ 0.          0.65249088  0.          0.65249088  0.38537163  0.          0.        ]]


In [60]:
from sklearn.metrics.pairwise import euclidean_distances

# Euclidean distance measures how far apart two documents' tf-idf vectors are:
# the smaller the distance, the more similar the texts.
# Relies on `corpus` and `sandwich_copr_vectors` from the TfidfVectorizer cell above.
counts = sandwich_copr_vectors.astype(float).tolist()

for ordinal, document in zip(('1st', '2nd'), corpus):
    print('%s document: %s' % (ordinal, document))
print(counts)

# euclidean_distances() works on 2-D (n_samples, n_features) inputs, so each
# vector is wrapped in a list to form a one-row matrix. Passing the bare 1-D
# vectors only worked on old scikit-learn releases (with a deprecation
# warning) and is rejected by newer ones.
document_pairs = [
    ('1st', '2nd', 0, 1),
    ('1st', '3rd', 0, 2),
    ('2nd', '3rd', 1, 2),
]
for left_name, right_name, left, right in document_pairs:
    distance = euclidean_distances([counts[left]], [counts[right]])
    print('Distance between %s and %s documents: %s' % (left_name, right_name, distance))

1st document: The dog ate a sandwich and I ate a sandwich
2nd document: The wizard transfigured a sandwich
[[0.7908592715238688, 0.0, 0.3954296357619344, 0.0, 0.467094225832347, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.3853716274664007, 0.652490884512534, 0.652490884512534], [0.0, 0.652490884512534, 0.0, 0.652490884512534, 0.3853716274664007, 0.0, 0.0]]
Distance between 1st and 2nd documents: [[ 1.28062105]]
Distance between 1st and 3rd documents: [[ 1.28062105]]
Distance between 2nd and 3rd documents: [[ 1.30498177]]


In [50]:
from sklearn.feature_extraction.text import HashingVectorizer

corpus = ['the', 'ate', 'bacon', 'cat']

# n_features is optional; here it limits the output to 6 columns.
vectorizer = HashingVectorizer(n_features=6)
# transform() is called directly, without a preceding fit().
hashed_corpus = vectorizer.transform(corpus)
print(hashed_corpus.todense())

[[-1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  1.  0.  0.]
 [ 0.  0.  0.  0. -1.  0.]
 [ 0.  1.  0.  0.  0.  0.]]


In [51]:

#Extract features from images
from sklearn import datasets

digits = datasets.load_digits()
first_image = digits.images[0]

print('Digit: %s' % (digits.target[0],))
print(first_image)
# Flatten the image into a single row of 64 pixel intensities.
print('Feature vector:\n%s' % (first_image.reshape(-1, 64),))

Digit: 0
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
Feature vector:
[[  0.   0.   5.  13.   9.   1.   0.   0.   0.   0.  13.  15.  10.  15.
    5.   0.   0.   3.  15.   2.   0.  11.   8.   0.   0.   4.  12.   0.
    0.   8.   8.   0.   0.   5.   8.   0.   0.   9.   8.   0.   0.   4.
   11.   0.   1.  12.   7.   0.   0.   2.  14.   5.  10.  12.   0.   0.
    0.   0.   6.  13.  10.   0.   0.   0.]]


In [None]:
#using points of interest
>>> import numpy as nps
>>> from skimage.feature import corner_harris, corner_peaks
>>> from skimage.color import rgb2gray
>>> import matplotlib.pyplot as plt
>>> import skimage.io as io
>>> from skimage.exposure import equalize_hist
def show_corners(corners, image):
    """Display `image` in grayscale with the given corner points marked in red.

    Args:
        corners: iterable of (row, column) pairs; may be empty, in which case
            only the image is shown.
        image: array with a `.shape` of at least two dimensions, in a form
            accepted by `plt.imshow`.

    Returns:
        None. The figure is rendered with `plt.show()`.
    """
    fig = plt.figure()
    plt.gray()
    plt.imshow(image)

    # zip(*corners) cannot be unpacked into two names when there are no
    # corners, so only plot the markers when at least one was found.
    points = list(corners)
    if points:
        y_corner, x_corner = zip(*points)
        plt.plot(x_corner, y_corner, 'or')

    plt.xlim(0, image.shape[1])
    # Upper limit first: keeps row 0 at the top of the plot.
    plt.ylim(image.shape[0], 0)
    # Enlarge the default figure size by 50%.
    fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
    plt.show()
    
# NOTE(review): the original path literal was split across two lines (a syntax
# error). It is joined verbatim here -- verify the directory name and point
# this at your local copy of the image.
MANDRILL_PATH = '/home/gavin/PycharmProjects/masteringmachine-learning/ch4/img/mandrill.png'

mandrill = io.imread(MANDRILL_PATH)
# Convert to grayscale, then equalise the histogram before corner detection.
mandrill = equalize_hist(rgb2gray(mandrill))
# Harris corner response, then keep peaks at least 2 pixels apart.
corners = corner_peaks(corner_harris(mandrill), min_distance=2)
show_corners(corners, mandrill)

In [52]:
from sklearn import preprocessing
import numpy as np

# Three samples (rows) with six features (columns).
X = np.array([
    [0., 0., 5., 13., 9., 1.],
    [0., 0., 13., 15., 10., 15.],
    [0., 3., 15., 2., 0., 11.],
])

# Standardise the features and display the scaled matrix.
X_scaled = preprocessing.scale(X)
print(X_scaled)


[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]
