In [9]:
# Numeric feature extraction
# Multi-dimensional features need to be preprocessed before they are used
from sklearn import preprocessing

In [10]:
import numpy as np

In [53]:
# Toy 3x3 float matrix; the preprocessing calls below treat each row as one sample.
X = np.array([
    [1.0, -1.0,  2.0],
    [2.0,  0.0,  0.0],
    [0.0,  1.0, -1.0],
])

In [54]:
# Standardization: rescale each feature (column) to zero mean and unit variance.
X_scaled = preprocessing.scale(X)

In [55]:
X_scaled

array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [56]:
# Normalization (not regularization): rescale each sample (row) to unit L2 norm.
# This is a per-sample preprocessing step; by itself it does not prevent overfitting.
X_normalized = preprocessing.normalize(X, norm='l2')

In [58]:
X_normalized

array([[ 0.40824829, -0.40824829,  0.81649658],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  0.70710678, -0.70710678]])

In [59]:
# Min-max scaling: linearly map each feature (column) into the range [0, 1].
# feature_range=(0, 1) is the library default, spelled out here for the reader.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
X_train_minmax = min_max_scaler.fit_transform(X)

In [60]:
X_train_minmax

array([[0.5       , 0.        , 1.        ],
       [1.        , 0.5       , 0.33333333],
       [0.        , 1.        , 0.        ]])

In [61]:
# Text feature extraction starts here

In [None]:
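# Extracting features from text is more involved than from numeric data: in essence the text is split into words, and each distinct word becomes a new feature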


In [34]:
# 'name' is categorical: each distinct value is expanded into its own feature.
# 'water' is numeric and can be used as a feature directly.
measurements = [
    dict(name='white hat', water=90),
    dict(name='gray hat', water=75),
    dict(name='black hat', water=5),
]

In [35]:
from sklearn.feature_extraction import DictVectorizer

In [36]:
# Vectorizer that turns a list of feature dicts into a numeric feature matrix.
vec = DictVectorizer()

In [37]:
# Fit on the dicts and show the dense result: each string-valued 'name' becomes a
# one-hot column, while the numeric 'water' value is carried over unchanged.
vec.fit_transform(measurements).toarray()

array([[ 0.,  0.,  1., 90.],
       [ 0.,  1.,  0., 75.],
       [ 1.,  0.,  0.,  5.]])

In [63]:
# Column labels of the vectorized matrix. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its replacement.
# .tolist() turns the returned ndarray into a plain list of str for display.
vec.get_feature_names_out().tolist()

['name=black hat', 'name=gray hat', 'name=white hat', 'water']

In [64]:
# Text feature extraction has two important models: the set-of-words model and the bag-of-words model.

In [41]:
# Small corpus for the text demo; the bag-of-words model is the usual choice
# when extracting features from documents.
notes = [
    "This is hack notes",
    "my name is 0pt1mus",
    "I study ML in hack notes",
]

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
# min_df=1 keeps every term that occurs in at least one document (the library default).
vectorizer = CountVectorizer(min_df=1)

In [45]:
# Learn the vocabulary from `notes` and build the sparse document-term count matrix.
# NOTE(review): this rebinds `X`, which earlier held the dense numeric array;
# re-running the scaling cells after this one would operate on the sparse matrix.
X = vectorizer.fit_transform(notes)

In [46]:
X

<3x10 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [47]:
# Learned vocabulary, in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; get_feature_names_out() is its replacement.
# .tolist() turns the returned ndarray into a plain list of str for display.
vectorizer.get_feature_names_out().tolist()

['0pt1mus', 'hack', 'in', 'is', 'ml', 'my', 'name', 'notes', 'study', 'this']

In [48]:
X.toarray()

array([[0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 0, 1, 0, 1, 1, 0, 0, 0],
       [0, 1, 1, 0, 1, 0, 0, 1, 1, 0]], dtype=int64)

In [65]:
# Verify the effect with a worked example

In [66]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

In [67]:
# Load the bundled Iris dataset (feature matrix in .data, class labels in .target).
iris = datasets.load_iris()

In [69]:
iris.data.shape

(150, 4)

In [70]:
iris.target.shape

(150,)

In [71]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [72]:
# Preview only the first rows; echoing the whole 150x4 array floods the notebook.
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [73]:
# Hold out 40% of the samples for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.4,
    random_state=0,
)

In [75]:
X_train.shape, Y_train.shape

((90, 4), (90,))

In [76]:
X_test.shape, Y_test.shape

((60, 4), (60,))

In [77]:
# Linear-kernel support vector classifier, trained on the training split.
clf = svm.SVC(kernel='linear', C=1)
clf.fit(X_train, Y_train);  # fit() returns the estimator; ';' keeps the cell from echoing it

In [78]:
# Mean accuracy of the trained classifier on the held-out test split.
clf.score(X_test, Y_test)

0.9666666666666667

In [82]:
# Probe one hand-written sample against candidate label 1: with a single sample,
# score() is 1.0 only if the model predicts exactly that label.
# (clf.predict would return the predicted class directly.)
clf.score([[5.9, 3.0, 5.1, 1.8]], [1.0])

0.0

In [83]:
# Probe one hand-written sample against candidate label 2: with a single sample,
# score() is 1.0 only if the model predicts exactly that label.
# (clf.predict would return the predicted class directly.)
clf.score([[5.9, 3.0, 5.1, 1.8]], [2.0])

1.0

In [84]:
# Probe one hand-written sample against candidate label 0: with a single sample,
# score() is 1.0 only if the model predicts exactly that label.
# (clf.predict would return the predicted class directly.)
clf.score([[5.9, 3.0, 5.1, 1.8]], [0.0])

0.0