In [4]:
import pandas as pd
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn import naive_bayes
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from snownlp import SnowNLP

In [5]:
# Three short Chinese sentences (stock-related remarks) that serve as the toy
# corpus for every cell below.
a, b, c = (
    "虽然跌的比预期想的多，不改看6，望10。",
    "买了五万股，想一步买到位的，确套了，持有，还是割肉呢？",
    "该公司是业内龙头，只是业绩不稳定。",
)

In [8]:
# Segment each sentence with jieba and re-join the tokens with single spaces,
# producing one whitespace-delimited string per sentence (same order: a, b, c).
data = [" ".join(jieba.lcut(sentence)) for sentence in (a, b, c)]

In [9]:
# Bag-of-words model: build the document-term count matrix.
vect = CountVectorizer()
# Learn the vocabulary from the space-joined sentences and count every term per
# sentence. Note that the vocabulary displayed further down only contains tokens
# of two or more characters ('10' is kept, '6' is not): CountVectorizer's default
# token_pattern ignores single-character tokens.
wordmtx = vect.fit_transform(data)
# Shape is (number of sentences, vocabulary size).
print(wordmtx.shape)
# print(wordmtx)

(3, 17)


In [11]:
# print() on the sparse matrix lists every stored entry as "(row, column)  count";
# the bare repr would only show a one-line summary.
print(wordmtx)

  (0, 13)	1
  (0, 15)	1
  (0, 3)	1
  (0, 0)	1
  (1, 2)	1
  (1, 1)	1
  (1, 7)	1
  (1, 11)	1
  (1, 10)	1
  (1, 14)	1
  (1, 8)	1
  (2, 6)	1
  (2, 4)	1
  (2, 16)	1
  (2, 9)	1
  (2, 5)	1
  (2, 12)	1


In [12]:
# Expand the sparse counts into a dense 3x17 numpy matrix: one row per sentence,
# one column per vocabulary term.
wordmtx.todense()

matrix([[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
        [0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]], dtype=int64)

In [14]:
# Learned vocabulary: maps each token to the index of its column in wordmtx.
vect.vocabulary_

{'10': 0,
 '一步': 1,
 '万股': 2,
 '不改': 3,
 '业内': 4,
 '业绩': 5,
 '公司': 6,
 '到位': 7,
 '割肉': 8,
 '只是': 9,
 '持有': 10,
 '确套': 11,
 '稳定': 12,
 '虽然': 13,
 '还是': 14,
 '预期': 15,
 '龙头': 16}

In [16]:
# Feature names ordered by column index (entry k labels column k of wordmtx).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# it raises AttributeError on current releases; get_feature_names_out() is the
# replacement (requires scikit-learn >= 1.0). It returns a NumPy array, so
# .tolist() keeps the plain-list output this cell has always displayed.
vect.get_feature_names_out().tolist()

['10',
 '一步',
 '万股',
 '不改',
 '业内',
 '业绩',
 '公司',
 '到位',
 '割肉',
 '只是',
 '持有',
 '确套',
 '稳定',
 '虽然',
 '还是',
 '预期',
 '龙头']

In [17]:
# Dense copy of the counts as a plain ndarray (todense() returns a matrix instead).
wordmtx.toarray()

array([[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]], dtype=int64)

In [20]:
# Collect the raw (unsegmented) sentences in their original order and show them.
text_list = [a, b, c]
text_list

['虽然跌的比预期想的多，不改看6，望10。', '买了五万股，想一步买到位的，确套了，持有，还是割肉呢？', '该公司是业内龙头，只是业绩不稳定。']

In [21]:
# One-column table of the raw sentences, indexed 0..2, under the header "text".
pd.DataFrame({"text": text_list})

Unnamed: 0,text
0,虽然跌的比预期想的多，不改看6，望10。
1,买了五万股，想一步买到位的，确套了，持有，还是割肉呢？
2,该公司是业内龙头，只是业绩不稳定。


In [19]:
# Document-term table: one row per sentence, one column per vocabulary token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# the column labels now come from get_feature_names_out() (scikit-learn >= 1.0),
# which returns the same names in column order as a NumPy array.
pd.DataFrame(wordmtx.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,10,一步,万股,不改,业内,业绩,公司,到位,割肉,只是,持有,确套,稳定,虽然,还是,预期,龙头
0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
1,0,1,1,0,0,0,0,1,1,0,1,1,0,0,1,0,0
2,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,0,1


In [23]:
# One record per vocabulary entry: the token ("word") and the matrix column
# index it was assigned ("number"). Order follows vect.vocabulary_ iteration.
d = [
    {"word": token, "number": column}
    for token, column in vect.vocabulary_.items()
]
d

[{'number': 13, 'word': '虽然'},
 {'number': 8, 'word': '割肉'},
 {'number': 6, 'word': '公司'},
 {'number': 12, 'word': '稳定'},
 {'number': 14, 'word': '还是'},
 {'number': 15, 'word': '预期'},
 {'number': 10, 'word': '持有'},
 {'number': 9, 'word': '只是'},
 {'number': 4, 'word': '业内'},
 {'number': 16, 'word': '龙头'},
 {'number': 0, 'word': '10'},
 {'number': 1, 'word': '一步'},
 {'number': 5, 'word': '业绩'},
 {'number': 7, 'word': '到位'},
 {'number': 11, 'word': '确套'},
 {'number': 3, 'word': '不改'},
 {'number': 2, 'word': '万股'}]

In [25]:
# Tabulate the vocabulary records; the explicit column list pins the
# left-to-right order to word, then number.
pd.DataFrame(
    data=d,
    columns=["word", "number"],
)

Unnamed: 0,word,number
0,虽然,13
1,割肉,8
2,公司,6
3,稳定,12
4,还是,14
5,预期,15
6,持有,10
7,只是,9
8,业内,4
9,龙头,16


In [27]:
# The bare repr gives only a summary of the sparse matrix (shape, dtype, number
# of stored elements, storage format), not its entries.
wordmtx

<3x17 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [29]:
# Same dense counts as a table, but with no column labels supplied, so pandas
# numbers the columns 0..16 by position.
pd.DataFrame(data=wordmtx.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
1,0,1,1,0,0,0,0,1,1,0,1,1,0,0,1,0,0
2,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,0,1
