In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### 加载数据

In [3]:
df_train = pd.read_csv('./new_data/train_set.csv')
df_test = pd.read_csv('./new_data/test_set.csv')

In [4]:
print(df_train.shape)
print(df_test.shape)

(102277, 4)
(102277, 3)


In [5]:
df_train.head(5)

Unnamed: 0,id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12


In [6]:
df_test.head(5)

Unnamed: 0,id,article,word_seg
0,0,7368 146447 316564 42610 55736 297797 93042 53...,816903 565958 726082 764656 335008 75094 20282...
1,1,985531 473628 1044285 1121849 206763 462208 11...,729468 520477 529032 101368 335130 520477 1113...
2,2,7368 7368 7368 7368 7368 7368 7368 7368 7368 7...,816903 816903 816903 816903 816903 816903 8169...
3,3,529819 1226459 856005 1177293 663773 272235 93...,231664 1033823 524850 330478 507199 520477 618...
4,4,42610 1252069 1077049 955883 1125260 1044285 2...,545370 379223 162767 520477 1194630 1197475 11...


In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102277 entries, 0 to 102276
Data columns (total 4 columns):
id          102277 non-null int64
article     102277 non-null object
word_seg    102277 non-null object
class       102277 non-null int64
dtypes: int64(2), object(2)
memory usage: 3.1+ MB


In [8]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102277 entries, 0 to 102276
Data columns (total 3 columns):
id          102277 non-null int64
article     102277 non-null object
word_seg    102277 non-null object
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [9]:
df_train['class'].describe()

count    102277.000000
mean         10.262356
std           5.370785
min           1.000000
25%           6.000000
50%          10.000000
75%          15.000000
max          19.000000
Name: class, dtype: float64

In [10]:
df_train['class'].value_counts()

3     8313
13    7907
9     7675
15    7511
18    7066
8     6972
6     6888
14    6740
19    5524
1     5375
12    5326
10    4963
4     3824
11    3571
16    3220
17    3094
7     3038
2     2901
5     2369
Name: class, dtype: int64

In [11]:
print(len(df_train['article'][0]))
print(len(df_train['word_seg'][0]))

7453
4484


In [12]:
df_train_y = df_train['class']
df_train_X = df_train.drop(['class'], axis=1)
print(df_train_y.head(5))
print(df_train_X.head(5))

0    14
1     3
2    12
3    13
4    12
Name: class, dtype: int64
   id                                            article  \
0   0  7368 1252069 365865 755561 1044285 129532 1053...   
1   1  581131 165432 7368 957317 1197553 570900 33659...   
2   2  7368 87936 40494 490286 856005 641588 145611 1...   
3   3  299237 760651 299237 887082 159592 556634 7489...   
4   4  7368 7368 7368 865510 7368 396966 995243 37685...   

                                            word_seg  
0  816903 597526 520477 1179558 1033823 758724 63...  
1  90540 816903 441039 816903 569138 816903 10343...  
2  816903 1012629 957974 1033823 328210 947200 65...  
3  563568 1239563 680125 780219 782805 1033823 19...  
4  816903 816903 816903 139132 816903 312320 1103...  


In [13]:
df_train_X.head(5)

Unnamed: 0,id,article,word_seg
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...


### 使用 train_test_split 对训练数据进行切分

In [18]:
train_X, test_X, train_y, test_y = train_test_split(df_train_X, df_train_y, test_size=0.3, random_state=2019)

### TF-IDF
#### TF(t,d) 即term frequency, 表示term出现在document的频率
#### IDF(t) 即inverse document frequency，逆文档频率，用来衡量term普遍重要性的指标，也即独特性

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
vectorizer = TfidfVectorizer()
vectorizer.fit(train_X['word_seg'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [25]:
tfidf_matrix_train_X = vectorizer.transform(train_X['word_seg'])

In [28]:
tfidf_matrix_train_X.shape

(71593, 719738)

### word2vec
#### Skip-gram: 根据当前词语预测上下文词语出现的频率
#### CBOW: 根据上下文词语预测当前词语出现的频率
#### 词向量的本质是one-hot encoding的低维稠密表示 N->K

In [29]:
from gensim.models.word2vec import Word2Vec

In [30]:
df_train_X['word_seg'].head(5)

0    816903 597526 520477 1179558 1033823 758724 63...
1    90540 816903 441039 816903 569138 816903 10343...
2    816903 1012629 957974 1033823 328210 947200 65...
3    563568 1239563 680125 780219 782805 1033823 19...
4    816903 816903 816903 139132 816903 312320 1103...
Name: word_seg, dtype: object

In [31]:
sentences = list(df_train_X['word_seg'].apply(lambda x: x.strip().split()))

In [32]:
sentences[0]

['816903',
 '597526',
 '520477',
 '1179558',
 '1033823',
 '758724',
 '632718',
 '422098',
 '520477',
 '414956',
 '625597',
 '1203094',
 '441513',
 '596474',
 '660569',
 '995362',
 '924085',
 '1278762',
 '1155376',
 '1109510',
 '823120',
 '1275770',
 '1203094',
 '520477',
 '831818',
 '970779',
 '1179558',
 '758724',
 '1033823',
 '367432',
 '995362',
 '340401',
 '599826',
 '520477',
 '907218',
 '1080651',
 '99188',
 '188983',
 '1238335',
 '264536',
 '834740',
 '321830',
 '327386',
 '469755',
 '295102',
 '520477',
 '106993',
 '663234',
 '670118',
 '566120',
 '960860',
 '1033823',
 '1032772',
 '528558',
 '1108771',
 '643003',
 '520477',
 '1191581',
 '560181',
 '335962',
 '1276143',
 '995362',
 '748896',
 '776840',
 '910464',
 '520477',
 '707234',
 '235866',
 '1113816',
 '814750',
 '636128',
 '520477',
 '1241273',
 '756099',
 '658505',
 '477703',
 '520477',
 '653967',
 '627858',
 '147022',
 '572824',
 '995362',
 '816903',
 '566120',
 '1255841',
 '785043',
 '1025743',
 '197563',
 '990423',
 

In [None]:
train_model = Word2Vec(sentences=sentences, size=300, window=5, min_count=5, workers=4, sg=0, iter=5)

#### LogisticRegression

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train_X['word_seg'], train_y)
prediction = lr.predict(test_X['word_seg'])
accuracy_score(test_y, prediction)

#### SVM

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(train_X['word_seg'], train_y)
prediction = svc.predict(test_X['word_seg'])
accuracy_score(test_y, prediction)