## python实现TF-IDF

In [17]:
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import math

In [9]:
def tf(word,doc_count):
    """Return the term frequency of ``word`` within a single document.

    Parameters
    ----------
    word : hashable
        The term to look up.
    doc_count : mapping
        Term -> occurrence count for one document (e.g. a ``Counter``).

    Returns
    -------
    float
        Occurrences of ``word`` divided by the total number of term
        occurrences in the document.
    """
    occurrences = doc_count[word]
    total_terms = sum(doc_count.values())
    return occurrences/total_terms

def doc_contain_word(word,doc_list):
    """Count how many documents in ``doc_list`` contain ``word``.

    Parameters
    ----------
    word : hashable
        The term to search for.
    doc_list : iterable
        Per-document containers supporting the ``in`` operator
        (e.g. term-count mappings).

    Returns
    -------
    int
        Number of documents for which ``word in document`` is true.
    """
    containing = 0
    for doc_count in doc_list:
        if word in doc_count:
            containing += 1
    return containing

def idf(word,doc_list):
    """Return the smoothed inverse document frequency of ``word``.

    Computes ``log(N / (1 + n_w))`` where ``N`` is the number of documents
    in ``doc_list`` and ``n_w`` is the number of those documents that
    contain ``word``. The ``1 +`` in the denominator avoids division by zero
    for unseen words; as a consequence the value is negative when ``word``
    occurs in every document.

    Parameters
    ----------
    word : hashable
        The term to score.
    doc_list : list
        Per-document term-count mappings (must support ``len``).

    Returns
    -------
    float
        The inverse document frequency.

    Raises
    ------
    ValueError
        If ``doc_list`` is empty (``math.log(0)``).
    """
    # Size the corpus from the argument itself, not from a notebook-level
    # global, so the function works for any corpus and in any cell order.
    total_docs = len(doc_list)
    return math.log(total_docs/(1+doc_contain_word(word,doc_list)))

def tfidf(word,doc_count,doc_list):
    """Return the TF-IDF weight of ``word`` for one document.

    Parameters
    ----------
    word : hashable
        The term to score.
    doc_count : mapping
        Term -> occurrence count for the document being scored.
    doc_list : list
        Per-document term-count mappings for the whole corpus.

    Returns
    -------
    float
        ``tf(word, doc_count) * idf(word, doc_list)``.
    """
    term_frequency = tf(word,doc_count)
    inverse_doc_frequency = idf(word,doc_list)
    return term_frequency*inverse_doc_frequency

## doc_count 为字典形式（如 Counter），统计了单篇文档的词频信息 {'python':23,...}
## doc_list 为列表形式，每个元素是一篇文档的 doc_count，如 [doc_count1,doc_count2,doc_count3]

In [3]:
# Load data: read only the first 5000 rows of each split.
df_train = pd.read_csv(
    'D:/NLP_datasets/daguan/new_data/train_set.csv',
    nrows=5000,
)
df_test = pd.read_csv(
    'D:/NLP_datasets/daguan/new_data/test_set.csv',
    nrows=5000,
)
                      

In [14]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,﻿id,article,word_seg,class
0,0,7368 1252069 365865 755561 1044285 129532 1053...,816903 597526 520477 1179558 1033823 758724 63...,14
1,1,581131 165432 7368 957317 1197553 570900 33659...,90540 816903 441039 816903 569138 816903 10343...,3
2,2,7368 87936 40494 490286 856005 641588 145611 1...,816903 1012629 957974 1033823 328210 947200 65...,12
3,3,299237 760651 299237 887082 159592 556634 7489...,563568 1239563 680125 780219 782805 1033823 19...,13
4,4,7368 7368 7368 865510 7368 396966 995243 37685...,816903 816903 816903 139132 816903 312320 1103...,12


In [12]:
## Compute term counts: one doc_count (a Counter) per document.
# `.values` replaces Series.as_matrix(), which was deprecated in pandas 0.23
# and removed in pandas 1.0.
np_train_seg = df_train['word_seg'].values
# Each document is a whitespace-separated string of integer word ids.
# split() with no argument tolerates repeated/trailing whitespace, which
# split(' ') would turn into empty strings that int() cannot parse.
count_list = [Counter(map(int, doc.split())) for doc in np_train_seg]

In [18]:
# For each of the first five documents, print the three highest-scoring words.
for doc_index, doc_counter in enumerate(count_list[:5]):
    print("Top words in document {}".format(doc_index + 1))
    # Score every distinct word of this document against the whole corpus.
    word_scores = {}
    for term in doc_counter:
        word_scores[term] = tfidf(term, doc_counter, count_list)
    ranked = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)
    top_three = ranked[:3]
    for term, weight in top_three:
        print("\tWord: {}, TF-IDF: {}".format(term, round(weight, 5)))


Top words in document 1
	Word: 1241273, TF-IDF: 0.0524
	Word: 1255841, TF-IDF: 0.04426
	Word: 426552, TF-IDF: 0.04407
Top words in document 2
	Word: 266161, TF-IDF: 0.09965
	Word: 432549, TF-IDF: 0.06191
	Word: 585149, TF-IDF: 0.02811
Top words in document 3
	Word: 710270, TF-IDF: 0.13465
	Word: 856385, TF-IDF: 0.04492
	Word: 701424, TF-IDF: 0.04167
Top words in document 4
	Word: 1239563, TF-IDF: 0.10506
	Word: 1257332, TF-IDF: 0.07642
	Word: 1261133, TF-IDF: 0.06552
Top words in document 5
	Word: 2974, TF-IDF: 0.06048
	Word: 22921, TF-IDF: 0.05859
	Word: 199259, TF-IDF: 0.02169


## sklearn 提取TF-IDF特征

In [19]:
# Drop the unused 'article' column. Rebinding (rather than inplace=True) with
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df_train = df_train.drop('article',axis=1,errors='ignore')
df_test = df_test.drop('article',axis=1,errors='ignore')

In [21]:
# Training labels taken from the 'class' column.
y_train = df_train['class'].values

In [25]:
# 2. Feature engineering
# ngram_range=(1,2): extract unigrams and bigrams.
# min_df=3: ignore terms that appear in fewer than 3 documents.
# max_df=0.9: ignore terms that appear in more than 90% of the documents.
# (min_df and max_df are independent document-frequency filters.)
# sublinear_tf=True: scale term frequency as 1 + log(tf).
# NOTE(review): the default token_pattern only keeps tokens of 2+ characters,
# so any single-digit word id in word_seg would be dropped — confirm intended.
vectorizer = TfidfVectorizer(ngram_range=(1,2),min_df=3,max_df=0.9,sublinear_tf=True)
# fit_transform learns the vocabulary/idf and vectorizes the training text in
# a single pass instead of tokenizing the training corpus twice.
x_train = vectorizer.fit_transform(df_train['word_seg'])
x_test = vectorizer.transform(df_test['word_seg'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [27]:
# 3. Save to local disk
data = (x_train,y_train,x_test)
# The with-block guarantees the file is closed even if pickle.dump raises.
with open('D:\\NLP_datasets\\daguan\\data_w_tfidf.pkl','wb') as fp:
    pickle.dump(data,fp)

In [23]:
# Print the built-in documentation for TfidfVectorizer (interactive reference
# only; nothing else in the notebook depends on this cell).
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to CountVectorizer followed by TfidfTransformer.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetch the bytes in memory.
 |  
 |      Otherwise the input is expected to be the sequence strings or
 |      bytes items are expected to be analyzed directly.
 |  
 |  encoding : string, 'utf-8' by default.
 |      If bytes or files are given to analyze, this encoding is used to
 |      decode.
 |