In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [2]:
data= fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
print(data[0])
# convert the text to a tf-idf weighted term-document matrix
vectorizer = TfidfVectorizer(max_features=2000, min_df=10, stop_words='english')
X = vectorizer.fit_transform(data)
idx_to_word = np.array(vectorizer.get_feature_names())

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [3]:
# apply NMF
nmf = NMF(n_components=20, solver="mu")
W = nmf.fit_transform(X)
H = nmf.components_
 
# print the topics
for i, topic in enumerate(H):
    print("Topic {}: {}".format(i + 1, ","
                                .join([str(x) for x in idx_to_word[topic.argsort()[-10:]]])))



Topic 1: going,way,really,say,said,right,did,good,people,time
Topic 2: information,appreciated,email,help,info,looking,hi,advance,mail,thanks
Topic 3: lord,church,christians,christian,believe,faith,christ,bible,jesus,god
Topic 4: algorithm,public,escrow,use,government,keys,clipper,encryption,chip,key
Topic 5: problem,cd,floppy,controller,ide,hard,drives,disk,scsi,drive
Topic 6: 20,50,price,condition,offer,shipping,10,new,sale,00
Topic 7: ms,using,running,version,use,program,files,dos,file,windows
Topic 8: teams,win,hockey,play,players,season,year,games,team,game
Topic 9: pub,cc,ftp,university,cs,soon,banks,gordon,pitt,edu
Topic 10: new,oil,speed,miles,dealer,good,engine,bike,cars,car
Topic 11: ram,color,bus,driver,vga,cards,drivers,monitor,video,card
Topic 12: ftp,new,appreciated,doesn,help,program,mean,anybody,know,does
Topic 13: things,let,sorry,pretty,want,need,people,know,think,don
Topic 14: gov,moon,earth,data,program,orbit,launch,shuttle,nasa,space
Topic 15: ll,listen,wanted,thou

## 문서 유사도 검사

In [4]:
# 20개의 뉴스 그룹, 20개의 토픽으로 18000개의 포스팅
# 이메일 텍스트 형식, 제목, 날짜등의 헤더 정보와 이메일 내용으로 구성
from sklearn.datasets import fetch_20newsgroups
import io
import pandas as pd

newsgroups_train = fetch_20newsgroups(subset='train')


In [5]:

def parseDocument(data):
    buf = io.StringIO(data)
    line=buf.readline()
    data=[]
    subject=''
    while line:
        if(line.startswith('Subject:')):
            subject = line[8:].strip()
        elif (line.startswith('Lines:')):
            lines = line[6:]
            while line :
                line = buf.readline()
                data.append(line)
        line=buf.readline()
    text = ''.join(data)
    
    return subject,text


In [6]:
textlist = []
df = pd.DataFrame(columns=['text'])
for data in newsgroups_train.data[0:100]:
    subject,text = parseDocument(data)
    df.loc[subject]=text
df.head()


Unnamed: 0,text
WHAT car is this!?,\n I was wondering if anyone out there could e...
SI Clock Poll - Final Call,NNTP-Posting-Host: carson.u.washington.edu\n\n...
PB questions...,"\nwell folks, my mac plus finally gave up the ..."
Re: Weitek P9000 ?,Distribution: world\nNNTP-Posting-Host: amber....
Re: Shuttle Launch Question,"\nFrom article <C5owCB.n3p@world.std.com>, by ..."


### Tfidf를 이용한 단어 벡터화

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'].tolist())
print(vectors.shape)


(100, 6194)


### NMF를 이용하여 본문에서 특성 추출

In [8]:
from sklearn.decomposition import NMF

vector_array = vectors.toarray()
nmf = NMF(n_components=40)
nmf.fit(vector_array)
features = nmf.transform(vector_array)



In [9]:
print(features.shape)
print(features[0])

(100, 40)
[0.04093039 0.         0.         0.00457672 0.         0.
 0.         0.00266823 0.09687878 0.00592236 0.         0.00780763
 0.00275928 0.00885917 0.         0.         0.         0.
 0.00744177 0.00077478 0.         0.09812755 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.00853249 0.         0.         0.
 0.         0.12439302 0.02854228 0.        ]


### Feature 정규화 -> Normalizer

In [10]:
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
norm_features=normalizer.fit_transform(features)

print(norm_features[0:2])

[[0.21188105 0.         0.         0.02369194 0.         0.
  0.         0.0138124  0.5015045  0.03065777 0.         0.04041715
  0.01428375 0.04586052 0.         0.         0.         0.
  0.0385232  0.00401072 0.         0.50796893 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.04416945 0.         0.         0.
  0.         0.64393525 0.14775251 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         1.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]]


In [11]:
df_features = pd.DataFrame(norm_features,index=df.index.tolist())
df_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
WHAT car is this!?,0.211881,0.000000,0.000000,0.023692,0.000000,0.000000,0.000000,0.013812,0.501505,0.030658,...,0.000000,0.000000,0.044169,0.000000,0.000000,0.000000,0.000000,0.643935,0.147753,0.0
SI Clock Poll - Final Call,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
PB questions...,0.855386,0.000000,0.000000,0.000000,0.000000,0.004546,0.000000,0.059222,0.120628,0.000000,...,0.000000,0.002008,0.365280,0.044567,0.030012,0.069580,0.000000,0.052410,0.025394,0.0
Re: Weitek P9000 ?,0.000000,0.000000,0.067072,0.000000,0.057261,0.000000,0.000000,0.004148,0.016540,0.010622,...,0.049042,0.048922,0.016858,0.399941,0.000000,0.188901,0.316544,0.000000,0.000000,0.0
Re: Shuttle Launch Question,0.000000,0.000000,0.000000,0.040227,0.027265,0.000000,0.261731,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.046432,0.105000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Re: bikes with big dogs,0.000000,0.008481,0.001763,0.031676,0.000187,0.000000,0.002459,0.007064,0.014246,0.006684,...,0.000000,0.000000,0.000000,0.004033,0.000000,0.997471,0.000000,0.000000,0.028344,0.0
"Reserve officers say demographics ignored in nominations to close naval, marine reserve centers",0.213273,0.000000,0.000000,0.000000,0.000000,0.000000,0.030509,0.000000,0.000000,0.000000,...,0.169218,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
Re: waiting for a specific event/callback,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
Re: free moral agency,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


### 문서 유사도 계산 (코사인 유사도)

In [12]:
article = df_features.loc['Terminal for sale'] # “Terminal for sale” 라는 문서의 유사한 문서 검색
similarities=df_features.dot(article) # 각 문서의 특성 행렬과 article 문서의 특성 행렬 곱하기
top=similarities.nlargest() # 큰 값순으로 정렬

KeyError: 'Terminal for sale'

In [None]:
texts = df.loc[top.index]['text'].tolist()
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    #print(text+'\n')
    i = i+1