<a href="https://colab.research.google.com/github/maysis175/text-processing/blob/master/text_processing_task2A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections

In [62]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("reuters")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
from nltk.corpus import reuters as corpus

In [64]:
for n,item in enumerate(corpus.words(corpus.fileids()[0])[:300]):
    print(item, end=" ")
    if (n%25) ==24:
      print(" ")

ASIAN EXPORTERS FEAR DAMAGE FROM U . S .- JAPAN RIFT Mounting trade friction between the U . S . And Japan has raised fears  
among many of Asia ' s exporting nations that the row could inflict far - reaching economic damage , businessmen and officials said . They  
told Reuter correspondents in Asian capitals a U . S . Move against Japan might boost protectionist sentiment in the U . S . And  
lead to curbs on American imports of their products . But some exporters said that while the conflict would hurt them in the long -  
run , in the short - term Tokyo ' s loss might be their gain . The U . S . Has said it will  
impose 300 mln dlrs of tariffs on imports of Japanese electronics goods on April 17 , in retaliation for Japan ' s alleged failure to  
stick to a pact not to sell semiconductors on world markets at below cost . Unofficial Japanese estimates put the impact of the tariffs at  
10 billion dlrs and spokesmen for major electronics firms said they would virtually halt exports 

In [65]:
len(corpus.fileids())

10788

In [66]:
docs=[corpus.words(fileid) for fileid in corpus.fileids()]

# k = 100
# docs=[corpus.words(fileid) for fileid in corpus.fileids()[:k]]

print(docs[:5])
print("num of docs:", len(docs))

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', ...], ['CHINA', 'DAILY', 'SAYS', 'VERMIN', 'EAT', '7', '-', ...], ['JAPAN', 'TO', 'REVISE', 'LONG', '-', 'TERM', ...], ['THAI', 'TRADE', 'DEFICIT', 'WIDENS', 'IN', 'FIRST', ...], ['INDONESIA', 'SEES', 'CPO', 'PRICE', 'RISING', ...]]
num of docs: 10788


### 前処理

In [67]:
# stopwordsリストの作成
en_stop = nltk.corpus.stopwords.words('english')

In [68]:
en_stop= ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<"]                  \
         +["0","1","2","3","4","5","6","7","8","9","10","11","12","86","1986","1987","000"]                                                      \
         +["said","say","u","v","mln","ct","net","dlrs","tonne","pct","shr","nil","company","lt","share","year","billion","price"]          \
         +en_stop

In [69]:
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

# Cleaning
import re

def cleaning_text(text): 
    pattern = "[0-9]+"
    text = re.sub(pattern, '', text)
    return text

def preprocess_word(word, stopwordset):
    word = word.lower()

    if word in [",","."]:
        return None

    if word in stopwordset:
        return None

    word = cleaning_text(word)

    lemma = wn.morphy(word)
    if lemma is None:
        return word
    elif lemma in stopwordset: #lemmatizeしたものがstopwordである可能性がある
        return None
    else:
        return lemma
    

def preprocess_document(document):
    document=[preprocess_word(w, en_stop) for w in document]
    document=[w for w in document if w is not None]
    return document

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]

In [70]:
# 前処理前
print(docs[0][:25]) 

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears']


In [71]:
# 前処理後
print(preprocess_documents(docs)[0][:25])

['asian', 'exporter', 'fear', 'damage', 'japan', 'rift', 'mounting', 'trade', 'friction', 'japan', 'raise', 'fear', 'among', 'many', 'asia', 'exporting', 'nation', 'row', 'could', 'inflict', 'far', 'reaching', 'economic', 'damage', 'businessmen']


### クラスタリング

In [72]:
pre_docs=preprocess_documents(docs)
pre_docs=[" ".join(doc) for doc in pre_docs]
print(pre_docs[0])

vectorizer = TfidfVectorizer(max_features=200, token_pattern=u'(?u)\\b\\w+\\b' )

tf_idf = vectorizer.fit_transform(pre_docs)



#### K-means

In [73]:
num_clusters = 8
km = KMeans(n_clusters=num_clusters, random_state = 0)

clusters = km.fit_predict(tf_idf)

In [74]:
cls_num = [0 for i in range(num_clusters)]
cls_list = []
for doc, cls in zip(pre_docs, clusters):
    cls_list.append(cls)
    cls_num[cls] = cls_num[cls] + 1
    print(cls,doc)

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
7 coast savings csa talks buying bank coast savings loan association talks federal savings loan insurance corp acquisition central savings loan association san diego central operate  branch management guidance fslic since may  coast acquisition would give entry san joaquin valley market besides strengthening presence san diego los angeles orange county area
1 bramall acquire gelco  c bramall plc statement accompany annual result propose acquire gelco k  part cost meet issue  new ordinary bramall place p acquisition satisfy initial payment  cash payment  maximum  payment make profits achieve gelco ending july  reach certain level bramall trading p lower p
5 belgium launch bond gold warrant kingdom belgium launching  swiss franc seven note warrant attach buy gold lead mananger credit suisse note coupon par payment due april  final maturity april    franc note carry  warrant two warrant require allow holder buy  gramme gold  franc entire life 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [75]:
print(set(cls_list))
print(cls_num)

{0, 1, 2, 3, 4, 5, 6, 7}
[910, 2350, 1333, 616, 1185, 2839, 716, 839]
