# 使用BOW猜測文章大意

## 載入相關套件

In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

## 載入資料

In [2]:
# Load the whole article into memory as a single string.
# Open read-only ('r'): nothing is written back, and 'r+' would needlessly
# require write permission and fail on a read-only file.
with open('./data/news.txt', 'r', encoding='UTF-8') as f:
    text = f.read()
# Bare expression so the notebook displays the loaded text as the cell output.
text

'Last summer, I came across the Bodega Project: a series of articles from Electric Literature profiling convenience stores across New York City. This intimate and colourful portrayal of the city’s inhabitants correctly recognises convenience stores as pillars of urban life. The project’s premise immediately resonated with me because of my experience living in South Korea. Convenience stores don’t have as long of a tradition there as they do in New York (one reason is that Korean cities modernised much later than in the US). Nonetheless, they’re an equally important part of life in the city. I also realised that the absence of (good) convenience stores is why, upon my return, Europe’s cities suddenly seemed so dull by comparison.\nIn Europe, we don’t really do convenience stores — or at least, we don’t do them right. Even in large cities, once the stores and supermarkets close — which is typically absurdly early in many European countries — gas stations are often the only option for tho

## BOW 轉換

In [5]:
# Fit a bag-of-words model on the article; the corpus holds exactly one document.
documents = [text]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Vocabulary learned from the text (echoed by the notebook as the cell output).
vectorizer.get_feature_names_out()

array(['000', '100', '11', '19', '23', '24', '35', '49', '80', '90',
       'abbreviation', 'about', 'above', 'absence', 'absurdly',
       'abundance', 'access', 'across', 'activities', 'activity', 'add',
       'agreeable', 'air', 'aires', 'alcohol', 'alive', 'all', 'alleyway',
       'allowed', 'almost', 'alone', 'also', 'am', 'among', 'an', 'and',
       'another', 'apartment', 'apartments', 'applies', 'apt', 'arcane',
       'are', 'area', 'articles', 'as', 'at', 'atmosphere', 'author',
       'available', 'avenue', 'average', 'away', 'ball', 'barbecue',
       'basement', 'bathroom', 'be', 'because', 'become', 'been', 'beer',
       'before', 'begin', 'behind', 'belong', 'berlin', 'bibimbap',
       'bizarre', 'bodega', 'bodegas', 'bootleg', 'bound', 'brand',
       'brands', 'breaks', 'brimming', 'buenos', 'building', 'buildings',
       'busier', 'but', 'buying', 'by', 'called', 'came', 'can', 'canned',
       'cater', 'cc', 'central', 'centres', 'chain', 'chains', 'chairs',
  

## 單字對應的出現次數

In [4]:
# Display the BOW matrix as a dense array: the occurrence count of each
# vocabulary term, in the same order as the feature names.
X.toarray()

array([[ 2,  1,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  1,
         1,  3,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  1,  4,
         2,  1,  7, 28,  2,  1,  2,  1,  1,  1, 16,  1,  1, 13,  9,  1,
         1,  2,  1,  4,  1,  1,  1,  1,  1,  2,  3,  1,  3,  1,  1,  1,
         1,  1,  3,  1,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,
         1,  1,  1, 10,  1,  1,  7,  1,  1,  1,  2,  1,  2,  2,  1,  1,
         1,  1,  1,  1,  6,  7,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
         1,  3,  1,  1,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1, 14,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  1,  1,  2,  1,
         1,  1,  1,  1,  1,  1,  5,  1,  1,  7,  1,  1,  1,  2,  1,  1,
         2,  1,  2,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  3,
         2,  2,  8,  3,  1,  3,  1,  1,  3,  1,  1,  1,  1,  1,  1,  1,
         2,  1,  2,  1,  1,  1,  4,  1,  1,  1,  4,  1, 13,  1,  1,  1,
         5,  1,  2,  1,  1,  2,  1,  1,  2,  1,  1,  1,  1,  1, 

## 找出較常出現的單字

In [8]:
import collections

MAX_FEATURES = 20

# Pair every vocabulary term with its count in the single document (row 0).
# .tolist() converts the NumPy integers to plain Python ints, so the printed
# tuples read ('the', 61) rather than ('the', np.int64(61)) on NumPy >= 2.0.
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()[0].tolist()
word_freqs = collections.Counter(dict(zip(vocabulary, counts)))

# Report the MAX_FEATURES most frequent words, highest count first.
print(f'前{MAX_FEATURES}名單字:{word_freqs.most_common(MAX_FEATURES)}')

前20名單字:[('the', 61), ('of', 40), ('to', 38), ('in', 32), ('and', 28), ('is', 19), ('or', 17), ('are', 16), ('stores', 15), ('they', 15), ('this', 15), ('convenience', 14), ('as', 13), ('for', 13), ('by', 10), ('it', 10), ('some', 10), ('at', 9), ('that', 9), ('with', 9)]


## 考慮停用詞(Stop words)

In [9]:
MAX_FEATURES = 20

# Rebuild the bag-of-words model, this time dropping scikit-learn's built-in
# English stop-word list so function words do not dominate the ranking.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([text])

# Pair every remaining term with its count in the single document (row 0).
# .tolist() converts the NumPy integers to plain Python ints, so the printed
# tuples read ('stores', 15) rather than ('stores', np.int64(15)) on NumPy >= 2.0.
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()[0].tolist()
word_freqs = collections.Counter(dict(zip(vocabulary, counts)))

# Report the MAX_FEATURES most frequent words, highest count first.
print(f'前{MAX_FEATURES}名單字:{word_freqs.most_common(MAX_FEATURES)}')

前20名單字:[('stores', 15), ('convenience', 14), ('seoul', 8), ('city', 7), ('don', 7), ('cities', 6), ('korea', 6), ('korean', 6), ('just', 5), ('night', 5), ('people', 5), ('average', 4), ('food', 4), ('like', 4), ('new', 4), ('outside', 4), ('store', 4), ('summer', 4), ('11', 3), ('berlin', 3)]


## 詞形還原(Lemmatization)

In [12]:
import re

# Crude hand-written lemmatization: lower-case the text, then fold two
# inflected forms into their base word. Whole-word matches (\b...\b) are used
# so that longer words containing the same letters (e.g. 'koreans',
# 'bookstores') are not rewritten into non-words.
text = text.lower()
text = re.sub(r'\bkorean\b', 'korea', text)
text = re.sub(r'\bstores\b', 'store', text)

MAX_FEATURES = 20

# Rebuild the bag-of-words model on the normalized text, without English stop words.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform([text])

# Pair every term with its count in the single document (row 0).
# .tolist() converts the NumPy integers to plain Python ints, so the printed
# tuples read ('store', 19) rather than ('store', np.int64(19)) on NumPy >= 2.0.
vocabulary = vectorizer.get_feature_names_out()
counts = X.toarray()[0].tolist()
word_freqs = collections.Counter(dict(zip(vocabulary, counts)))

# Report the MAX_FEATURES most frequent words, highest count first.
print(f'前{MAX_FEATURES}名單字:{word_freqs.most_common(MAX_FEATURES)}')

前20名單字:[('store', 19), ('convenience', 14), ('korea', 12), ('seoul', 8), ('city', 7), ('don', 7), ('cities', 6), ('just', 5), ('night', 5), ('people', 5), ('average', 4), ('food', 4), ('like', 4), ('new', 4), ('outside', 4), ('summer', 4), ('11', 3), ('berlin', 3), ('comes', 3), ('especially', 3)]
