# 1. Classic Methods



Contents
1. Bag of Words (BOW)
2. TF-IDF

## Generating Count Vectors Based on BOW (Bag of Words)

In [None]:
# Mount Google Drive in this Colab runtime so files under /content/drive can be read later.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import nltk
# Download the movie_reviews corpus used throughout this section.
nltk.download('movie_reviews')
# Download the punkt tokenizer models (presumably needed for sentence splitting below — confirm).
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from nltk.corpus import movie_reviews

# Corpus overview: size, id format, labels and class balance.
all_ids = movie_reviews.fileids()  # ids of every review document
print('#review count:', len(all_ids))
print('#samples of file ids:', all_ids[:10])
print('#categories of reviews:', movie_reviews.categories())

negative_ids = movie_reviews.fileids(categories='neg')
positive_ids = movie_reviews.fileids(categories='pos')
print('#num of "neg" reviews:', len(negative_ids))
print('#num of "pos" reviews:', len(positive_ids))

# Look at the first review in three forms: raw text, sentences, words.
fileid = all_ids[0]
print('#id of the first review:', fileid)
first_raw = movie_reviews.raw(fileid)
print('#first review content:\n', first_raw[:200])  # first 200 characters only
print()
first_sentences = movie_reviews.sents(fileid)
print('#sentence tokenization result:', first_sentences[:2])  # first two tokenized sentences
first_words = movie_reviews.words(fileid)
print('#word tokenization result:', first_words[:20])  # first 20 tokens

#review count: 2000
#samples of file ids: ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
#categories of reviews: ['neg', 'pos']
#num of "neg" reviews: 1000
#num of "pos" reviews: 1000
#id of the first review: neg/cv000_29416.txt
#first review content:
 plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
w

#sentence tokenization result: [['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.'], ['they', 'get', 'into', 'an', 'accident', '.']]
#word tokenization result: ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', '

In practice, text representations are rarely implemented by hand; libraries such as Scikit-learn are used instead. Here, however, we implement it manually to gain a precise understanding of Bag of Words (BOW).

In [None]:
# One token list per review: fileids() yields every document id and words()
# returns that document's tokens, which are materialized into a plain list.
documents = [
    list(movie_reviews.words(review_id))
    for review_id in movie_reviews.fileids()
]
first_document = documents[0]
print(first_document[:50])  # first 50 tokens of the first review

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch']


In [None]:
# Corpus-wide term frequencies: token -> number of occurrences over all reviews.
word_count = {}
for doc_tokens in documents:
    for token in doc_tokens:
        if token in word_count:
            word_count[token] += 1
        else:
            word_count[token] = 1

# Vocabulary ordered from the most frequent token to the least frequent one.
sorted_features = sorted(
    word_count.keys(),
    key=lambda token: word_count[token],
    reverse=True,
)

# Show the ten most frequent tokens on a single comma-separated line.
top_ten = sorted_features[:10]
for word in top_ten:
    print(f"count of '{word}': {word_count[word]}", end=', ')

count of ',': 77717, count of 'the': 76529, count of '.': 65876, count of 'a': 38106, count of 'and': 35576, count of 'of': 34123, count of 'to': 31937, count of ''': 30585, count of 'is': 25195, count of 'in': 21822, 

In [None]:
# Download the NLTK stop-word lists; the English list is used for filtering in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords # Words generally not targeted for analysis

# A token is a run of 3 or more word characters or apostrophes. The pattern is
# written as a raw string so that "\w" reaches the regex engine verbatim instead
# of being treated as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r"[\w']{3,}")
english_stops = set(stopwords.words('english'))  # set for fast membership tests

# Rebind `documents` to the raw review text: raw() is used instead of words()
# because the regex tokenizer needs the original strings.
documents = [movie_reviews.raw(fileid) for fileid in movie_reviews.fileids()]

# Tokenize every review and drop stop words in a single pass.
tokens = [[token for token in tokenizer.tokenize(doc) if token not in english_stops] for doc in documents]

# Corpus-wide frequency of every remaining token.
word_count = {}
for text in tokens:
    for word in text:
        word_count[word] = word_count.get(word, 0) + 1

# Vocabulary sorted from most to least frequent.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)

print('num of words/features:', len(sorted_features))
for word in sorted_features[:10]:
    print(f"count of '{word}': {word_count[word]}", end=', ')

num of words/features: 43030
count of 'film': 8935, count of 'one': 5791, count of 'movie': 5538, count of 'like': 3690, count of 'even': 2564, count of 'time': 2409, count of 'good': 2407, count of 'story': 2136, count of 'would': 2084, count of 'much': 2049, 

- All of the features above could be used, but here we keep only the 1,000 most frequent ones as the final features representing each document.
- The important point here is that the feature set has an order, and this order determines the count vector values for the document.

In [None]:
# Keep the 1,000 most frequent words as the feature set; their order here
# determines the position of each word in every count vector built below.
word_features = sorted_features[:1000]

Now, we will create a function that converts the given document into a feature vector, i.e., a count vector. To verify that the function works properly, let's create input and output examples as shown below and test it.

In [None]:
def document_features(document, word_features):
    """Convert a tokenized document into a count vector.

    Args:
        document: Iterable of tokens (words) making up one document.
        word_features: Ordered collection of vocabulary words; it fixes
            the length and the column order of the returned vector.

    Returns:
        A list with one integer per entry of ``word_features``: the number
        of times that word occurs in ``document`` (0 if it never occurs).
    """
    # Pass 1: tally how often each token occurs in this document.
    frequencies = {}
    for token in document:
        if token in frequencies:
            frequencies[token] += 1
        else:
            frequencies[token] = 1

    # Pass 2: read the tallies back in vocabulary order, defaulting to 0.
    return [frequencies.get(feature, 0) for feature in word_features]

# Toy vocabulary: each word becomes one position of the count vector.
word_features_ex = ['one', 'two', 'teen', 'couples', 'solo']
# Toy document, already tokenized: 'two' occurs twice and 'couples' once.
doc_ex = ['two', 'two', 'couples']
# Prints the count of each vocabulary word, in vocabulary order.
print(document_features(doc_ex, word_features_ex))

[0, 2, 0, 1, 0]


Since it has been confirmed to work well, let's now apply it to the entire review set, and print the number of extracted features and the first 20 features from the feature set of the first review document. At this time, to know which word corresponds to the value in the vector, we will output it along with the words from word_features in the matching order.

In [None]:
# Turn every tokenized review into a count vector over the shared vocabulary
# in word_features.
feature_sets = [
    document_features(doc_tokens, word_features)
    for doc_tokens in tokens
]

# Show the first 20 (word, count) pairs of the first review so that each
# value can be matched to the word it belongs to.
first_review_vector = feature_sets[0]
for position in range(20):
    feature, count = word_features[position], first_review_vector[position]
    print(f'({feature}, {count})', end=', ')

(film, 5), (one, 3), (movie, 6), (like, 3), (even, 3), (time, 0), (good, 2), (story, 0), (would, 1), (much, 0), (also, 1), (get, 3), (character, 1), (two, 2), (well, 1), (first, 0), (characters, 1), (see, 2), (way, 3), (make, 5), 

- The words in word_features are sorted in order of frequency.
Therefore, we can see that many of the first 20 words have non-zero count values.
- Toward the end of the vector, however, we can expect more and more words to have a count of 0. To confirm this, we print the full vector and then its last 20 values below; the last 20 are all 0.

In [None]:
# Full count vector of the first review (one value per word in word_features).
print(feature_sets[0])

[5, 3, 6, 3, 3, 0, 2, 0, 1, 0, 1, 3, 1, 2, 1, 0, 1, 2, 3, 5, 1, 2, 2, 1, 2, 1, 0, 2, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 1, 0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 5, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
print(feature_sets[0][-20:])  # Last 20 values of the first review's count vector (the least frequent features)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
print(feature_sets[0][:20])  # First 20 values of the first review's count vector (the most frequent features)

[5, 3, 6, 3, 3, 0, 2, 0, 1, 0, 1, 3, 1, 2, 1, 0, 1, 2, 3, 5]


## Generating Count Vectors Using Scikit-learn

### CountVectorizer

http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction  

- If you want to represent documents based on counts and analyze them using machine learning, it's much more convenient to use Scikit-learn's text-related libraries.
- Scikit-learn supports its own tokenizer, so users do not need to tokenize separately. However, if you want to improve performance with more fine-tuning, you can define the tokenizer as a function and use it within Scikit-learn.
- In the case of Korean, you must use a separate tokenizer because morphological analysis needs to be performed with KoNLPy.

In [None]:
# Data preparation: collect the raw, untokenized text of every review with
# movie_reviews.raw(), one string per document id.
reviews = [
    movie_reviews.raw(review_id)
    for review_id in movie_reviews.fileids()
]

In [None]:
# Preview the raw text of the first five reviews.
reviews[:5]

['plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience memb

When creating a CountVectorizer object, you can use the vocabulary parameter to construct vectors using only the words present in the previously created word_features.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Alternative 1: use the default value of every parameter.
# cv = CountVectorizer()

# Used here: restrict the feature set to the previously generated word_features,
# so the vectorizer's columns follow that list.
cv = CountVectorizer(vocabulary=word_features)

# Alternative 2: do not specify the feature set, only cap the number of features.
# cv = CountVectorizer(max_features=1000)
print(cv)  # Show the arguments the object was created with

CountVectorizer(vocabulary=['film', 'one', 'movie', 'like', 'even', 'time',
                            'good', 'story', 'would', 'much', 'also', 'get',
                            'character', 'two', 'well', 'first', 'characters',
                            'see', 'way', 'make', 'life', 'really', 'films',
                            'plot', 'little', 'people', 'could', 'bad', 'scene',
                            'never', ...])


- Once the object is created, you can generate the feature set and create the count vector using fit_transform() as shown below.
- You can see that the output of get_feature_names_out() has the same words and order as word_features.


In [None]:
# Fit the vectorizer on the raw reviews and build the document-term count matrix.
reviews_cv = cv.fit_transform(reviews)
# The feature names should match word_features in both content and order.
print(cv.get_feature_names_out()[:20])
print(word_features[:20])  # Printed for comparison

['film' 'one' 'movie' 'like' 'even' 'time' 'good' 'story' 'would' 'much'
 'also' 'get' 'character' 'two' 'well' 'first' 'characters' 'see' 'way'
 'make']
['film', 'one', 'movie', 'like', 'even', 'time', 'good', 'story', 'would', 'much', 'also', 'get', 'character', 'two', 'well', 'first', 'characters', 'see', 'way', 'make']


In [None]:
# Inspect the count matrix: container type, dimensions, and the first ten
# columns of the first review.
matrix_type = type(reviews_cv)
matrix_shape = reviews_cv.shape
first_row_sample = reviews_cv[0, :10]
print('#type of count vectors:', matrix_type)
print('#shape of count vectors:', matrix_shape)
print('#sample of count vector:', first_row_sample, sep='\n')

#type of count vectors: <class 'scipy.sparse._csr.csr_matrix'>
#shape of count vectors: (2000, 1000)
#sample of count vector:
  (0, 0)	6
  (0, 1)	3
  (0, 2)	6
  (0, 3)	3
  (0, 4)	3
  (0, 6)	2
  (0, 8)	1


In [None]:
# Display the matrix summary (dimensions, dtype and number of stored elements).
reviews_cv

<2000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 252984 stored elements in Compressed Sparse Row format>

In [None]:
print(feature_sets[0][:20])  # Count vector computed manually earlier
print(reviews_cv.toarray()[0, :20])  # Same 20 features of the first review from CountVectorizer
# NOTE(review): the two rows are not guaranteed to match exactly. The manual
# vectors were built with the [\w']{3,} regular-expression tokenizer, whereas
# this CountVectorizer uses scikit-learn's default tokenization, so the same
# text can be split differently (presumably around apostrophes — confirm).

[5, 3, 6, 3, 3, 0, 2, 0, 1, 0, 1, 3, 1, 2, 1, 0, 1, 2, 3, 5]
[6 3 6 3 3 0 2 0 1 0 1 3 2 2 1 0 1 2 3 5]


In [None]:
# Pair the first 20 feature names with the first review's counts and print
# them as "word:count" on a single comma-separated line.
top_feature_names = cv.get_feature_names_out()[:20]
first_review_counts = reviews_cv[0].toarray()[0, :20]
for feature, freq in zip(top_feature_names, first_review_counts):
    print(f'{feature}:{freq}', end=', ')

film:6, one:3, movie:6, like:3, even:3, time:0, good:2, story:0, would:1, much:0, also:1, get:3, character:2, two:2, well:1, first:0, characters:1, see:2, way:3, make:5, 

## Converting Korean Text into Count Vectors


In [None]:
import pandas as pd
# Load the Daum movie review CSV from the mounted Google Drive folder.
review_csv_path = '/content/drive/MyDrive/Unstructured Data Analysis - Code Practice/[Week 6] Text Representation/data/daum_movie_review.csv'
df = pd.read_csv(review_csv_path)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워
5,나는 재밌게 봄,10,2018.10.14,인피니티 워
6,0.5점은 줄 수 없냐?,0,2018.10.10,인피니티 워
7,헐..다 죽었어....나중에 앤트맨 보다가도 깜놀...,10,2018.10.08,인피니티 워
8,충격 결말,9,2018.10.06,인피니티 워
9,응집력,8,2018.10.05,인피니티 워


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Baseline for the Korean reviews: no custom tokenizer, at most 1,000 features.
daum_cv = CountVectorizer(max_features=1000)

# Fit on the review column and build the document-term matrix.
daum_DTM = daum_cv.fit_transform(df['review'])

# Show the first 100 feature names that were selected.
feature_preview = daum_cv.get_feature_names_out()[:100]
print(feature_preview)

['10점' '18' '1987' '1도' '1점' '1점도' '2시간' '2시간이' '2편' '5점' '6점' '7점' '8점'
 'cg' 'cg가' 'cg는' 'cg도' 'cg만' 'good' 'of' 'ㅋㅋ' 'ㅋㅋㅋ' 'ㅋㅋㅋㅋ' 'ㅎㅎ' 'ㅎㅎㅎ'
 'ㅜㅜ' 'ㅠㅠ' 'ㅠㅠㅠ' 'ㅡㅡ' '가는' '가는줄' '가면' '가서' '가슴' '가슴아픈' '가슴이' '가장' '가족'
 '가족과' '가족들과' '가족의' '가족이' '가지고' '간만에' '갈수록' '감독' '감독님' '감독은' '감독의' '감독이'
 '감동' '감동과' '감동도' '감동은' '감동을' '감동이' '감동입니다' '감동적' '감동적이고' '감동적인' '감사드립니다'
 '감사합니다' '감정이' '갑자기' '갔는데' '갔다가' '강철비' '강추' '강추합니다' '같고' '같네요' '같다' '같습니다'
 '같아' '같아요' '같은' '같은데' '같음' '같이' '개연성' '개연성이' '개인적으로' '거의' '겁나' '것도' '것은'
 '것을' '것이' '것이다' '겨울왕국' '결국' '결말' '결말이' '계속' '고맙습니다' '곤지암' '공포' '공포를'
 '공포영화' '관객']


In [None]:
!pip install konlpy



In [None]:
from konlpy.tag import Okt  # Okt (Open Korean Text) morphological analyzer; KoNLPy formerly exposed it as "Twitter"
twitter_tag = Okt()

# Compare three views of the analyzer's output on the second review (df.review[1]):
# all morphemes, nouns only, and (morpheme, part-of-speech) pairs.
print('# Entire morphological analysis result:', twitter_tag.morphs(df.review[1]))
print('# Extract only nouns:', twitter_tag.nouns(df.review[1]))
print('# Part-of-speech tagging result:', twitter_tag.pos(df.review[1]))

# Entire morphological analysis result: ['몰입', '할수밖에', '없다', '.', '어렵게', '생각', '할', '필요없다', '.', '내', '가', '전투', '에', '참여', '한', '듯', '손', '에', '땀', '이남', '.']
# Extract only nouns: ['몰입', '생각', '내', '전투', '참여', '듯', '손', '땀', '이남']
# Part-of-speech tagging result: [('몰입', 'Noun'), ('할수밖에', 'Verb'), ('없다', 'Adjective'), ('.', 'Punctuation'), ('어렵게', 'Adjective'), ('생각', 'Noun'), ('할', 'Verb'), ('필요없다', 'Adjective'), ('.', 'Punctuation'), ('내', 'Noun'), ('가', 'Josa'), ('전투', 'Noun'), ('에', 'Josa'), ('참여', 'Noun'), ('한', 'Determiner'), ('듯', 'Noun'), ('손', 'Noun'), ('에', 'Josa'), ('땀', 'Noun'), ('이남', 'Noun'), ('.', 'Punctuation')]


In [None]:
def my_tokenizer(doc):
    """Tokenize *doc* and keep only nouns, verbs and adjectives.

    The text is part-of-speech tagged with ``twitter_tag.pos`` and every
    token whose tag is 'Noun', 'Verb' or 'Adjective' is returned, in the
    order produced by the tagger.
    """
    kept_tags = ['Noun', 'Verb', 'Adjective']
    tagged = twitter_tag.pos(doc)
    return [morpheme for morpheme, tag in tagged if tag in kept_tags]

# Try the tokenizer on the second review (df.review[1]).
print("My tokenizer result:", my_tokenizer(df.review[1]))

My tokenizer result: ['몰입', '할수밖에', '없다', '어렵게', '생각', '할', '필요없다', '내', '전투', '참여', '듯', '손', '땀', '이남']


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Count vectorizer for the Korean reviews: tokenize with my_tokenizer and keep
# at most 1,000 features.
# token_pattern is not used once a custom tokenizer is supplied; setting it to
# None states that explicitly and avoids scikit-learn's "token_pattern will not
# be used" warning.
# If you only want to extract nouns, you can pass 'twitter_tag.nouns' as the
# tokenizer instead.
daum_cv = CountVectorizer(
    max_features=1000,
    tokenizer=my_tokenizer,
    token_pattern=None,
)

# Fit on the review column and build the document-term matrix.
daum_DTM = daum_cv.fit_transform(df.review)
# Show the first 100 feature names that were selected.
print(daum_cv.get_feature_names_out()[:100])



['가' '가는' '가는줄' '가면' '가서' '가슴' '가장' '가족' '가족영화' '가지' '가치' '각색' '간' '간다'
 '간만' '갈' '갈수록' '감' '감독' '감동' '감사' '감사합니다' '감상' '감성' '감정' '감탄' '갑자기' '갔는데'
 '갔다' '갔다가' '강' '강철' '강추' '같고' '같네요' '같다' '같습니다' '같아' '같아요' '같은' '같은데'
 '같음' '개' '개그' '개봉' '개연' '개인' '거' '거기' '거리' '거의' '걱정' '건' '건가' '건지' '걸'
 '겁니다' '것' '게' '겨울왕국' '결론' '결말' '경찰' '경험' '계속' '고' '고맙습니다' '고민' '고생' '곤지암'
 '곳' '공감' '공포' '공포영화' '과' '과거' '관' '관객' '관객수' '관람' '광주' '괜찮은' '교훈' '구성'
 '국내' '국민' '군인' '군함도' '굿' '권선' '귀신' '그' '그것' '그게' '그날' '그냥' '그닥' '그대로'
 '그때' '그래픽']


In [None]:
print(repr(daum_DTM))
# Density of the document-term matrix: stored elements divided by the total
# number of cells. Both figures are read from the matrix itself rather than
# hard-coded, so the value stays correct if the data or max_features changes.
n_docs, n_features = daum_DTM.shape
print(daum_DTM.nnz / (n_docs * n_features))

<14725x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 110800 stored elements in Compressed Sparse Row format>
0.007524617996604414


In [None]:
# List the features that actually occur in the second review (row 1),
# printed as "word : count" on a single comma-separated line.
daum_feature_names = daum_cv.get_feature_names_out()
second_review_counts = daum_DTM[1].toarray()[0]
for term, freq in zip(daum_feature_names, second_review_counts):
    if not freq > 0:
        continue  # skip features absent from this review
    print(term, ':', freq, end=', ')

내 : 1, 듯 : 1, 몰입 : 1, 생각 : 1, 손 : 1, 없다 : 1, 할 : 1, 

## Improve performance with TF-IDF

- In a count vector, frequency acts as a kind of weight. In other words, in a count vector, the higher the frequency, the more the word tends to be treated as important. However, there is one problem we haven't considered yet. If a word appears in every document, is it really an important word? (e.g., a, an, the...)

- In other words, a word that appears in every document carries little information: the more documents a word appears in, the less important it becomes. TF-IDF (Term Frequency-Inverse Document Frequency) builds this idea into the count vector by down-weighting terms according to the number of documents that contain them.

- To reuse the count matrix created earlier, use the TfidfTransformer.
- This helps save time.

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the existing count matrix with TF-IDF instead of re-reading the text.
transformer = TfidfTransformer()
reviews_tfidf = transformer.fit_transform(reviews_cv)

# First 20 values of the first review, before and after TF-IDF weighting.
first_review_counts = reviews_cv[0].toarray()[0][:20]
first_review_tfidf = reviews_tfidf[0].toarray()[0][:20]

# The TF-IDF matrix should have the same shape as the count matrix.
print('# Shape of TF-IDF matrix:', reviews_tfidf.shape)
print('# 20 count scores of the first review:', first_review_counts)
print('# 20 TF-IDF scores of the first review:', first_review_tfidf)

# Shape of TF-IDF matrix: (2000, 1000)
# 20 count scores of the first review: [6 3 6 3 3 0 2 0 1 0 1 3 2 2 1 0 1 2 3 5]
# 20 TF-IDF scores of the first review: [0.13556199 0.06700076 0.14998642 0.0772298  0.08608998 0.
 0.0609124  0.         0.03126552 0.         0.03242315 0.09567082
 0.06575035 0.06518293 0.03225625 0.         0.0345017  0.06863314
 0.10042383 0.16727495]


* If you want to create a TF-IDF matrix from the beginning, you can use TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF matrix directly from the raw reviews, restricted to the
# words in word_features.
# NOTE(review): `tf` is also the conventional alias for TensorFlow; consider a
# more specific name if TensorFlow is ever imported in this notebook.
tf = TfidfVectorizer(vocabulary=word_features)
reviews_tf = tf.fit_transform(reviews)

In [None]:
# Display the matrix summary (dimensions, dtype and number of stored elements).
reviews_tf

<2000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 252984 stored elements in Compressed Sparse Row format>