## Based on Context Window(Co-occurrence)

In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
with open('review.sorted.uniq.refined.tsv.text.tok') as f:
    lines = [l.strip() for l in f.read().splitlines() if l.strip()]

### Define methods

In [6]:
def get_term_frequency(document):
    term_freq = {}
    
    words = document.split()
    
    for w in words:
        term_freq[w] = 1 + (0 if term_freq.get(w) is None else term_freq[w])
        
    return term_freq

In [7]:
def get_context_counts(lines, vocab, w_size=2):
    context_cnt = defaultdict(int)
    
    for line in lines:
        words = line.split()
        
        for i, w in enumerate(words):
            if w in vocab:
                for c in words[i - w_size:i + w_size]:
                    if w != c:
                        context_cnt[(w, c)] += 1
                        
    return context_cnt

In [8]:
def get_co_occurrence_df(context_cnt, vocab):
    data = []
    
    for word1 in vocab:
        row = []
        
        for word2 in vocab:
            try:
                count = context_cnt[(word1, word2)]
            except KeyError:
                count = 0
            row.append(count)
            
        data.append(row)
        
    return pd.DataFrame(data, index=vocab, columns=vocab)

### Call methods

Count frequency of each word.

In [9]:
term_freq = pd.Series(
get_term_frequency(' '.join(lines))).sort_values(ascending=False)

term_freq

.        86303
고        49631
이        44952
하        42916
좋        34589
         ...  
ㅠㅠ이대로        1
마우           1
세무           1
음량           1
뻬뚤한          1
Length: 30084, dtype: int64

In [10]:
vertor_size = 800

In [11]:
term_freq.index[:vertor_size]

Index(['.', '고', '이', '하', '좋', '네요', '도', '에', '는', '가',
       ...
       '한쪽', '엄마', '가을', '요청', 'ㅁ', '마', '국산', '보풀', '세일', '싸구려'],
      dtype='object', length=800)

In [13]:
context_cnt = pd.Series(
get_context_counts(lines, term_freq.index[:vertor_size], w_size=4))


context_cnt

라고  비지떡     31
    ".       1
    200      2
    ml       5
    판매      16
          ... 
았   ㅍ        1
감사  ㅍㅍ       2
고   수수     106
수고  수수     212
    고수       3
Length: 1047278, dtype: int64

In [None]:
df = get_co_occurrence_df(context_cnt, term_freq.index[:vertor_size])

df