In [1]:
import pandas as pd 
from matplotlib import pyplot as plt
import nltk 
# Download the 'punkt' resource for NLTK; presumably needed by sent_tokenize, which is used further down.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/not_joon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Location of the raw reviews CSV (absolute path on the author's machine).
PATH = r'/Users/not_joon/NLP_exercise/data/reviews/reviews.csv'

# Read the file, then discard the first nine columns by position and keep
# whatever remains.
reviews = pd.read_csv(PATH)
reviews = reviews.drop(columns=reviews.columns[list(range(9))])

In [3]:
# Preview the first ten rows of the frame.
reviews.head(10)

Unnamed: 0,Text
0,I have bought several of the Vitality canned d...
1,Product arrived labeled as Jumbo Salted Peanut...
2,This is a confection that has been around a fe...
3,If you are looking for the secret ingredient i...
4,Great taffy at a great price. There was a wid...
5,I got a wild hair for taffy and ordered this f...
6,This saltwater taffy had great flavors and was...
7,This taffy is so good. It is very soft and ch...
8,Right now I'm mostly just sprouting this so my...
9,This is a very healthy dog food. Good for thei...


In [4]:
from nltk.tokenize import sent_tokenize

def split_sentences(reviews, tokenizer=None):
    """
    Split every review into a list of cleaned sentences, in place.

    Each element of ``reviews`` is replaced by the list of its sentences.
    Sentences are stripped of surrounding whitespace, and sentences that are
    empty after stripping are dropped.

    Parameters
    ----------
    reviews : list
        Mutable list of review strings. It is modified in place.
    tokenizer : callable, optional
        Function mapping one string to a list of sentence strings.
        Defaults to ``sent_tokenize`` imported from ``nltk.tokenize``.

    Returns
    -------
    None
        The result is written back into ``reviews``.
    """
    if tokenizer is None:
        tokenizer = sent_tokenize

    for i, review in enumerate(reviews):
        stripped = (sent.strip() for sent in tokenizer(review))
        # The emptiness test must look at the *stripped* text; checking the
        # raw sentence would let whitespace-only sentences through as ''.
        reviews[i] = [sent for sent in stripped if sent]

In [5]:
# Copy the review texts into a plain Python list, then replace each text
# with its list of sentences (split_sentences works in place).
rev_list = [text for text in reviews['Text']]
split_sentences(rev_list)

In [None]:
# Store the per-review sentence lists held in rev_list as a new column.
reviews['sent_tokens'] = rev_list

In [None]:
# Keep only reviews with more than `find_length` sentences.
find_length = 5

reviews['length'] = reviews['sent_tokens'].map(len)
reviews = reviews.loc[reviews['length'] > find_length]

reviews.shape

In [None]:
# Display the frame after the sentence-count filter.
reviews

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Limit the vocabulary built from the reviews to at most 5000 words
MAX_FEATURES = 5000
## Every tokenised review is padded to this length
maxlen = 200

list_sentences_train = reviews['Text']

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

In [None]:
import gc 
import re
import string
import numpy as np

def load_embedding_matrix(file_name: str, path=None) -> dict:
    """
    Load pre-trained GloVe word vectors into a dictionary.

    Parameters
    ----------
    file_name : str
        Name of the embedding to load. Only ``'glove'`` is supported.
    path : str, optional
        Location of the GloVe text file: one word per line followed by its
        space-separated coefficients. Defaults to the hard-coded
        ``glove.twitter.27B.25d.txt`` location used by this notebook.

    Returns
    -------
    dict
        Maps each word to a float32 NumPy array of its coefficients.

    Raises
    ------
    ValueError
        If ``file_name`` is not ``'glove'``.
    """
    EMBEDDING = r'/Users/not_joon/NLP_exercise/data/glove/glove.twitter.27B.25d.txt'

    # Fail with a clear message instead of reaching the return statement
    # with the result variable never assigned.
    if file_name != 'glove':
        raise ValueError(
            f"unsupported embedding {file_name!r}; only 'glove' is available"
        )

    if path is None:
        path = EMBEDDING

    embeddings_idx = dict()

    # The context manager closes the file even if a line fails to parse.
    with open(path, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank line carries no word; skip it.
                continue
            embeddings_idx[values[0]] = np.asarray(values[1:], dtype='float32')

    print(f'loaded {len(embeddings_idx)} word vectors')

    gc.collect()
    return embeddings_idx



def calculate_sentence_embedding(word_list, emb_idx, embed_size=25):
    """
    Compute a sentence embedding as the mean of its word embeddings.

    Parameters
    ----------
    word_list : iterable of str
        Words of one sentence.
    emb_idx : dict
        Maps a word to its embedding vector.
    embed_size : int, optional
        Expected length of each word vector (default 25). Vectors of any
        other length are ignored.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``embed_size``: the element-wise mean of the
        usable word vectors. If no word has a usable vector, a zero vector
        is returned, so callers always receive a vector rather than the
        scalar NaN that averaging an empty array would produce.
    """
    vectors = []

    for word in word_list:
        vec = emb_idx.get(word)
        # Ignore unknown words and vectors of unexpected length.
        if vec is not None and len(vec) == embed_size:
            vectors.append(vec)

    if not vectors:
        return np.zeros(embed_size, dtype='float32')

    return np.mean(np.array(vectors), axis=0)



def get_sent_embedding(get_list, embeddings=None):
    """
    Preprocess sentences and compute one embedding per usable sentence.

    Each sentence is lower-cased, non-word characters are replaced by
    spaces, and the text is split into words. A sentence is skipped when:

    * it has no words at all (a "sentence removed" message is printed),
    * it has two or fewer words after punctuation-only tokens are dropped,
    * none of its words is in the embedding table, or
    * its embedding is not a finite 1-D vector.

    Parameters
    ----------
    get_list : iterable of str
        Sentences to embed.
    embeddings : dict, optional
        Maps a word to its vector. Defaults to the notebook-level
        ``emb_idx`` table.

    Returns
    -------
    numpy.ndarray
        One row per kept sentence, in input order. Because sentences can be
        skipped, row ``k`` does not necessarily correspond to
        ``get_list[k]``. The array is empty when nothing is kept.
    """
    if embeddings is None:
        # Resolved at call time from the notebook namespace.
        embeddings = emb_idx

    sent_emb = []

    for sentence in get_list:
        sentence = sentence.lower()
        words = re.sub(r"[^\w]", " ", sentence).split()

        if len(words) == 0:
            print(f"sentence removed: {sentence}")
            continue

        # Build a new list rather than removing items while iterating,
        # which would skip the element following each removal.
        words = [
            w for w in words
            if not all(ch in string.punctuation for ch in w)
        ]
        if len(words) <= 2:
            continue

        # No known word means there is nothing meaningful to average.
        if not any(w in embeddings for w in words):
            continue

        vec = np.asarray(calculate_sentence_embedding(words, embeddings))
        if vec.ndim != 1 or not np.all(np.isfinite(vec)):
            continue

        sent_emb.append(list(vec))

    return np.array(sent_emb)

In [None]:
# Load the GloVe word-vector lookup table into a notebook-level variable.
emb_idx = load_embedding_matrix('glove')

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

# Summarise at most the first 5000 reviews; never more than are available
# after the sentence-count filter.
size = min(5000, len(reviews))
summary = [None] * size

for r in range(size):
    review = reviews['sent_tokens'].iloc[r]

    # get_sent_embedding silently drops some sentences, so embed one
    # sentence at a time and remember which ones produced a vector. This
    # keeps row k of `encoding` aligned with kept_sentences[k].
    kept_sentences = []
    vectors = []
    for sentence in review:
        sentence_emb = get_sent_embedding([sentence])
        if len(sentence_emb) > 0:
            kept_sentences.append(sentence)
            vectors.append(sentence_emb[0])

    if len(vectors) > 0:
        encoding = np.array(vectors)

        # Number of summary sentences: the square root of the number of
        # embedded sentences, rounded up.
        clusters = int(np.ceil(len(encoding) ** 0.5))
        kmeans = KMeans(n_clusters=clusters, n_init=10, random_state=42)
        kmeans = kmeans.fit(encoding)

        # Mean sentence position of each cluster, used to order the
        # representatives as they appear in the review.
        avg = [
            np.mean(np.where(kmeans.labels_ == j)[0])
            for j in range(clusters)
        ]

        # For every cluster centre, the index of the nearest sentence.
        closest, _ = pairwise_distances_argmin_min(
            kmeans.cluster_centers_,
            encoding
        )

        ordering = sorted(range(clusters), key=lambda k: avg[k])
        # Index the sentence list, not the DataFrame.
        summary[r] = ' '.join(kept_sentences[closest[idx]] for idx in ordering)

        print(f'Done for reviews # = {r}')

    else:
        print("not valid")

# Work on an explicit copy so the new column is not assigned onto a slice
# of the filtered frame.
reviews = reviews.iloc[:size].copy()
reviews['PredictedSummary'] = summary
reviews[['Text', 'PredictedSummary']].to_csv('top_5000_summary.csv')