## Tokenize

In [84]:
from nltk.tokenize import TreebankWordTokenizer
# Shared tokenizer instance, built once and reused by tokenize().
tokenizer = TreebankWordTokenizer()


def tokenize(doc):
    """Split *doc* into tokens using the module-level TreebankWordTokenizer."""
    tokens = tokenizer.tokenize(doc)
    return tokens

## Convert to lowercase

In [66]:
# function

def to_small_letter(word: str):
    """Return *word* converted to lowercase."""
    lowered = word.lower()
    return lowered

## 특수문자 및 외국어 제외

* 소문자 알파벳(a-z)과, 의미가 담겨 있을 가능성이 있는 `, ', . 를 제외한 모든 문자를 제거한다.
* 알파벳과 비슷하게 생겼지만 알파벳이 아닌 문자도 삭제된다.

In [67]:
# function

import re
# Matches any single character that is NOT a lowercase a-z letter, '.', '`' or "'".
pattern = re.compile("[^a-z.`']")


def except_non_english(pattern, word):
    """Return *word* with every match of the compiled *pattern* deleted.

    With the module-level ``pattern`` this keeps only lowercase a-z letters,
    dots, backticks and apostrophes; uppercase letters, digits and any other
    characters are removed.
    """
    cleaned = pattern.sub('', word)
    return cleaned


## Trimming (양 끝의 구두점과 공백 제거)

* Ph.D 등 가운데에 있는 점은 의미가 있는 경우가 많다. 따라서 끝의 점만 삭제한다.

In [68]:
def trim(word: str):
    """Strip dots and spaces from both ends of *word*, keeping inner dots.

    Dots inside a token (e.g. ``ph.d``) are preserved; only leading and
    trailing ones are removed.

    Both characters are stripped in a single pass so that interleaved
    dots and spaces at the ends (e.g. ``"word. "`` or ``". word ."``) are
    removed completely. Stripping dots first and spaces second would leave
    a dot behind whenever a space sits outside it.
    """
    return word.strip('. ')

## 불용어 제거

In [101]:
from nltk.corpus import stopwords
def remove_stopwords(words: [str], custom_stopwords: set):
    """Return the items of *words* that are not stopwords.

    The stopword set is NLTK's English stopword list combined with
    *custom_stopwords*.

    Args:
        words: tokens to filter; order and duplicates are preserved.
        custom_stopwords: additional stopwords. Any iterable of strings is
            accepted (set, frozenset, list, ...), so a list loaded from JSON
            can be passed directly.

    Returns:
        A new list containing only the tokens that are not stopwords.
    """
    # .union() accepts any iterable, whereas the ``|`` operator raises
    # TypeError when the right-hand side is not a set/frozenset.
    stop_words = set(stopwords.words('english')).union(custom_stopwords)
    return [w for w in words if w not in stop_words]

## spelling 교정

In [70]:
from spellchecker import SpellChecker

# Shared spell checker instance, built once and reused by correction().
spell = SpellChecker()


def correction(word):
    """Return the spell checker's suggested correction for *word*.

    Falls back to *word* itself when the checker has no suggestion.
    NOTE(review): recent pyspellchecker releases return ``None`` from
    ``correction()`` when no candidate is found; without the fallback a
    ``None`` would flow into the later ``len()`` / lemmatization steps.
    """
    corrected = spell.correction(word)
    return word if corrected is None else corrected

## 품사 태깅 + Lemmatization

* 품사 태깅과 Lemmatization 은 떼어놓을 수 없는 관계
* 품사를 기반으로 lemmatize 하기 때문


In [71]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# First letter of a Treebank tag -> single-letter WordNet POS code.
_WORDNET_POS_BY_PREFIX = {
    'J': 'a',
    'V': 'v',
    'N': 'n',
    'P': 'n',
    'R': 'r',
}


def get_wordnet_pos(treebank_tag):
    """Translate a Treebank POS tag into a WordNet POS code.

    Tags starting with J, V, N, P or R map to 'a', 'v', 'n', 'n' and 'r'
    respectively. Any other tag (including the empty string) yields ''.
    """
    # Slicing (rather than indexing) keeps the empty string safe.
    prefix = treebank_tag[:1]
    return _WORDNET_POS_BY_PREFIX.get(prefix, '')

# Shared lemmatizer instance, built once and reused by lemmatize_with_pos().
n = WordNetLemmatizer()


def lemmatize_with_pos(words):
    """POS-tag *words* and lemmatize each one according to its tag.

    Each word is tagged with ``pos_tag``; the Treebank tag is translated by
    ``get_wordnet_pos``. Words whose tag has no WordNet equivalent (empty
    string) are passed through unchanged.

    Args:
        words: list of tokens to lemmatize.

    Returns:
        A list of the same length with each token replaced by its lemma
        where a WordNet POS was available.
    """
    lemmatized = []
    # Use the directly imported ``pos_tag``: only ``from nltk import pos_tag``
    # is imported here, so the ``nltk.`` prefix is not a bound name.
    for word, treebank_tag in pos_tag(words):
        wordnet_pos = get_wordnet_pos(treebank_tag)
        if wordnet_pos != '':
            lemmatized.append(n.lemmatize(word, wordnet_pos))
        else:
            lemmatized.append(word)

    return lemmatized

In [72]:
import pandas as pd

In [87]:
def map(f, iter):
    """Apply *f* to every element of *iter* and return the results as a list.

    NOTE: this intentionally eager helper shadows the builtin ``map`` (and
    its ``iter`` parameter shadows the builtin ``iter``) within this module.
    """
    results = []
    for element in iter:
        results.append(f(element))
    return results

import re
# Characters other than lowercase a-z, '.', '`' and "'" are dropped from tokens.
pattern = re.compile("[^a-z.`']")

# Extra stopwords passed to remove_stopwords() in addition to its own list.
custom_stopwords = {'example'}


def preprocessing(doc):
    """Run the full cleaning pipeline on one document and return its tokens.

    Steps, in order: tokenize, lowercase, drop characters matched by
    ``pattern``, trim, discard tokens of length <= 2, remove stopwords,
    spell-correct, lemmatize, and discard tokens of length <= 2 again.
    """
    # Per-token normalisation: lowercase -> character filter -> trim.
    cleaned = [
        trim(except_non_english(pattern, to_small_letter(token)))
        for token in tokenize(doc)
    ]
    candidates = [token for token in cleaned if len(token) > 2]
    kept = remove_stopwords(candidates, custom_stopwords)
    corrected = [correction(token) for token in kept]
    lemmas = lemmatize_with_pos(corrected)
    # Lemmatization can shorten tokens, so the length filter is applied again.
    return [lemma for lemma in lemmas if len(lemma) > 2]

import pandas as pd
# Load the review dataset from a relative path.
df = pd.read_csv('../top_90_by_gender.csv')

# Preview: run the preprocessing pipeline on the first 20 'review' entries.
# (This is pandas' Series.map, not the list-returning map() helper.)
df['review'][:20].map(preprocessing)

0     [get, sample, today, year, old, daughter, thin...
1     [first, impression, test, blotter, hear, alien...
2     [perfume, reminds, best, friend, actually, per...
3     [imagine, trip, foot, fall, face, first, jasmi...
4     [gorgeous, gorgeous, blend, love, scent, fan, ...
5     [test, recent, formulation, ala, shell, former...
6     [last, six, month, i've, hear, good, bad, revi...
7     [honestly, not, smelt, say, safe, blind, buy, ...
8     [i've, recently, discover, perfume, find, new,...
9     [vote, love, love, meant, old, version, new, f...
10    [surprised, much, enjoy, fragrance, much, talk...
11    [pretty, synthetic, smell, scent, ive, find, s...
12    [smell, different, expect, little, light, less...
13    [lightly, sweet, jasmine, spicy, warm, amber, ...
14    [wow, adamant, try, one, use, eat, year, back,...
15                               [alien, banger, thats]
16    [first, full, bottle, purchase, get, perfumes,...
17    [hate, men, fragrance, mean, say, stay, aw

In [124]:
def preprocess_df(df: pd.DataFrame, col_name, stopwords=frozenset({' '})):
    """Run the preprocessing pipeline on a DataFrame column, keeping each stage.

    Every intermediate result is stored as a new column on *df* so the
    effect of each stage can be inspected:

    * ``tokenized``          - lowercased tokens
    * ``only_english``       - tokens after character filtering and trimming
    * ``longer_than_2_A``    - tokens with more than 2 characters
    * ``stopwords_removed``  - tokens with stopwords removed
    * ``lemmatizated``       - lemmatized tokens with more than 2 characters

    Args:
        df: DataFrame to process; it is modified in place.
        col_name: name of the column holding the raw documents.
        stopwords: custom stopwords forwarded to ``remove_stopwords``.
            The default is an immutable frozenset so it cannot be shared
            and mutated across calls.

    Returns:
        The same *df* object, with the columns above added.
    """
    def _tokenize_lower(doc):
        # Tokenize one document and lowercase every token.
        return [to_small_letter(w) for w in tokenize(doc)]

    def _filter_and_trim(words):
        # Drop characters matched by the module-level pattern, then trim.
        return [trim(except_non_english(pattern, w)) for w in words]

    def _longer_than_2(words):
        # Keep only tokens with more than 2 characters.
        return [w for w in words if len(w) > 2]

    def _drop_stopwords(words):
        return remove_stopwords(words, stopwords)

    tokenized = df[col_name].map(_tokenize_lower)
    df['tokenized'] = tokenized

    only_english = tokenized.map(_filter_and_trim)
    df['only_english'] = only_english

    longer_than_2 = only_english.map(_longer_than_2)
    df['longer_than_2_A'] = longer_than_2

    stopwords_removed = longer_than_2.map(_drop_stopwords)
    df['stopwords_removed'] = stopwords_removed

    # NOTE: the spelling-correction stage ('orthography' column, via
    # correction()) is currently disabled in this DataFrame pipeline.

    df['lemmatizated'] = stopwords_removed.map(lemmatize_with_pos).map(_longer_than_2)
    return df



In [127]:
import json
# Read the custom stopword list. The context manager guarantees the file
# handle is closed once parsing finishes.
with open('../dataset/stopwords.json', 'r') as stopwords_file:
    loaded_stopwords = json.load(stopwords_file)

# Bare expression so the notebook still displays the loaded list.
loaded_stopwords


['sur',
 'libre',
 'muse',
 'serge',
 'eau',
 'maison',
 'hermes',
 'greatness',
 'angels',
 'molecule',
 'poison',
 'gio',
 'cedar',
 'sauvage',
 'coco',
 'lab',
 'crystal',
 'alien',
 'mugger',
 '2011',
 'martin',
 'moschino',
 'good',
 'memoirs',
 'est',
 'one',
 'kurkdjian',
 'marly',
 'allure',
 'laurent',
 "l'interdit",
 'chloe',
 'fahrenheit',
 'herrera',
 'arma',
 'versace',
 'tom',
 'guiltier',
 'montage',
 'baccarat',
 'dune',
 'givenchy',
 'nuit',
 'into',
 'boise',
 'girl',
 'jardin',
 'cherub',
 'prada',
 'gucci',
 'byred',
 'explorer',
 'oud',
 'tuscan',
 'parfum',
 'calvin',
 'club',
 'lutes',
 'salt',
 'anthology',
 "l'air",
 'london',
 'giorgio',
 'belle',
 'delia',
 'ombre',
 'christian',
 'extract',
 'bal',
 'paco',
 'encore',
 'agents',
 'and',
 'cabana',
 'molecules',
 'davidoff',
 'yves',
 'desert',
 'homme',
 'clayton',
 'the',
 'tendre',
 'frederic',
 'saint',
 'dior',
 'gig',
 'ravager',
 'income',
 'chanel',
 'grand',
 'dylan',
 "l'imperatrice",
 'must',
 'vie