In [2]:
import numpy as np
import pandas as pd
import re
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Data 읽기 및 결측치 처리

In [2]:
# Load the raw wine-review table; the CSV path is resolved against the working directory.
df1 = pd.read_csv("winemag-data-130k-v2.csv")
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             129971 non-null  int64  
 1   country                129908 non-null  object 
 2   description            129971 non-null  object 
 3   designation            92506 non-null   object 
 4   points                 129971 non-null  int64  
 5   price                  120975 non-null  float64
 6   province               129908 non-null  object 
 7   region_1               108724 non-null  object 
 8   region_2               50511 non-null   object 
 9   taster_name            103727 non-null  object 
 10  taster_twitter_handle  98758 non-null   object 
 11  title                  129971 non-null  object 
 12  variety                129970 non-null  object 
 13  winery                 129971 non-null  object 
dtypes: float64(1), int64(2), object(11)


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
# Keep only the rows in which both "variety" and "price" are present.
df1 = df1.dropna(subset=["variety", "price"])
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120974 entries, 1 to 129970
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             120974 non-null  int64  
 1   country                120915 non-null  object 
 2   description            120974 non-null  object 
 3   designation            86196 non-null   object 
 4   points                 120974 non-null  int64  
 5   price                  120974 non-null  float64
 6   province               120915 non-null  object 
 7   region_1               101400 non-null  object 
 8   region_2               50292 non-null   object 
 9   taster_name            96479 non-null   object 
 10  taster_twitter_handle  91559 non-null   object 
 11  title                  120974 non-null  object 
 12  variety                120974 non-null  object 
 13  winery                 120974 non-null  object 
dtypes: float64(1), int64(2), object(11)


## "Description" Clean & lemmatize

In [4]:
def cleaning(argStr):
    """Normalise a raw description string before tokenisation.

    The text is lower-cased, every non-word character becomes a space,
    whitespace runs collapse to one space and a single trailing space is
    dropped. A leading space, if one results, is kept.

    Args:
        argStr: Raw text.

    Returns:
        The normalised text.
    """
    strSpaced = re.sub(r'\W', ' ', argStr.lower())
    strSquashed = re.sub(r'\s+', ' ', strSpaced)
    # After collapsing, at most one space can remain at the end.
    if strSquashed.endswith(' '):
        return strSquashed[:-1]
    return strSquashed

In [5]:
# Build one cleaned, stop-word-free, stemmed and lemmatised string per description.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Look the stop-word list up once and keep it as a set: evaluating
# stopwords.words('english') inside the per-token filter repeats the lookup for
# every token of every description.
setStopWord = set(stopwords.words('english'))
lstCleanSent=[]
for sentence in df1.description:
    # cleaning() already lower-cases the text, so no second lower() pass is needed.
    strWord = cleaning(sentence)
    strWord = re.sub('fruity', 'fruit', strWord)
    lstWord = nltk.word_tokenize(strWord)
    lstWord = [x for x in lstWord if x not in setStopWord]
    # NOTE(review): tokens are stemmed before they are lemmatised, so the
    # lemmatiser sees stems rather than words — confirm this order is intended.
    lstWord = [stemmer.stem(x) for x in lstWord]
    lstWord = [lemmatizer.lemmatize(x) for x in lstWord]
    lstCleanSent.append(" ".join(lstWord))
print(len(lstCleanSent))

KeyboardInterrupt: 

In [9]:
# Replace df1 with the cached frame that already carries the lemma / feature /
# NGram / featureNGram columns. The file name matches the
# df1.to_pickle("winengram.pkl") call further down, so on a fresh checkout that
# cell has to have been run once before this one can succeed.
df1 = pd.read_pickle('winengram.pkl')

In [10]:
# Preview the first rows of the reloaded frame.
df1.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,lemma,feature,NGram,featureNGram
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,ripe fruit wine smooth still structur firm tan...,ripe fruit wine firm tannin juici red berri f...,acid-alreadi alreadi-drinkabl although-certain...,berri-fruit firm-tannin fruit-wine readi-drin...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,tart snappi flavor lime flesh rind domin green...,green crisp acid wine,acid-underscor crisp-acid domin-green flavor-l...,crisp-acid
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,pineappl rind lemon pith orang blossom start a...,lemon aroma palat note,aroma-palat astring-semidri bit-opul blossom-s...,
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,much like regular bottl 2012 come across rathe...,wine,2012-come across-rather bottl-2012 characteris...,
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,blackberri raspberri aroma show typic navarran...,blackberri raspberri aroma green herb full bo...,acid-spici aroma-show blackberri-raspberri bod...,full-bodi


## Description에서 유용한 단어 선별(Tfidf)

In [11]:
# Number of rows whose "lemma" text contains the substring "come".
df1.loc[df1["lemma"].str.contains("come")].shape[0]

7275

In [12]:
# Stems excluded from the vocabulary in addition to the 50-feature cap.
# NOTE(review): 'years' is not in stemmed form and 'year' still appears among
# the selected features printed by this cell — confirm whether 'year' was meant.
lstStopWord = ['also','charact','come','still','drink','years','feel', 'finish',
               'flavor','like','made','show','hint','offer','give', 'good', 'structur']
vectMy = TfidfVectorizer(max_features = 50, stop_words = lstStopWord )

X = vectMy.fit_transform(df1['lemma']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
print(list(vectMy.get_feature_names_out() if hasattr(vectMy, "get_feature_names_out")
           else vectMy.get_feature_names()))

['acid', 'age', 'appl', 'aroma', 'balanc', 'berri', 'black', 'blackberri', 'blend', 'bodi', 'bright', 'cabernet', 'cherri', 'citru', 'concentr', 'crisp', 'dark', 'dri', 'firm', 'fresh', 'fruit', 'full', 'green', 'herb', 'juici', 'lemon', 'light', 'miner', 'nose', 'note', 'oak', 'palat', 'peach', 'pepper', 'plum', 'raspberri', 'red', 'rich', 'ripe', 'soft', 'spice', 'sweet', 'tannin', 'textur', 'touch', 'vanilla', 'well', 'white', 'wine', 'year']


In [13]:
# Vocabulary kept by the fitted vectorizer, as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
lstFeature = list(vectMy.get_feature_names_out() if hasattr(vectMy, "get_feature_names_out")
                  else vectMy.get_feature_names())

def makeFeature(argStrD, arLstFt):
    """Keep only the words of a lemma string that belong to a feature vocabulary.

    Args:
        argStrD: Lemma string whose words are separated by single spaces.
        arLstFt: Iterable of feature words to keep.

    Returns:
        The kept words in their original order (repeats included), each
        preceded by one space; "" when no word matches.
    """
    # Filter against the vocabulary that was passed in (the body used to read the
    # global lstFeature and ignore this argument); a set makes each lookup O(1).
    setFt = set(arLstFt)
    return "".join(" " + x for x in argStrD.split(" ") if x in setFt)

# Spot-check on the row labelled 100.
makeFeature(df1.loc[100, "lemma"], lstFeature)

' fresh appl lemon full bodi rich balanc white'

In [14]:
# Derive the "feature" column: every lemma string reduced to the selected vocabulary.
df1["feature"] = df1["lemma"].apply(makeFeature, args=(lstFeature,))
df1.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,lemma,feature,NGram,featureNGram
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,ripe fruit wine smooth still structur firm tan...,ripe fruit wine firm tannin juici red berri f...,acid-alreadi alreadi-drinkabl although-certain...,berri-fruit firm-tannin fruit-wine readi-drin...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,tart snappi flavor lime flesh rind domin green...,green crisp acid wine,acid-underscor crisp-acid domin-green flavor-l...,crisp-acid
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,pineappl rind lemon pith orang blossom start a...,lemon aroma palat note,aroma-palat astring-semidri bit-opul blossom-s...,
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,much like regular bottl 2012 come across rathe...,wine,2012-come across-rather bottl-2012 characteris...,
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,blackberri raspberri aroma show typic navarran...,blackberri raspberri aroma green herb full bo...,acid-spici aroma-show blackberri-raspberri bod...,full-bodi


## Lemma로 bigram 만들기 (CountVectorizer)

In [None]:
def makeNGram(argLstSent, argNum = 3):
    """Return the distinct word n-grams found in a list of texts.

    Args:
        argLstSent: Iterable of text documents (not a bare string).
        argNum: Number of words per n-gram.

    Returns:
        Sorted list of the distinct n-grams, words joined by one space.
        Empty when no document yields an n-gram.

    Raises:
        ValueError: If a single string is passed instead of an iterable of texts.
    """
    if isinstance(argLstSent, str):
        raise ValueError("Iterable over raw text documents expected, string object received.")
    # The analyzer runs CountVectorizer's own pre-processing, tokenisation and
    # n-gram generation without fitting. A text too short to form an n-gram
    # therefore gives [] instead of the "empty vocabulary" ValueError from fit(),
    # and the result no longer relies on get_feature_names(), which was removed
    # in scikit-learn 1.2.
    fnAnalyze = CountVectorizer(ngram_range=(argNum, argNum), analyzer="word").build_analyzer()
    setNGram = set()
    for strSent in argLstSent:
        setNGram.update(fnAnalyze(strSent))
    return sorted(setNGram)

# One list of bigrams per row of the "lemma" column.
lstNGram = df1["lemma"].apply(lambda x : makeNGram([x], 2))

In [None]:
# Lemma text of the row labelled 1, for comparison with its bigrams.
df1.loc[1, "lemma"]

In [None]:
# Bigram list produced for the row labelled 1.
lstNGram[1]

In [None]:
# Replace the spaces inside every bigram with "-" so one bigram reads as one token,
# then flatten each row's bigram list into a single space-separated string.
lstNGram2 = lstNGram.apply(lambda lstGram: [strGram.replace(" ", "-") for strGram in lstGram])
lstNGram3 = lstNGram2.apply(" ".join)
lstNGram3

In [None]:
# Store the hyphen-joined bigram strings as the "NGram" column.
df1["NGram"] = lstNGram3

## NGram에서 유용한 bigram 선별 (Tfidf)

In [None]:
lstStopWord = []
# The token pattern only accepts "word-word" tokens, so each hyphen-joined bigram
# in the "NGram" column is kept as one token instead of being split at the "-".
vectMy = TfidfVectorizer(max_features = 50, token_pattern=r"(?u)\b\w\w+-\w+\b", stop_words = lstStopWord )

X = vectMy.fit_transform(df1['NGram']).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
print(list(vectMy.get_feature_names_out() if hasattr(vectMy, "get_feature_names_out")
           else vectMy.get_feature_names()))

In [None]:
# Bigram vocabulary kept by the fitted vectorizer, as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when the installed version has it and fall back to the old name otherwise.
lstFeatureNGram = list(vectMy.get_feature_names_out() if hasattr(vectMy, "get_feature_names_out")
                       else vectMy.get_feature_names())

def makeFeatureNGram(argStrLemma, argLstFt):
    """Pick the hyphen-joined n-gram features that occur in a lemma string.

    Args:
        argStrLemma: Lemma string whose words are separated by single spaces.
        argLstFt: Iterable of n-gram features whose words are joined by "-".

    Returns:
        The matching features in argLstFt order, each preceded by one space;
        "" when nothing matches.
    """
    # Pad both sides with a space so that an n-gram only matches on whole-word
    # boundaries: "red berri" must not be found inside "tired berri".
    strPadded = " " + argStrLemma + " "
    strRes = ""
    for x in argLstFt:
        strWords = x.replace("-", " ")
        if " " + strWords + " " in strPadded:
            strRes += " " + strWords.replace(" ", "-")
    return strRes
# Spot-check on the row labelled 1: print its lemma text, then show the features found in it.
print(df1.loc[1, "lemma"])
makeFeatureNGram(df1.loc[1, "lemma"], lstFeatureNGram)

In [None]:
# Derive the "featureNGram" column: the selected bigrams found in each lemma string.
df1["featureNGram"] = df1["lemma"].apply(makeFeatureNGram, args=(lstFeatureNGram,))
df1.head()

In [None]:
# Frequency of each distinct "featureNGram" string (the method is value_counts;
# the misspelt "value_countss" raised AttributeError).
df1["featureNGram"].value_counts()

In [None]:
# Cache the enriched frame; the read_pickle cells in this notebook load this same file name.
df1.to_pickle("winengram.pkl")

In [3]:
# Start the search section from the cached frame instead of recomputing the columns.
# Only load a pickle that this notebook wrote itself: unpickling can run arbitrary code.
df1 = pd.read_pickle("winengram.pkl")
df1.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,lemma,feature,NGram,featureNGram
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,ripe fruit wine smooth still structur firm tan...,ripe fruit wine firm tannin juici red berri f...,acid-alreadi alreadi-drinkabl although-certain...,berri-fruit firm-tannin fruit-wine readi-drin...
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,tart snappi flavor lime flesh rind domin green...,green crisp acid wine,acid-underscor crisp-acid domin-green flavor-l...,crisp-acid
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,pineappl rind lemon pith orang blossom start a...,lemon aroma palat note,aroma-palat astring-semidri bit-opul blossom-s...,
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,much like regular bottl 2012 come across rathe...,wine,2012-come across-rather bottl-2012 characteris...,
5,5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,blackberri raspberri aroma show typic navarran...,blackberri raspberri aroma green herb full bo...,acid-spici aroma-show blackberri-raspberri bod...,full-bodi


In [4]:
strBase = "ripe fruit wine firm tannin juici red berri"
lstTar = ["ripe",  "green"]

def isInStr(argLstTar, argStrBase):
    """Tell whether at least one target occurs as a substring of the base string.

    Args:
        argLstTar: Iterable of strings to look for.
        argStrBase: String to search in.

    Returns:
        True when any target is contained in argStrBase, otherwise False.
    """
    return any(strTar in argStrBase for strTar in argLstTar)

isInStr(lstTar, strBase)

True

In [5]:
# Example query: wanted feature words and an exclusive [low, high] price range.
lstFind = {"feature": ["ripe", "green", "acid"], "price":[30, 50]}

def cntFeature(argLstTarFt, argStrFt):
    """Count how many target features occur as whole words in a feature string.

    Args:
        argLstTarFt: Iterable of wanted feature words.
        argStrFt: Feature string whose words are separated by whitespace.

    Returns:
        Number of entries of argLstTarFt that are words of argStrFt.
    """
    # Compare against whole words: a plain substring test would count "black"
    # or "berri" for a string that only contains "blackberri".
    setWord = set(argStrFt.split())
    return sum(1 for x in argLstTarFt if x in setWord)

# Spot-check on the row labelled 2: print its feature string, then count the wanted features in it.
print(df1.loc[2, "feature"])
cntFeature( lstFind["feature"], df1.loc[2, "feature"])

 green crisp acid wine


2

In [12]:
lstFind = {"feature": ["ripe", "green"], "price":[30, 50]}
def find(argDicTarget, argDfData):
    """Select the wines that mention a wanted feature and lie inside a price range.

    Args:
        argDicTarget: Query dict with "feature" (list of wanted feature words)
            and "price" ([low, high], both bounds exclusive).
        argDfData: DataFrame with "feature" and "price" columns.

    Returns:
        The rows of argDfData that satisfy both conditions.
    """
    # NOTE(review): a later cell defines find() again with an extra "variety" filter.
    lstTarFt = argDicTarget["feature"]
    numLow, numHigh = argDicTarget["price"][0], argDicTarget["price"][1]
    mskFeature = argDfData["feature"].apply(lambda x : isInStr(lstTarFt, x)).astype(bool)
    # Compare the prices as stored: int() truncation would drop e.g. 30.5 from (30, 50).
    mskPrice = (argDfData["price"] > numLow) & (argDfData["price"] < numHigh)
    # Index the frame that was passed in, not the global df1.
    return argDfData[mskFeature & mskPrice]

# Run the example query and preview the first matches.
find(lstFind, df1).head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,lemma,feature,NGram,featureNGram
121,121,US,The vineyard is one of the better Chardonnay s...,Stuhlmuller Vineyard,92,36.0,California,Alexander Valley,Sonoma,,,Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,Chardonnay,Matrix,vineyard one better chardonnay sourc alexand v...,ripe wine oak vanilla balanc rich,add-butter alexand-valley balanc-classic bette...,
125,125,South Africa,Etienne Le Riche is a total Cabernet specialis...,Cabernet Sauvignon Reserve,91,45.0,Stellenbosch,,,Roger Voss,@vossroger,Le Riche 2003 Cabernet Sauvignon Reserve Caber...,Cabernet Sauvignon,Le Riche,etienn le rich total cabernet specialist tini ...,rich cabernet wine cabernet fruit wine black ...,000-case 2003-fruit across-wine best-sen black...,black-plum ripe-fruit
126,126,France,Mid-gold color. Pronounced and enticing aromas...,Vendages Tardives,91,48.0,Alsace,Alsace,,,,Pierre Sparr 2007 Vendages Tardives Gewurztram...,Gewürztraminer,Pierre Sparr,mid gold color pronounc entic aroma ripe citru...,aroma ripe citru fruit note touch sweet spice...,acid-medium aroma-ripe bodi-eleg citru-stone c...,fresh-acid medium-bodi stone-fruit
173,173,US,This wine is bursting with gorgeous cherry fru...,,91,38.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Le Cadeau 2014 Pinot Noir (Willamette Valley),Pinot Noir,Le Cadeau,wine burst gorgeou cherri fruit forward luscio...,wine cherri fruit tannin ripe acid fruit textur,acid-perfect buoy-upon burst-gorgeou cherri-fr...,cherri-fruit fruit-flavor
184,184,US,This is a typical Sonoma Coast Chard in the br...,,88,35.0,California,Sonoma Coast,Sonoma,,,Anaba 2007 Chardonnay (Sonoma Coast),Chardonnay,Anaba,typic sonoma coast chard bright acid firm ripe...,bright acid firm ripe peach note spice,acid-firm bright-acid buttercream-cinnamon cha...,bright-acid


In [18]:
lstFind = {"feature": ["ripe", "green", "fruit"],"variety" : ['Blend'], "price":[30, 50]}
def find(argDicTarget, argDfData):
    """Filter wines by feature, variety and price, best feature match first.

    Args:
        argDicTarget: Query dict with
            "feature": list of wanted feature words (at least one must match),
            "variety": optional list of strings, one of which must occur in
                the row's variety,
            "price": [low, high], both bounds exclusive.
        argDfData: DataFrame with "feature", "price" and "points" columns, plus
            "variety" when the query filters on it.

    Returns:
        A copy of the matching rows with an extra "tmp" column (number of
        wanted features found), sorted by "tmp" descending, then "price"
        ascending, then "points" descending.
    """
    lstTarFt = argDicTarget["feature"]
    numLow, numHigh = argDicTarget["price"][0], argDicTarget["price"][1]

    # Count once per row; the same count drives both the filter and the ranking,
    # so the two cannot disagree.
    serCnt = argDfData["feature"].apply(lambda x: cntFeature(lstTarFt, x))
    # Compare the prices as stored: int() truncation would drop e.g. 30.5 from (30, 50).
    mskKeep = (serCnt > 0) & (argDfData["price"] > numLow) & (argDfData["price"] < numHigh)
    if "variety" in argDicTarget:
        mskVariety = argDfData["variety"].apply(lambda x : isInStr(argDicTarget["variety"], x))
        mskKeep = mskKeep & mskVariety.astype(bool)

    # Index the frame that was passed in, not the global df1.
    dfTmp = argDfData[mskKeep].copy()
    dfTmp["tmp"] = serCnt[mskKeep].to_numpy()

    return(dfTmp.sort_values(by=["tmp", "price", "points"], ascending=[False, True, False]))
    
# Run the example query and show a readable subset of columns, with "region_1" displayed as "region".
find(lstFind, df1)[['country', 'description', 'designation','tmp','variety', 'points',
                    'price', 'province', 'region_1', 'taster_name',
                    'title', 'winery']].rename(columns={"region_1":"region"})

Unnamed: 0,country,description,designation,tmp,variety,points,price,province,region,taster_name,title,winery
13418,France,"A big, fruity bold wine, packed with the ripes...",Pêche au Carrelet,3,Bordeaux-style White Blend,90,33.0,Bordeaux,Graves,Roger Voss,Château Haut-Peyrous 2008 Pêche au Carrelet (...,Château Haut-Peyrous
27789,France,"A big, fruity bold wine, packed with the ripes...",Pêche au Carrelet,3,Bordeaux-style White Blend,90,33.0,Bordeaux,Graves,Roger Voss,Château Haut-Peyrous 2008 Pêche au Carrelet (...,Château Haut-Peyrous
45459,France,The estate is within the urban area of Bordeau...,,3,Bordeaux-style Red Blend,93,35.0,Bordeaux,Pessac-Léognan,Roger Voss,Château Picque Caillou 2014 Pessac-Léognan,Château Picque Caillou
115460,France,"Crisp in sliced green-apple fruit, this Champa...",Brut,3,Champagne Blend,90,35.0,Champagne,Champagne,Roger Voss,Thiénot NV Brut (Champagne),Thiénot
117710,Italy,A pretty mineral note appears at the very begi...,Grale,3,Red Blend,87,35.0,Tuscany,Bolgheri,,Le Grascete 2009 Grale (Bolgheri),Le Grascete
...,...,...,...,...,...,...,...,...,...,...,...,...
16927,US,"Dry and intense in tannins, this blend of 50% ...",,1,Rhône-style Red Blend,86,49.0,California,North Coast,Virginie Boone,Honor Winery 2010 Red (North Coast),Honor Winery
31807,Spain,"Mature and sound on the bouquet, but more snap...",Reserva,1,Tempranillo Blend,86,49.0,Northern Spain,Ribera del Duero,Michael Schachner,Resalte 2001 Reserva (Ribera del Duero),Resalte
35756,US,A good but average red wine for drinking now. ...,Estate,1,Red Blend,86,49.0,California,Diamond Mountain District,,A.S. Kiken 2009 Estate Red (Diamond Mountain D...,A.S. Kiken
25073,France,"Barrel aged for 12 months, this 95% Merlot win...",,1,Bordeaux-style Red Blend,85,49.0,Bordeaux,Saint-Émilion,Roger Voss,Château Valade 2012 Saint-Émilion,Château Valade


In [63]:
# List the columns currently present in df1.
df1.columns

Index(['Unnamed: 0', 'country', 'description', 'designation', 'points',
       'price', 'province', 'region_1', 'region_2', 'taster_name',
       'taster_twitter_handle', 'title', 'variety', 'winery', 'lemma',
       'feature', 'NGram', 'featureNGram'],
      dtype='object')

In [None]:
# NOTE(review): this cell held a bare list of names, which is not valid Python.
# Presumably these are columns to drop or revisit — confirm intent:
#   "Unnamed: 0", "taster_twitter_handle", "region_2", "features"