In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from nltk.tag import pos_tag
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer


In [3]:
df= pd.read_csv("really_cleaned_wine_data.csv")

In [4]:
df['variety'] = df['variety'].str.lower()

In [5]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('R'):
        return 'r'
    elif treebank_tag.startswith('J'):
        return 'a'
    else:
        return 'n'

def cleaning(sentence):


    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers


    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '') ## remove punctuation

    tokenized_sentence = word_tokenize(sentence) ## tokenize
    stop_words = set(stopwords.words('english')) ## define stopwords

    tokenized_sentence_cleaned = [ ## remove stopwords
        w for w in tokenized_sentence if not w in stop_words
    ]

    lemmatized = [
        WordNetLemmatizer().lemmatize(word, pos=get_wordnet_pos(pos_tag))
        for word, pos_tag in pos_tag(tokenized_sentence_cleaned)
    ]


    cleaned_sentence = ' '.join(word for word in lemmatized)

    return cleaned_sentence

In [6]:
df["processed_description"] = df["description"].apply(cleaning)

In [20]:
# Aromas
fruity = ["apple", 'apricot', 'berry', 'berry fruit', 'black cherry', 'black currant', 'black fruit', 'blackberry', 'blueberry', 'cassis', 'cherry', "citrus", 'cranberry', 'currant', 'fruit', 'fruity', 'grape', 'grapefruit', 'lemon', 'lime', 'melon', 'orange', 'pineapple', 'raspberry', 'red berry', 'red fruit', "peach", "pear", "raspberry", "blueberry", "strawberry", "orange", "lime", "plum", "fruity", "fruit flavor", "tangerine"]
spicy = ['anise', 'black pepper', 'cinnamon', 'pepper', 'licorice', 'spice', 'spicy', 'clove', 'peppery']
herbal = ['green', 'herb', 'herbs', 'herbal', 'violet']
oaky = ['almond', 'vanilla', 'cedar', 'earth', 'earthy', 'leather', 'oak', 'wood', 'toasty', 'oaky', 'caramel', 'cola', 'roast', 'meat', 'smoky', 'smoke', 'tobacco', 'bake', 'savory']
chocolate_coffee = ['chocolate', 'coffee', 'espresso', 'mocha']
floral =  ['floral', 'flower', 'blossom', 'violet']
# Body
light_bodied = ["light", "lightly", "smooth"]
medium_bodied = ['medium', 'mediumbodied', 'juicy']
full_bodied = ['fullbodied', 'full', 'dense', 'rich', 'richness', 'bold', 'thick', 'chewy', 'lush', 'syrah']
# Texture
soft_texture = ['soft', 'mature']
creamy_texture = ['creamy']
structured_texture = ['structure', 'tannic']
silky_texture = ['silky', 'velvety', 'lush']
# Sweetness
sweet = ['sweet', 'honey', 'candy', 'jammy', 'sweetness', 'sugar']
dry = ['acid', 'acidity', 'dry', 'off-dry', 'tart', 'mineral', 'crisp', 'bitter', 'minerality', 'refresh', 'tangy', 'zest', 'zesty', 'bright', 'stone']


In [21]:
def encode_columns(row):
    # Sweet or dry
    sweet_words = [keyword for keyword in row['processed_description'] if keyword in sweet]
    dry_words = [keyword for keyword in row['processed_description'] if keyword in dry]

    if len(sweet_words) > len(dry_words):
        sweet_wine, dry_wine = 1, 0
    if len(dry_words) > len(sweet_words):
        sweet_wine, dry_wine = 0, 1
    else:
        sweet_wine, dry_wine = 0, 0

    # Aromas

    fruity_aroma = 1 if any(keyword in row['processed_description'] for keyword in fruity) else 0
    spicy_aroma = 1 if any(keyword in row['processed_description'] for keyword in spicy) else 0
    herb_aroma = 1 if any(keyword in row['processed_description'] for keyword in herbal) else 0
    oak_aroma = 1 if any(keyword in row['processed_description'] for keyword in oaky) else 0
    chocolate_aroma = 1 if any(keyword in row['processed_description'] for keyword in chocolate_coffee) else 0
    floral_aroma = 1 if any(keyword in row['processed_description'] for keyword in floral) else 0


    # Encoding body_of_the_wine based on keywords
    body_light = 1 if any(keyword in row['processed_description'] for keyword in light_bodied) else 0
    body_medium = 1 if any(keyword in row['processed_description'] for keyword in medium_bodied) else 0
    body_full = 1 if any(keyword in row['processed_description'] for keyword in full_bodied) else 0

    # Encoding the texture

    soft_tex = 1 if any(keyword in row['processed_description'] for keyword in soft_texture) else 0
    creamy_tex = 1 if any(keyword in row['processed_description'] for keyword in creamy_texture) else 0
    structured_tex = 1 if any(keyword in row['processed_description'] for keyword in structured_texture) else 0
    silky_tex = 1 if any(keyword in row['processed_description'] for keyword in silky_texture) else 0

    return pd.Series([sweet_wine, dry_wine, fruity_aroma, spicy_aroma, herb_aroma, oak_aroma, floral_aroma,chocolate_aroma, body_light, body_medium, body_full, soft_tex, creamy_tex, structured_tex, silky_tex])

# Apply the encoding function to create new columns
df[['dry_wine','sweet_wine','fruity_aroma', 'spicy_aroma', 'herb_aroma', 'oak_aroma', 'chocolate_aroma', 'floral_aroma','body_light', 'body_medium', 'body_full', 'soft_tex', 'creamy_tex', 'structured_tex','silky_tex']] = df.apply(encode_columns, axis=1)


In [9]:
red_wines = ['portuguese red','tinta de toro', 'austrian red blend','tinta cao','tinto del pais','sptburgunder','cabernet sauvignonmerlotshiraz','tannatmerlot',
'malbeccarmnre','bastardo','malbec blend','syrahmerlot','touriga franca','merlotcabernet','carienagarnacha','cabernet sauvignoncarmenre','vidadillo','malbeccabernet franc','merlottannat','coda di volpe','blaufrnkisch','shirazmalbec','freisa',
 'provence red blend','duras','pinotagemerlot', 'cabernet sauvignonmalbec','franconia','rufete','tempranillosyrah','traminer','carignansyrah','nebbiolo','aglianico','cabernet francmerlot',
 'cabernet sauvignonshiraz','gamay','malbectempranillo','rhnestyle red blend','tintilia','vinho','aragons','sangiovese grosso','garnachacabernet','garnachamonastrell','kotsifali','primitivo',
 'shiraz','shirazroussanne','nerello cappuccio','cabernetmalbec','colorino','syrahmourvdre','merlotargaman','bobal','syrahviognier','counoise','grenache blend','syrahbonarda',
 'vespolina','mavrodaphne','pinot noirgamay', 'lambrusco','red blend','bobalcabernet sauvignon','rosado','cabernet blend','syrahcabernet sauvignon', 'st vincent',
 'mavrotragano','garnachasyrah','grenacheshiraz','gsm','dornfelder', 'raboso','uva di troia','malbec','tinta negra mole','grignolino','petite sirah','pignolo','castelo',
 'mavrokalavryta','centesimino','malbectannat','incrocio manzoni','tannatsyrah','chelois','pallagrello','barbera','carmenresyrah','carinea','pinot nero',
 'prugnolo gentile','touriga nacional','claret','poulsard','jaen','mandilaria','tempranillo','zweigelt','bombino nero','rebo','mataro','maturana',
 'tinta madeira','tinta amarela','cabernet franc','shirazcabernet','tannatcabernet franc','shirazgrenache','cabernet sauvignoncabernet franc','durella','tinta barroca','merlotsyrah',
 'prieto picudo', 'shirazmourvdre','syrah','tannat','graciano','passerina','malbecpetit verdot','teroldego rotaliano','barberanebbiolo','garnachatempranillo','nero davola',
 'syrahgrenache','lagrein','tinto fino','aligot','carmenre','malbeccabernet sauvignon','nerello mascalese','pugnitello',
 'lambrusco di sorbara','tempranillomerlot','canaiolo','shirazcabernet sauvignon','alicante','grolleau','cabernet sauvignonsyrah','marselan','pinotage','brachetto','merlotcabernet sauvignon',
 'nasco','frappato','negroamaro','shiraztempranillo','montepulciano','merlotmalbec','sangiovesesyrah','pinot noirsyrah','monastrellsyrah','nascetta','garnachacariena','carcajolu',
 'negrette','cabernet','trollinger','cabernet sauvignon','casavecchia','garnacha blend','molinara','monastrell','vermentino nero','bovale','cabernet merlot','sirica',
 'tinta roriz','marzemino','merlot','xinomavro','syrahmalbec','carignan','cabernet sauvignontempranillo','saperavi','alicante bouschet','carignangrenache','merlotgrenache',
 'trepat','petite verdot','syrahtempranillo','zinfandel','tinta fina','sangiovese cabernet','pigato','bonarda','corvina rondinella molinara','baga','grenachemourvdre','chambourcin',
 'petit verdot','baco noir','sangiovesecabernet sauvignon','shirazviognier','aragonez','cabernet franclemberger','charbono','babosa negro','groppello','dolcetto',
 'meritage','trincadeira','argaman','merlotshiraz','bagatouriga nacional','nero di troia','cabernetsyrah','norton','loin de loeil','cinsault','merlotcabernet franc','durif','cabernetshiraz',
 'agiorgitiko','piedirosso','susumaniello','aleatico','cabernet pfeffer','cabernet franccarmenre','cabernet franccabernet sauvignon','gaglioppo','listn negro','malbeccabernet','ciliegiolo',
 'malbecsyrah','malbecbonarda','nielluciu','refosco','ruch','mavroudi','mondeuse','prunelard','grenachecarignan','carignano','timorasso','corvina','sangiovese','tinta francisca',
 'cabernet sauvignonmerlot','tempranillocabernet sauvignon','tempranilloshiraz','pinot noir','touriga nacionalcabernet sauvignon','cabernet sauvignonbarbera','carignane','garnacha tintorera','malvasia nera','syrahpetite sirah',
 'cabernet sauvignonsangiovese','gamay noir','bordeauxstyle red blend','grenachesyrah','syrahcabernet franc','tempranillogarnacha','magliocco',
 'tinto velasco','frburgunder','syrahcarignan','cannonau','cesanese','monastrellpetit verdot','syrahcabernet','tinta del pais','mourvdre','cabernet sauvignon grenache','syrahgrenacheviognier','tempranillo blanco','grenache',
 'pallagrello nero','lambrusco grasparossa','cesanese daffile','schiava','cabernet francmalbec','grenache noir','menca','merlotpetite verdot', 'parraleta','syrahpetit verdot','mourvdresyrah',
 'mauzac','tannatcabernet','trousseau','carmenrecabernet sauvignon','tempranillo blend','sagrantino','mazuelo','malbecmerlot', 'francisa']

In [10]:
white_wines = ['clairette','muskat', 'malagousia','pinot blancviognier','neuburger','garnacha','pas','torbato','forcall','apple','zibibbo','gelber traminer','elbling','roter veltliner',
 'gros and petit manseng','chardonnaysauvignon','orange muscat','assyrtiko','xarello','roviello','pinot grigiosauvignon blanc','glera','anto vaz','inzolia',
'gragnano','madeleine angevine','alvarelho','verdejoviura','tocai','trousseau gris','albana','cayuga','muscatel',
 'moscatel','gros manseng', 'macabeo','gouveio','mantonico','white riesling','jampal','auxerrois','rkatsiteli', 'perricone','rieslingchardonnay','muscat of alexandria','sauvignon blancverdejo',
 'malvasiaviura','pinot blancchardonnay','pinot bianco','moscato di noto','abouriou','vidal','ondenc','colombard','chardonnaysauvignon blanc',
 'chenin blancsauvignon blanc','chardonel','vernaccia','thrapsathiri','airen','dafni','hondarrabi zuri','blauburgunder','muscat dalexandrie','alsace white blend','alvarinho','savagnin',
 'furmint','white port','provence white blend','asprinio','manzoni','vignoles','moscatel grado','tokay','sideritis','meseguera','braucol','pinotchardonnay',
 'roussannemarsanne','marawi','monica','vitovska','malvasia','pallagrello bianco','chardonnayriesling','grillo','viognier','alfrocheiro','vidal blanc','schwartzriesling','nosiola',
 'assyrtico','trebbiano','smillon','cortese','melon','sylvaner','catalanesca','nero davola','pinot auxerrois','gewrztraminer','cercial',
 'muscadelle','favorita','sauvignon blancchardonnay','gros plant','grenache blanc', 'gelber muskateller', 'bical','moscato giallo','malagouzia','muscat canelli','semillonchardonnay',
 'chardonnaypinot blanc','altesse','muscat blanc','chardonnaysemillon','silvaner','viogniergrenache blanc','verdicchio','chardonnaypinot gris','athiri',
 'cococciola','loin de loeil','colombardsauvignon blanc','sauvignonsmillon','orangetraube','chasselas','morillon','ferno pires','rosenmuskateller','sauvignon gris',
 'macabeochardonnay', 'uvalino','verdeca','premsal','zierfandlerrotgipfler','ugni blanccolombard','folle blanche','blanc du bois','grner veltliner',
 'semillon','verduzzo','muscat blanc petits grains','pri blanc','austrian white blend','grauburgunder','tokay pinot gris',
 'roditismoschofilero','garganega','roussanne','greco bianco','ribolla gialla','viuraverdejo','muscadel','grenache gris', 'pinot grigio','fer servadou',
 'pinot meunier','picapoll','viognierchardonnay','pinot grisgewrztraminer','chancellor','colombardugni blanc','sauvignon musqu','smling',
 'malagouziachardonnay','petit manseng','malvasia istriana','grechetto','muskat ottonel','zierfandler','cdega do larinho','teroldego','palomino','st laurent','erbaluce',
 'maria gomes','loureiro','siegerrebe','carricante','marsanneviognier','ansonica','nuragus','verdosilla','chardonnay weissburgunder','trajadura','debit',
 'malvasia fina','plyto','mllerthurgau','viogniervaldigui','pinot blancpinot noir','aligot','bual','malvasia bianca','sauvignon',
 'roussannegrenache blanc','johannisberg riesling','romorantin','pecorino','avesso','trebbiano di lugana','verdelho','sciaccerellu','alvarinhochardonnay','sacy','pignoletto',
 'rolle','bombino bianco','pinot blanc','friulano','paralleda','cerceal','diamond','roter traminer','verdejo','catarratto','edelzwicker','touriga nacional blend',
 'treixadura','mansois','valvin muscat','malvasia di candia','trebbiano spoletino','ugni blanc','pansa blanca','malvar','chardonnay',
 'bordeauxstyle white blend','vermentino','savatiano','viogniergewrztraminer','sauvignon blancsemillon','tinta mida','grecanico','viura','arneis','doa blanca',
 'garnacha blanca','siria','albarossa','moscato','caprettone','chenin blancchardonnay','souso','sercial','azal','aidani','moschofilero','lemberger','viogniermarsanne',
 'viosinho','albario','verdil','verdejosauvignon blanc','petit courbu','white blend','robola','chenin blanc','insolia','moscatel de alejandra','weissburgunder',
 'gewrztraminerriesling','jacqure','vespaiolo','picpoul','encruzado','fum blanc','greco','merseguerasauvignon blanc','muskateller','roussanneviognier',
 'roditis','pinot gris','biancale','albanello','rivaner','rotgipfler','portuguese white','macabeomoscatel','chenin blancviognier','godello','silvanertraminer',
 'tocai friulano','vilana','chardonnayalbario','sauvignon blancassyrtiko','seyval blanc','sauvignon blanc','touriga','sauvignon blancchenin blanc','semillonsauvignon blanc',
 'loureiroarinto','rhnestyle white blend','fiano','chardonnayviognier','falanghina','maria gomesbical','viurachardonnay','biancolella','picolit',
 'viogniermarsanneroussanne','viognierroussanne','rabigato','riesling','verduzzo friulano ','torontel','traminette','arinto','marsanneroussanne','tinta del toro','piquepoul blanc','marsanne','rieslaner','torronts','welschriesling','rosato','kerner','turbiana']

In [11]:
rose_wines = ['rosato', 'portuguese ros', 'ros', 'marquette']

In [12]:
sparkling_wines = ['champagne blend','veltliner','lambrusco','portuguese sparkling','symphony','glera','lambrusco salamino','sparkling blend', 'prosecco']

In [13]:
dessert_wines = ['port','muscat','black monukka', 'late harvest','white port','sherry','moscato rosa', 'scheurebe','black muscat','morio muskat','sauternes','moscato',
 'moscadello','madeira blend','muscat dalexandrie','mission','muscat hamburg','souzao','torronts','valdigui','moscatel roxo','muscadine',
 'ice wine','moschofilerochardonnay','orange muscat','jacquez','pedro ximnez']

In [14]:
import re
from unidecode import unidecode

def clean_strings(input_list):

    # Step 1 & 2: Lower case each string and remove special characters
    cleaned_list = [re.sub(r'[^a-zA-Z0-9\s]', '', item.lower()) for item in input_list]

    # Step 3: Replace non-standard English letters with their standard equivalents
    cleaned_list = [unidecode(item) for item in cleaned_list]

    return cleaned_list

In [15]:
df['variety'] = clean_strings(df['variety'])

In [22]:
import numpy as np

conditions = [
    df['variety'].isin(red_wines),
    df['variety'].isin(white_wines),
    df['variety'].isin(sparkling_wines),
    df['variety'].isin(rose_wines),
    df['variety'].isin(dessert_wines)
]

# Define corresponding values
values = ['red', 'white', 'sparkling', 'rose', 'dessert']

# Create a new column based on the conditions
df['wine_type'] = np.select(conditions, values, default='Other')

In [23]:
len_rose = (df['wine_type'] == 'rose').sum()
len_red = (df['wine_type'] == 'red').sum()
len_white = (df['wine_type'] == 'white').sum()
len_sparkling = (df['wine_type'] == 'sparkling').sum()
len_dessert = (df['wine_type'] == 'dessert').sum()

print(f'The number of rose wines is {len_rose}.')
print(f'The number of red wines is {len_red}.')
print(f'The number of white wines is {len_white}.')
print(f'The number of sparkling wines is {len_sparkling}.')
print(f'The number of dessert wines is {len_dessert}.')

The number of rose wines is 1166.
The number of red wines is 33211.
The number of white wines is 18301.
The number of sparkling wines is 1998.
The number of dessert wines is 674.


In [24]:
# Saving the expanded dataframe as CSV

df.to_csv('data/expanded_dataframe.csv')

In [25]:
df

Unnamed: 0,country,description,points,price,province,region,title,variety,winery,processed_description,...,chocolate_aroma,floral_aroma,body_light,body_medium,body_full,soft_tex,creamy_tex,structured_tex,silky_tex,wine_type
0,US,The vineyard is one of the better Chardonnay s...,92,36.0,California,"Alexander Valley, Sonoma",Matrix 2007 Stuhlmuller Vineyard Chardonnay (A...,chardonnay,Matrix,vineyard one good chardonnay source alexander ...,...,0,0,0,0,1,0,0,0,0,white
1,US,Defines Rockpile Zinfandel in intensity of fru...,92,39.0,California,"Rockpile, Sonoma",Mauritson 2007 Rockpile Cemetary Vineyard Zinf...,zinfandel,Mauritson,defines rockpile zinfandel intensity fruit hig...,...,0,0,0,0,1,0,0,0,0,red
2,US,This sophisticated wine is mostly Cabernet Sau...,92,45.0,California,"Napa Valley, Napa",Silverado 2006 Cabernet Sauvignon (Napa Valley),cabernet sauvignon,Silverado,sophisticated wine mostly cabernet sauvignon a...,...,1,0,0,0,1,0,0,1,0,red
3,US,Give this young Cab time in the cellar to come...,91,78.0,California,"Napa Valley, Napa",Clark-Clauden 2007 Cabernet Sauvignon (Napa Va...,cabernet sauvignon,Clark-Clauden,give young cab time cellar come around right f...,...,0,0,0,0,0,0,0,1,0,red
4,US,"Thick and brooding, this dark, sweetly tannic ...",91,85.0,California,"Spring Mountain District, Napa",Terra Valentine 2013 K Block Cabernet Sauvigno...,cabernet sauvignon,Terra Valentine,thick brood dark sweetly tannic wine also offe...,...,0,0,0,0,1,0,0,1,0,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55345,Canada,"A brilliant blend of 61% Viognier, 21% Roussan...",93,35.0,British Columbia,"Okanagan Valley,",Le Vieux Pin 2012 Ava White (Okanagan Valley),rhnestyle white blend,Le Vieux Pin,brilliant blend viognier roussanne marsanne im...,...,0,0,1,0,0,0,0,0,0,white
55346,Canada,From a vintage that pushes the fruit out in fr...,93,85.0,British Columbia,"Okanagan Valley,",Le Vieux Pin 2012 Equinoxe Syrah (Okanagan Val...,syrah,Le Vieux Pin,vintage push fruit front load tart tangy black...,...,0,0,0,0,0,0,0,0,0,red
55347,Canada,The winery is perhaps best known for its Merlo...,87,20.0,British Columbia,"Okanagan Valley,",Tinhorn Creek 2012 Merlot (Okanagan Valley),merlot,Tinhorn Creek,winery perhaps best known merlot do overtly oa...,...,0,1,0,0,1,0,0,0,0,red
55348,Canada,A delicious though somewhat reserved wine with...,89,45.0,Ontario,"Niagara Peninsula,",Henry of Pelham 2011 Icewine Vidal (Niagara Pe...,vidal,Henry of Pelham,delicious though somewhat reserved wine note l...,...,0,0,0,0,1,0,0,0,0,white
