In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords


In [32]:
data = pd.read_csv('data.csv')

stop = stopwords.words('english')
x = data.description
y = data.points

In [33]:
corpus = []
IGNORE = False
for i in range(len(x)):
    if IGNORE == False:
        review = x[i].split(" ")
        review = [word for word in review if word not in stop]
        review = ' '.join(review)
        corpus.append(review)
    else:
        review = [i].split(" ")
        review = [word for word in review]
        review = ' '.join(review)
        corpus.append(review)    


In [34]:
corpus

['This tremendous 100% varietal wine hails Oakville aged three years oak. Juicy red-cherry fruit compelling hint caramel greet palate, framed elegant, fine tannins subtle minty tone background. Balanced rewarding start finish, years ahead develop nuance. Enjoy 2022–2030.',
 'Ripe aromas fig, blackberry cassis softened sweetened slathering oaky chocolate vanilla. This full, layered, intense cushioned palate, rich flavors chocolaty black fruits baking spices. A toasty, everlasting finish heady ideally balanced. Drink 2023.',
 'Mac Watson honors memory wine made mother tremendously delicious, balanced complex botrytised white. Dark gold color, layers toasted hazelnut, pear compote orange peel flavors, reveling succulence 122 g/L residual sugar.',
 "This spent 20 months 30% new French oak, incorporates fruit Ponzi's Aurora, Abetina Madrona vineyards, among others. Aromatic, dense toasty, deftly blends aromas flavors toast, cigar box, blackberry, black cherry, coffee graphite. Tannins polis

In [35]:
df = pd.DataFrame({"Description": corpus, "Points":y})

df

Unnamed: 0,Description,Points
0,This tremendous 100% varietal wine hails Oakvi...,96
1,"Ripe aromas fig, blackberry cassis softened sw...",96
2,Mac Watson honors memory wine made mother trem...,96
3,"This spent 20 months 30% new French oak, incor...",96
4,"This top wine La Bégude, named highest point v...",95
...,...,...
150925,Many people feel Fiano represents southern Ita...,91
150926,"Offers intriguing nose ginger, lime floral ele...",91
150927,This classic example comes cru vineyard called...,91
150928,"A perfect salmon shade, scents peaches, cherri...",90


In [36]:
bins = [80, 85, 90, 95, 100]
labels = [1, 2, 3, 4]


df['Points_Class'] = pd.cut(df['Points'], bins=bins, labels=labels, include_lowest=True)

df

Unnamed: 0,Description,Points,Points_Class
0,This tremendous 100% varietal wine hails Oakvi...,96,4
1,"Ripe aromas fig, blackberry cassis softened sw...",96,4
2,Mac Watson honors memory wine made mother trem...,96,4
3,"This spent 20 months 30% new French oak, incor...",96,4
4,"This top wine La Bégude, named highest point v...",95,3
...,...,...,...
150925,Many people feel Fiano represents southern Ita...,91,3
150926,"Offers intriguing nose ginger, lime floral ele...",91,3
150927,This classic example comes cru vineyard called...,91,3
150928,"A perfect salmon shade, scents peaches, cherri...",90,2


In [37]:
p1=len(df[df['Points_Class'] == 1])/len(df)
p2=len(df[df['Points_Class'] == 2])/len(df)
p3=len(df[df['Points_Class'] == 3])/len(df)
p4=len(df[df['Points_Class'] == 4])/len(df)

print(p1)
print(p2)
print(p3)
print(p4)


0.23592393824951965
0.5504869807195388
0.20520771218445638
0.008381368846485125


In [42]:
from nltk.tokenize import word_tokenize
from collections import Counter

In [43]:
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    return tokens

bag_of_words_per_class = {}

In [49]:
for points_class in df['Points_Class'].unique():
    class_df = df[df['Points_Class'] == points_class]
    class_descriptions = ' '.join(class_df['Description'])
    class_tokens = tokenize_text(class_descriptions)
    bag_of_words_per_class[points_class] = Counter(class_tokens)




In [50]:
for points_class, bag_of_words in bag_of_words_per_class.items():
    print(f"Points Class {points_class} Bag of Words:")
    print(bag_of_words)
    print()

Points Class 4 Bag of Words:
Counter({'wine': 1272, 'the': 722, 'it': 586, 'fruit': 582, 'flavors': 533, 'this': 492, 'tannins': 437, 'years': 378, 'black': 368, 'a': 321, 'rich': 311, 'acidity': 308, 'drink': 291, 'cabernet': 280, 'great': 271, 'ripe': 253, 'finish': 250, 'chocolate': 237, 'spice': 235, 'cherry': 227, 'dark': 222, 'structure': 219, 'sweet': 209, 'complex': 200, 'oak': 196, 'age': 190, 'fruits': 189, 'dense': 184, 'now': 180, 'dry': 175, 'shows': 174, 'barrel': 173, 'vintage': 164, 'yet': 161, 'blackberry': 157, 'there': 156, 'richness': 155, 'aromas': 152, 'concentrated': 152, 'palate': 150, 'sample': 149, 'long': 146, 'power': 144, 'vineyard': 143, 'notes': 139, 'cassis': 138, 'red': 135, 'aging': 133, 'texture': 132, 'pinot': 129, 'balance': 128, 'best': 128, 'powerful': 128, 'smoky': 128, 'one': 126, 'many': 125, 'delicious': 124, 'fine': 123, 'well': 121, 'cherries': 119, 'intense': 114, 'beautiful': 114, 'balanced': 113, 'also': 112, 'new': 111, 'firm': 111, 'com

In [53]:
bag_of_words_per_class 

{4: Counter({'wine': 1272,
          'the': 722,
          'it': 586,
          'fruit': 582,
          'flavors': 533,
          'this': 492,
          'tannins': 437,
          'years': 378,
          'black': 368,
          'a': 321,
          'rich': 311,
          'acidity': 308,
          'drink': 291,
          'cabernet': 280,
          'great': 271,
          'ripe': 253,
          'finish': 250,
          'chocolate': 237,
          'spice': 235,
          'cherry': 227,
          'dark': 222,
          'structure': 219,
          'sweet': 209,
          'complex': 200,
          'oak': 196,
          'age': 190,
          'fruits': 189,
          'dense': 184,
          'now': 180,
          'dry': 175,
          'shows': 174,
          'barrel': 173,
          'vintage': 164,
          'yet': 161,
          'blackberry': 157,
          'there': 156,
          'richness': 155,
          'aromas': 152,
          'concentrated': 152,
          'palate': 150,
          'sample'