In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap


In [3]:
data = pd.read_csv('data.csv')

stop = stopwords.words('english')
x = data.description
y = data.points

In [4]:
corpus = []
IGNORE = False
for i in range(len(x)):
    if IGNORE == False:
        review = x[i].split(" ")
        review = [word for word in review if word not in stop]
        review = ' '.join(review)
        corpus.append(review)
    else:
        review = [i].split(" ")
        review = [word for word in review]
        review = ' '.join(review)
        corpus.append(review)    


In [5]:
corpus

['This tremendous 100% varietal wine hails Oakville aged three years oak. Juicy red-cherry fruit compelling hint caramel greet palate, framed elegant, fine tannins subtle minty tone background. Balanced rewarding start finish, years ahead develop nuance. Enjoy 2022–2030.',
 'Ripe aromas fig, blackberry cassis softened sweetened slathering oaky chocolate vanilla. This full, layered, intense cushioned palate, rich flavors chocolaty black fruits baking spices. A toasty, everlasting finish heady ideally balanced. Drink 2023.',
 'Mac Watson honors memory wine made mother tremendously delicious, balanced complex botrytised white. Dark gold color, layers toasted hazelnut, pear compote orange peel flavors, reveling succulence 122 g/L residual sugar.',
 "This spent 20 months 30% new French oak, incorporates fruit Ponzi's Aurora, Abetina Madrona vineyards, among others. Aromatic, dense toasty, deftly blends aromas flavors toast, cigar box, blackberry, black cherry, coffee graphite. Tannins polis

In [6]:
df = pd.DataFrame({"Description": corpus, "Points":y})

df

Unnamed: 0,Description,Points
0,This tremendous 100% varietal wine hails Oakvi...,96
1,"Ripe aromas fig, blackberry cassis softened sw...",96
2,Mac Watson honors memory wine made mother trem...,96
3,"This spent 20 months 30% new French oak, incor...",96
4,"This top wine La Bégude, named highest point v...",95
...,...,...
150925,Many people feel Fiano represents southern Ita...,91
150926,"Offers intriguing nose ginger, lime floral ele...",91
150927,This classic example comes cru vineyard called...,91
150928,"A perfect salmon shade, scents peaches, cherri...",90


In [7]:
bins = [80, 84, 90, 94, 100]
labels = [1, 2, 3, 4]


df['Points_Class'] = pd.cut(df['Points'], bins=bins, labels=labels, include_lowest=True)

df

Unnamed: 0,Description,Points,Points_Class
0,This tremendous 100% varietal wine hails Oakvi...,96,4
1,"Ripe aromas fig, blackberry cassis softened sw...",96,4
2,Mac Watson honors memory wine made mother trem...,96,4
3,"This spent 20 months 30% new French oak, incor...",96,4
4,"This top wine La Bégude, named highest point v...",95,4
...,...,...,...
150925,Many people feel Fiano represents southern Ita...,91,3
150926,"Offers intriguing nose ginger, lime floral ele...",91,3
150927,This classic example comes cru vineyard called...,91,3
150928,"A perfect salmon shade, scents peaches, cherri...",90,2


In [8]:
p1=len(df[df['Points_Class'] == 1])/len(df)
p2=len(df[df['Points_Class'] == 2])/len(df)
p3=len(df[df['Points_Class'] == 3])/len(df)
p4=len(df[df['Points_Class'] == 4])/len(df)

print(p1)
print(p2)
print(p3)
print(p4)


0.1536937653216723
0.6327171536473862
0.19383820314052871
0.019750877890412775


In [9]:
from nltk.tokenize import word_tokenize
from collections import Counter

In [10]:
def tokenize_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha()]
    return tokens

bag_of_words_per_class = {}

In [11]:
for points_class in df['Points_Class'].unique():
    class_df = df[df['Points_Class'] == points_class]
    class_descriptions = ' '.join(class_df['Description'])
    class_tokens = tokenize_text(class_descriptions)
    bag_of_words_per_class[points_class] = Counter(class_tokens)




In [12]:
for points_class, bag_of_words in bag_of_words_per_class.items():
    print(f"Points Class {points_class} Bag of Words:")
    print(bag_of_words)
    print()

Points Class 4 Bag of Words:
Counter({'wine': 2795, 'the': 1612, 'it': 1384, 'fruit': 1299, 'flavors': 1291, 'this': 1081, 'tannins': 994, 'black': 821, 'rich': 814, 'years': 808, 'acidity': 769, 'a': 701, 'ripe': 683, 'drink': 649, 'great': 586, 'cabernet': 585, 'finish': 543, 'spice': 541, 'cherry': 524, 'oak': 506, 'sweet': 502, 'chocolate': 468, 'dark': 449, 'dry': 429, 'structure': 427, 'palate': 421, 'aromas': 417, 'complex': 414, 'now': 410, 'fruits': 408, 'shows': 402, 'age': 381, 'long': 371, 'blackberry': 368, 'one': 362, 'there': 348, 'dense': 342, 'vineyard': 340, 'vintage': 332, 'barrel': 332, 'notes': 331, 'richness': 330, 'yet': 324, 'delicious': 321, 'fine': 319, 'best': 317, 'powerful': 310, 'red': 300, 'aging': 298, 'concentrated': 298, 'texture': 289, 'firm': 283, 'cassis': 282, 'new': 282, 'power': 282, 'well': 279, 'cherries': 276, 'sample': 275, 'balance': 265, 'blend': 258, 'pinot': 258, 'beautiful': 256, 'toast': 255, 'balanced': 253, 'cola': 248, 'smoky': 246, 

In [13]:
def create_binary_bag_of_words(documents):
    vocabulary = set()
    binary_bow = []

    # build vocabulary
    for doc in documents:
        words = doc.split()
        for word in words:
            vocabulary.add(word)

    # create binary bag of words
    for doc in documents:
        binary_vector = []
        words = doc.split()
        for word in vocabulary:
            if word in words:
                binary_vector.append(1)
            else:
                binary_vector.append(0)
        binary_bow.append(binary_vector)

    return binary_bow, list(vocabulary)

In [14]:
df2 = pd.DataFrame({"Text": corpus, "Points":df["Points_Class"]})


In [15]:
df2

Unnamed: 0,Text,Points
0,This tremendous 100% varietal wine hails Oakvi...,4
1,"Ripe aromas fig, blackberry cassis softened sw...",4
2,Mac Watson honors memory wine made mother trem...,4
3,"This spent 20 months 30% new French oak, incor...",4
4,"This top wine La Bégude, named highest point v...",4
...,...,...
150925,Many people feel Fiano represents southern Ita...,3
150926,"Offers intriguing nose ginger, lime floral ele...",3
150927,This classic example comes cru vineyard called...,3
150928,"A perfect salmon shade, scents peaches, cherri...",2


In [16]:
X_train, X_test, y_train, y_test = train_test_split(df2["Text"], df2["Points"], test_size=0.2, random_state=20454593)

In [17]:
binary_bag_train, vocabulary_train = create_binary_bag_of_words(X_train)
X_train = binary_bag_train

In [18]:
one = np.count_nonzero(y_train == 1) / len(y_train)
two = np.count_nonzero(y_train == 2) / len(y_train)
three = np.count_nonzero(y_train == 3) / len(y_train)
four = np.count_nonzero(y_train == 4) / len(y_train)

print(one, two, three, four)

0.15364738620552573 0.6324123765984231 0.1940800371032929 0.019860200092758233


In [19]:
V_size = len(vocabulary_train)
V_size

70951

In [20]:
total_one_words = 0
total_two_words = 0
total_three_words = 0
total_four_words = 0


for i in range(len(X_train)):
    if y_train.iloc[i] == 1:
        for number in X_train[i]:
            total_one_words += number
    elif y_train.iloc[i] == 2:
        for number in X_train[i]:
            total_two_words += number
    elif y_train.iloc[i] == 3:
        for number in X_train[i]:
            total_three_words += number
    elif y_train.iloc[i] == 4:
        for number in X_train[i]:
            total_four_words += number

print(total_one_words)
print(total_two_words)
print(total_three_words)
print(total_four_words)

370362
1952574
713473
83637


In [21]:
one_prob = {}
two_prob = {}
three_prob = {}
four_prob = {}

for i in range(len(X_train)):
    if y_train.iloc[i] == 1:
        # print("one")
        for j in range(len(X_train[i])):
            if X_train[i][j] == 1:
                if vocabulary_train[j] not in one_prob:
                    one_prob[vocabulary_train[j]] = 2/(total_one_words+(1*V_size))
                else:
                    one_prob[vocabulary_train[j]] = one_prob[vocabulary_train[j]] + 1/(total_one_words+(1*V_size))
                
                if vocabulary_train[j] not in two_prob:
                    two_prob[vocabulary_train[j]] = 1/(total_two_words+(1*V_size))
                if vocabulary_train[j] not in three_prob:
                    three_prob[vocabulary_train[j]] = 1/(total_three_words+(1*V_size))
                if vocabulary_train[j] not in four_prob:
                    four_prob[vocabulary_train[j]] = 1/(total_four_words+(1*V_size))
                

    elif y_train.iloc[i] == 2:
        for j in range(len(X_train[i])):
            # print("two")
            if X_train[i][j] == 1:
                if vocabulary_train[j] not in two_prob:
                    two_prob[vocabulary_train[j]] = 2/(total_two_words+(1*V_size))
                else:
                    two_prob[vocabulary_train[j]] = two_prob[vocabulary_train[j]] + 1/(total_two_words+(1*V_size))
                
                if vocabulary_train[j] not in one_prob:
                    one_prob[vocabulary_train[j]] = 1/(total_one_words+(1*V_size))
                if vocabulary_train[j] not in three_prob:
                    three_prob[vocabulary_train[j]] = 1/(total_three_words+(1*V_size))
                if vocabulary_train[j] not in four_prob:
                    four_prob[vocabulary_train[j]] = 1/(total_four_words+(1*V_size))

    elif y_train.iloc[i] == 3:
        for j in range(len(X_train[i])):
            # print("three")
            if X_train[i][j] == 1:
                if vocabulary_train[j] not in three_prob:
                    three_prob[vocabulary_train[j]] = 2/(total_three_words+(1*V_size))
                else:
                    three_prob[vocabulary_train[j]] = three_prob[vocabulary_train[j]] + 1/(total_three_words+(1*V_size))
                
                if vocabulary_train[j] not in one_prob:
                    one_prob[vocabulary_train[j]] = 1/(total_one_words+(1*V_size))
                if vocabulary_train[j] not in two_prob:
                    two_prob[vocabulary_train[j]] = 1/(total_two_words+(1*V_size))
                if vocabulary_train[j] not in four_prob:
                    four_prob[vocabulary_train[j]] = 1/(total_four_words+(1*V_size))

    else:
        for j in range(len(X_train[i])):
            # print("four")
            if X_train[i][j] == 1:
                if vocabulary_train[j] not in four_prob:
                    four_prob[vocabulary_train[j]] = 2/(total_four_words+(1*V_size))
                else:
                    four_prob[vocabulary_train[j]] = four_prob[vocabulary_train[j]] + 1/(total_four_words+(1*V_size))
                
                if vocabulary_train[j] not in one_prob:
                    one_prob[vocabulary_train[j]] = 1/(total_one_words+(1*V_size))
                if vocabulary_train[j] not in two_prob:
                    two_prob[vocabulary_train[j]] = 1/(total_two_words+(1*V_size))
                if vocabulary_train[j] not in three_prob:
                    three_prob[vocabulary_train[j]] = 1/(total_three_words+(1*V_size))


In [22]:
mn = max(one_prob.values())
print(mn)

for k,v in one_prob.items():
    if v == mn:
        print(k, v)

0.014567891723109808
flavors 0.014567891723109808


In [23]:
mn = max(two_prob.values())
print(mn)

for k,v in two_prob.items():
    if v == mn:
        print(k, v)

0.013079650609695435
wine 0.013079650609695435


In [24]:
mn = max(three_prob.values())
print(mn)

for k,v in three_prob.items():
    if v == mn:
        print(k, v)

0.01273418457365138
wine 0.01273418457365138


In [25]:
mn = max(four_prob.values())
print(mn)

for k,v in four_prob.items():
    if v == mn:
        print(k, v)

0.007930757885476335
wine 0.007930757885476335


In [29]:
test_df = pd.DataFrame({"Text": X_test, "Points":y_test}).reset_index()
predictions = []
for row in test_df['Text']:
    one_prob = np.log(one)
    two_prob = np.log(two)
    three_prob = np.log(three)
    four_prob = np.log(four)

    # pos = Prob_pos
    # neg = Prob_neg
    for word in row.split():
        if word in vocabulary_train:
            one_prob += np.log(one[word])
            two_prob += np.log(two[word])
            three_prob += np.log(three[word])
            four_prob += np.log(four[word])
            # pos *= pos_prob[word]
            # neg *= neg_prob[word]
    if one_prob > two_prob and one_prob > three_prob and one_prob > four_prob:
        predictions.append(1)


TypeError: 'float' object is not subscriptable

In [27]:
def create_metrics(actual, predicted):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    for i in range(len(actual)):
        if actual[i] == -1:
            if actual[i] == predicted[i]:
                TP += 1
            else:
                FN += 1
        elif actual[i] == 1:
            if actual[i] == predicted[i]:
                TN += 1
            else:
                FP += 1
    recall = TP/(TP+FN)
    specificity = TN/(TN+FP)
    precision = TP/(TP+FP)
    negative_predictive_value = TN/(TN+FN)
    accuracy = (TP+TN)/(TN+TP+FP+FN)
    F_score= 2*((recall*precision)/(recall+precision))

    return TP, TN, FP, FN, recall, specificity, precision, negative_predictive_value, accuracy, F_score