In [None]:
## Libraries
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words
import spacy
from spacy.lang.en import stop_words

In [None]:
## Import your dataset and print dataframe.
# NOTE(review): the file name reads like a misspelling of "wine-ratings.csv" —
# confirm against the file on disk before changing it.
df = pd.read_csv('wine-raitngs.csv')
print(df.head())  # head() defaults to the first 5 rows

# Report the size of the freshly loaded frame.
nRow = len(df.index)
nCol = len(df.columns)
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Check for missing data: per column, the number of nulls and the share of rows
# they represent.
missing_df = df.isnull().sum().to_frame(name="Total No. of Missing Values")
missing_df["% of Missing Values"] = (
    missing_df["Total No. of Missing Values"] / len(df) * 100
).round(2)
# Only the last expression of a cell is rendered, so a bare `missing_df` in the
# middle of the cell shows nothing; print it explicitly instead.
print(missing_df)

# Drop every row that has at least one missing value (in any column).
df.dropna(inplace=True)
print(df.head(5))

# Report the size of the frame after the incomplete rows were removed.
nRow, nCol = df.shape
print(f'There are {nRow} rows and {nCol} columns')

In [None]:
# Clean the free-text `notes` column step by step. Each step overwrites
# df['notes'], so the order below matters.

# Normalize encodings: NFKD-decompose the text, then drop every character that
# has no ASCII representation.
df['notes'] = df['notes'].apply(
    lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
)

# Lowercase text
df['notes'] = df['notes'].str.lower()

# Remove punctuation (anything that is neither a word character nor whitespace)
df['notes'] = df['notes'].str.replace(r'[^\w\s]', '', regex=True)

# Remove numbers. The pattern is a raw string so that `\d` reaches the regex
# engine instead of being parsed as an (invalid) string escape.
df['notes'] = df['notes'].str.replace(r'\d+', '', regex=True)

# Remove stopwords.
# On a fresh kernel `stop_words` is the module imported from spacy.lang.en; after
# this line it is the STOP_WORDS collection itself. getattr() with a fallback
# keeps the cell re-runnable: on a second run the name already holds the
# collection, which has no STOP_WORDS attribute, so it is kept as is.
stop_words = getattr(stop_words, 'STOP_WORDS', stop_words)
df['notes'] = df['notes'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

# Preview the cleaned text (last expression of the cell, so it is displayed).
df['notes'].head(5)

In [None]:
# Summarize at least one metadata variable: how many rows carry each value of
# the `variety` column, largest count first.
variety_counts = df['variety'].value_counts(sort=True, ascending=False)
variety_counts

In [None]:
#Plot the 30 most frequent terms in the text.
def to_list(strg_list):
    """Turn a list-looking string into a list of whitespace-separated tokens.

    Leading/trailing square brackets are stripped, every single quote, double
    quote and comma is deleted (not replaced by a space), and the remainder is
    split on runs of whitespace.
    """
    cleaned = strg_list.strip("[]")
    for unwanted in ("'", '"', ","):
        cleaned = cleaned.replace(unwanted, "")
    return cleaned.split()

# Build the word trigrams (n = 3) of every cleaned note and keep them on the
# frame, one list of 3-tuples per row.
df['ngram'] = [list(ngrams(to_list(strg_list), 3)) for strg_list in df['notes']]

# Tally each trigram across all notes.
count_ngram = Counter(flatten(df['ngram']))

# most_common() ranks by frequency. Slicing items() would only return the first
# trigrams encountered, regardless of how often they occur. 30 matches the
# "30 most frequent" asked for in this cell's heading.
print(count_ngram.most_common(30))

In [None]:
# Unigrams bar graph: the 30 most frequent single words across all notes.
# explode() yields one row per token, which avoids the dense
# rows-by-longest-note table that split(expand=True) would allocate first.
unigrams = df['notes'].str.split().explode().value_counts().head(30)

ax = unigrams.plot(kind='barh')
ax.invert_yaxis()  # barh draws the first entry at the bottom; put the most frequent term on top
ax.set(title='30 most frequent terms', xlabel='Count', ylabel='Term');

In [None]:
# Create a word cloud of the text, restricted to the rows whose `region`
# contains "California".
is_california = df['region'].str.contains("California")
df_wine = df[is_california]

# Concatenate the selected notes into one space-separated string, which is the
# input the word-cloud generator is given below.
wine = df_wine['notes'].str.cat(sep=' ')

wc = WordCloud().generate(wine)

# Render the cloud as an image with the axes hidden.
plt.imshow(wc)
plt.axis("off")
plt.show()