# Data Visualization Notebook for Vivino Wine Project

In [None]:
# Install modules 
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read data file
If using this notebook in IBM Cloud, replace the code below with the code fragment provided by IBM Cloud for reading a file from the cloud storage.

Otherwise, replace the directory path in the file name below with the one where your data file is stored

In [None]:
# Name of the csv file holding the wine data (adjust the path to where your copy is stored)
DATA_FILE = 'vivino_data_00-4_FINAL.csv'

# Load the csv file into a DataFrame and print a summary of its columns
df = pd.read_csv(DATA_FILE)
df.info()

## 1.1 Set size of canvas for displaying plots + set up default geo_region sort order for plotting

In [None]:
# Default canvas size applied to every plot drawn after this cell
FIG_WIDTH, FIG_HEIGHT = 15, 10
sns.set(rc={"figure.figsize": (FIG_WIDTH, FIG_HEIGHT)})

# Use the geo_region values as the dataframe index (the geo_region column itself is kept)
df.index = df['geo_region']

# Order in which the plots lay out the geographic regions
order_geo_region = [
    'W Europe',
    'S Europe',
    'C/E Europe',
    'Oceania',
    'N America',
    'S America',
    'Africa',
    'C/W Asia',
    'E Asia',
]

## 1.2 Set 1 -- Pie Plots
This pie plot was not used in the end because it needed too much time spent tweaking the chart. It is kept in this notebook to show how to create pie plots with Matplotlib's `plt.pie` (Seaborn itself has no pie plot function). How to get the text labels set up correctly, etc. is left as an item for you to explore / investigate.

In [None]:
## 1.2 Set 1 -- Pie Plots
# Nested pie chart: the inner ring shows the share of wines per geo_region,
# the outer ring shows the share of wines per country within each geo_region.

# Set pie plot colours
cmap = plt.get_cmap("tab20c")
outer_colors = cmap(np.array([11,26,7,7,1,12,14,9,0]))  # 9 colours; used by both rings below
inner_colors = cmap(np.array([4,7]))                     # defined but not used by either ring

# Count the wines per geo_region IN THE SAME ORDER as order_geo_region.
# The labels of the inner ring are taken from that list, so the counts must follow it too;
# counting via groupby() would sort the regions alphabetically and mislabel the wedges.
# A region that does not occur in the data gets a count of 0 instead of NaN.
count_geo_region = df['geo_region'].value_counts().reindex(order_geo_region, fill_value=0)

# Count the wines per country, region by region, following the same region order.
# Within each region the countries are sorted by descending count (value_counts default).
# NOTE(review): the outer wedges only line up exactly with their region's inner wedge
# when no row has a missing country -- confirm against the data.
count_country = pd.concat(
    [df.loc[(df['geo_region'] == region).to_numpy(), 'country'].value_counts()
     for region in order_geo_region]
)

# This plots the outer circle (countries)
plt.pie(count_country, startangle = 270, colors = outer_colors, counterclock = False,
        radius = 1.0, labeldistance = 1.2,
        textprops ={ 'fontweight': 'bold','fontsize':10},
        wedgeprops = { 'linewidth' : 5,'edgecolor' : 'white' })

# This plots the inner circle (geo_regions); labels come from the index of the counts,
# so every wedge is guaranteed to carry the name of the region it was computed from
plt.pie(count_geo_region, startangle = 270, colors = outer_colors, counterclock = False,
        radius = 0.8, labeldistance = 0.5, labels = count_geo_region.index, autopct = '%.1f%%',
        textprops ={ 'fontweight': 'bold','fontsize':12},
        wedgeprops = { 'linewidth' : 5,'edgecolor' : 'white' })

# This sets up the donut hole
center_circle = plt.Circle((0,0), 0.2, color="white")
fig = plt.gcf()
fig.gca().add_artist(center_circle)

plt.axis('equal')  # equal aspect ratio
plt.tight_layout()

## 1.2 Set 2 -- Bar Plots

In [None]:
# (1.2) Set 2 -- Bar Plots (geo_region level)
# Hat-tip to Niithiya for the tip on declaring the Dataframe index in Step 1.1 to get this going

# Two panels stacked vertically: wine_rating on top, price underneath
fig, axes = plt.subplots(nrows=2, ncols=1)

# Order in which the wine types are drawn within each region
hue_order = ['Red', 'White']

# Arguments shared by both bar plots: regions on the x axis, bars split by wine_type,
# no confidence-interval bars
shared_bar_args = dict(x=df.index, hue=df['wine_type'], hue_order=hue_order,
                       order=order_geo_region, ci=None)

# Top panel -- fix the y axis tick marks and scale, then plot geo_region vs wine_rating
rating_ax = axes[0]
rating_ax.set_yticks(list(range(0, 6)))
rating_ax.set_ylim(bottom=0, top=6)
rating_bars = sns.barplot(y=df['wine_rating'], palette='OrRd_r', ax=rating_ax, **shared_bar_args)
rating_bars.set(title='Top: Wine Rating by Geographic Region\nBelow: Price by Geographic Region')

# Bottom panel -- fix the y axis tick marks and scale, then plot geo_region vs price
price_ax = axes[1]
price_ax.set_yticks(list(range(0, 121, 20)))
price_ax.set_ylim(bottom=0, top=140)
sns.barplot(y=df['price'], palette='GnBu_r', ax=price_ax, **shared_bar_args)

## 1.2 Set 3 -- Box Plots (geo_region level)
Notice how the geo_regions are ordered -- as per the configured default order in Step 1.1

In [None]:
# (1.2) Set 3 -- Box Plots (geo_region level)
# Two panels stacked vertically: wine_rating on top, price underneath
fig, axes = plt.subplots(nrows=2, ncols=1)

# Order in which the wine types are drawn within each region
hue_order = ['Red', 'White']

# Arguments shared by both box plots: regions on the x axis, boxes split by wine_type
shared_box_args = dict(x=df.index, hue=df['wine_type'], hue_order=hue_order,
                       order=order_geo_region)

# Top panel: distribution of wine_rating per geo_region, with the title for the whole figure
graph = sns.boxplot(y=df['wine_rating'], palette='OrRd_r', ax=axes[0], **shared_box_args)
graph.set(title='Top: Wine Rating by Geographic Region\nBelow: Price by Geographic Region')

# Bottom panel: distribution of price per geo_region
graph = sns.boxplot(y=df['price'], palette='GnBu_r', ax=axes[1], **shared_box_args)

## 1.2 Set 4 -- Box Plots (country level)

In [None]:
# (1.2) Set 4 -- Box Plots (country level)
# Set up plot layout in the canvas (2 rows, 1 column)
fig, axes = plt.subplots(2,1)

# This sets up the hue order. It is passed to both boxplots below so that Red is always
# drawn (and coloured) before White, regardless of which type appears first in the data
hue_order=['Red', 'White']

# Set up a new dataframe with the rows where geo_region == S Europe or C/E Europe.
# .copy() makes df1 independent of df, so re-indexing it below cannot touch df
# We'll use this dataframe to plot the boxplots for the countries in these 2 geo_regions
df1 = df[df.geo_region.isin(['S Europe', 'C/E Europe'])].copy()
df1.index = df1['country']

# Plot the boxplot for wine_ratings for each country in these two geo_regions
graph = sns.boxplot(x=df1.index, y=df1['wine_rating'], hue=df1['wine_type'], hue_order=hue_order,
                    palette='OrRd_r', ax=axes[0])
graph.set(title='Top: Wine Rating by Country (Europe)\nBelow: Price by Country (Europe)')

# Draw horizontal lines showing the wine_rating means over ALL rows of df (not just df1):
# green = all wines, red = Red wines, yellow = White wines
graph.axhline(df['wine_rating'].mean(), ls='--', c='g')
graph.axhline(df['wine_rating'][(df.wine_type=='Red')].mean(), ls='--', c='r')
graph.axhline(df['wine_rating'][(df.wine_type=='White')].mean(), ls='--', c='y')

# Plot the boxplots for price for each country in these two regions
graph = sns.boxplot(x=df1.index, y=df1['price'], hue=df1['wine_type'], hue_order=hue_order,
                    palette='GnBu_r', ax=axes[1])

# Draw horizontal lines showing the price means over ALL rows of df (not just df1):
# red = all wines, blue = Red wines, green = White wines
graph.axhline(df['price'].mean(), ls='--', c='r')
graph.axhline(df['price'][(df.wine_type=='Red')].mean(), ls='--', c='b')
graph.axhline(df['price'][(df.wine_type=='White')].mean(), ls='--', c='g')

## Exercise -- Draw the wine_rating and price boxplots for other countries outside Europe
Hint: Just look at the code in Step 1.2 Set 4 (the country-level box plots above) and adjust accordingly. For the list of other geo_regions, look at the order_geo_region list declared in Step 1.1

## 1.2 Set 5 -- Heatmap Plots (Correlation Maps)

In [None]:
# (1.2) Set 5 -- Heatmap Plots (Correlation maps)
# Leave the 'year' column out of the correlation maps.
# errors='ignore' plus re-assignment (instead of inplace=True) keeps this cell re-runnable:
# running it a second time no longer fails because 'year' is already gone.
df = df.drop(columns='year', errors='ignore')

# Set up plot layout in the canvas (1 row, 2 columns)
fig, axes = plt.subplots(1,2)

# Set up dataframe to shortlist countries in the Europe geo_regions
df1 = df[df.geo_region.isin(['W Europe', 'S Europe', 'C/E Europe'])]

# Plot the heatmap.
# Only numeric/boolean columns are correlated: text columns such as wine_type cannot be
# correlated, and pandas 2.0+ raises an error if they are handed to corr()
sns.heatmap(df1.select_dtypes(include=['number', 'bool']).corr(), cmap="RdYlBu_r", annot=True,
            square=True, annot_kws={'size': 10}, ax=axes[0],
            cbar_kws={'shrink': 0.7}).set(title='Correlation Heat Map (Europe)\n')

# Set up dataframe to shortlist countries in the non-European geo_regions
# NOTE(review): 'E Asia' is in neither selection -- confirm that leaving it out is intended
df2 = df[df.geo_region.isin(['N America', 'S America', 'Oceania', 'Africa', 'C/W Asia'])]

# Plot the heatmap (same numeric/boolean column restriction as above)
sns.heatmap(df2.select_dtypes(include=['number', 'bool']).corr(), cmap="RdBu_r", annot=True,
            square=True, annot_kws={'size': 10}, ax=axes[1],
            cbar_kws={'shrink': 0.7}).set(title='Correlation Heat Map (Outside Europe)\n')

## Exercise -- Plot Heatmaps for the other correlations, e.g. Red wines in Europe, etc. Have fun with various permutations of filters.
Hint: You need to learn the syntax for selecting data rows from a Pandas DataFrame, based on some selection criteria in some columns. Examples below.

In [None]:
# EXAMPLES
# Each condition on a column produces a True/False mask with one entry per row.
# Masks can be named, combined with & (and) / | (or), and used inside df[...] to select rows.

# Total no. of rows in DataFrame:
print ('Total no. of rows in Dataframe df =', len(df))

# Example 1 -- a single condition: keep only the red wines
is_red = df.wine_type == 'Red'
df1 = df[is_red]
print ('Total no. of rows in Dataframe df1 (red wines only) =', len(df1))

# Example 2 -- two conditions joined with &: red wines with price > 50
costs_over_50 = df.price > 50
df1 = df[is_red & costs_over_50]
print ('Total no. of rows in Dataframe df1 now (red wines priced > 50) =', len(df1))

# Example 3 -- nested conditions:
# White wines AND EITHER { price > 50 and price <= 110 } OR { grape == Riesling }
# Naming each piece makes the structure & sequence of the conditions easy to follow
is_white = df.wine_type == 'White'
mid_priced = costs_over_50 & (df.price <= 110)
is_riesling = df.grape == 'Riesling'

df1 = df[is_white & (mid_priced | is_riesling)]
print ('Total no. of rows in Dataframe df1 now (White) AND ((50 < price <= 110) or (grape is Riesling)) =', len(df1))


## 1.2 Set 6 -- Scatter Plots

In [None]:
# Scatter plot of wine boldness (light_bold) against price, coloured by wine_type
# A single panel fills the canvas
fig, axes = plt.subplots(nrows=1, ncols=1)

# Order in which the wine types are listed and coloured
hue_order = ['Red', 'White']

# Draw the scatter plot on the panel created above
graph = sns.scatterplot(
    data=df,
    x=df['light_bold'],
    y=df['price'],
    hue=df['wine_type'],
    hue_order=hue_order,
    palette='husl',
    ax=axes,
)
graph.set(title='Wine Boldness - Distribution & Price')

# End of Notebook