In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import vega_datasets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from ipywidgets import Text, IntSlider, Dropdown

In [3]:
# Read the complete review table from disk.
# NOTE(review): this is an absolute path into one user's home directory, so the
# notebook only runs on that machine -- consider a path relative to the notebook.
df = pd.read_csv("/Users/marijkekalinowski/Desktop/SciPy/SciPy_Project/winemag_data_130k_v2.csv")

# Narrow the table to the five columns listed below and discard every row that
# has a missing value in any of them.
df_short = df.loc[:, ["country", "price", "variety", "points", "province"]].dropna()

In [4]:
def to_list(df, category, min_size):
    '''
    Collects the values of one column that occur often enough to be offered
    as options in a dropdown menu.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the column ``category``.
    category : str
        Name of the column whose distinct values are counted.
    min_size : int
        Minimum number of rows a value needs in order to be kept.

    Returns
    -------
    list
        The distinct values of ``category`` that appear in at least
        ``min_size`` rows, in the key order produced by ``groupby``.
    '''
    # Number of rows per distinct value of the chosen column.
    rows_per_value = df.groupby([category]).size()

    # Keep only the values that reach the requested minimum group size.
    frequent = rows_per_value[rows_per_value >= min_size]

    return frequent.index.tolist()

In [5]:
# Dropdown options: only values with at least `min_size` rows in df_short are offered.
countries = to_list(df=df_short, category="country", min_size=10)
varieties = to_list(df=df_short, category="variety", min_size=1000)
provinces = to_list(df=df_short, category="province", min_size=750)

In [27]:
def choose_data(df, category, choice):
    '''
    Restricts the data set to the rows that belong to the chosen value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is left unchanged.
    category : str
        Name of the column to compare against ``choice``.
    choice : scalar
        Value a row must have in ``category`` to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``category`` entry equals ``choice``,
        with their original index labels.
    '''
    matches = df[category] == choice
    return df.loc[matches]

In [28]:
def count_words(df, words=None):
    '''
    Counts the appearance of characteristic words in the descriptions.

    The text in ``df["description"]`` is lower-cased, stripped of punctuation
    and split on whitespace; the occurrences of every requested word are then
    totalled over all rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``"description"`` column holding the review texts.
        The frame itself is not modified.
    words : sequence of str, optional
        Words to report. When omitted, the fifteen flavour words listed in
        the function body are used.

    Returns
    -------
    pandas.Series
        Number of occurrences per word, indexed by the words in the order
        they were requested. A word that never occurs is reported as 0.
    '''
    if words is None:
        words = ["fruit", "acidity", "tannins", "cherry", "ripe", "spice", "rich", "fresh",
                 "berry", "plum", "soft", "apple", "blackberry", "sweet", "citrus"]

    # Keep the cleaned text in a local Series instead of writing a helper column
    # into `df`: the caller's frame stays untouched, and no SettingWithCopyWarning
    # can arise, so the global pandas warning option does not have to be changed.
    cleaned = df["description"].str.lower().str.replace(r'[^\w\s]+', '', regex=True)

    # One row per word, then count how often each distinct word appears.
    counted_words = cleaned.str.split().explode().value_counts()

    # reindex (not .loc) so a word that is absent from this subset yields 0
    # instead of raising a KeyError.
    return counted_words.reindex(list(words), fill_value=0)

In [29]:
def plot_word_freqs(df, category, choice):
    '''
    Draws a bar chart of the characteristic-word counts for one selection.

    The rows of ``df`` whose ``category`` column equals ``choice`` are picked
    with ``choose_data``, their descriptions are counted with ``count_words``
    and the resulting counts are shown as bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Table handed on to ``choose_data``.
    category : str
        Column used for the selection (e.g. "country").
    choice : scalar
        Value of ``category`` whose rows are plotted.
    '''
    word_counts = count_words(choose_data(df, category, choice))
    word_counts.plot(kind="bar", title='Wich is the most used word?', ylabel='Frequency')
    plt.show()
    
# Wrap the plot in an ipywidgets `interactive` so that choosing another country
# in the dropdown redraws the chart right away; `df` and the category are held fixed.
interactive(
    plot_word_freqs,
    df=fixed(df),
    category=fixed("country"),
    choice=Dropdown(description='Country:', options=countries, disabled=False),
)

interactive(children=(Dropdown(description='Country:', options=('Argentina', 'Australia', 'Austria', 'Brazil',…

With this plot we can compare how often each characteristic word is used to describe the wines from different countries.