# Drawing Conclusions Using Groupby

In [3]:
#Import packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
% matplotlib inline

In [4]:
# Load `winequality_edited.csv`
wine_df = pd.read_csv('winequality_edited.csv')
wine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
fixed_acidity           6497 non-null float64
volatile_acidity        6497 non-null float64
citric_acid             6497 non-null float64
residual_sugar          6497 non-null float64
chlorides               6497 non-null float64
free_sulfur_dioxide     6497 non-null float64
total_sulfur_dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
color                   6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 659.9+ KB


### Is a certain type of wine associated with higher quality?

In [5]:
# Find the mean quality of each wine type (red and white) with groupby
wine_df.groupby('color')['quality'].mean()

color
red      5.636023
white    5.877909
Name: quality, dtype: float64

### What level of acidity receives the highest average rating?

In [7]:
# View the min, 25%, 50%, 75%, max pH values with Pandas describe
# High: Lowest 25% of pH values
# Moderately High: 25% - 50% of pH values
# Medium: 50% - 75% of pH values
# Low: 75% - max pH value
wine_df.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,5.818378
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [8]:
# Bin edges that will be used to "cut" the data into groups
bin_edges = [ 2.72, 3.11,3.21 , 3.32,4.01 ] # Fill in this list with five values you just found

In [9]:
# Labels for the four acidity level groups
bin_names = [ 'High','Moderately High' ,'Meduim' ,'Low' ] # Name each acidity level category

In [10]:
# Creates acidity_levels column
wine_df['acidity_levels'] = pd.cut(wine_df['pH'], bin_edges, labels=bin_names)

# Checks for successful creation of this column
wine_df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color,acidity_levels
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,Moderately High
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,Meduim
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,Moderately High
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,Low


In [11]:
# Find the mean quality of each acidity level with groupby
wine_df.groupby('acidity_levels').mean()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
acidity_levels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
High,7.543914,0.294683,0.370792,7.088876,0.055131,33.179965,129.897496,0.994708,3.029062,0.503937,10.330208,5.783343
Moderately High,7.365064,0.318551,0.340548,5.931984,0.054666,33.229154,126.815886,0.994697,3.164833,0.5093,10.391073,5.78454
Meduim,7.143566,0.346751,0.313585,4.721159,0.055715,28.983995,111.182138,0.994476,3.26701,0.541287,10.610369,5.850832
Low,6.769949,0.403815,0.243901,3.848983,0.058777,26.32751,93.244917,0.994899,3.433348,0.574136,10.656057,5.859593


In [13]:
# Save changes for the next section
wine_df.to_csv('winequality_edited.csv', index=False)

## QUIZ Q&A

### Is the mean quality of red wine greater than, less than, or equal to that of white wine?
Less

### What level of acidity receives the highest average rating?
Low acidity 