In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Remote locations of the white and red wine-quality CSV files (UCI repository)
_UCI_WINE_DIR = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/"
csv_file_white = _UCI_WINE_DIR + "winequality-white.csv"
csv_file_red = _UCI_WINE_DIR + "winequality-red.csv"

In [None]:
# Load the semicolon-delimited white-wine CSV and tag every row with color "white"
df = pd.read_csv(csv_file_white, sep=";").assign(color="white")

# Preview the first 5 rows of the dataframe
df.head()

In [None]:
# Load the semicolon-delimited red-wine CSV and tag every row with color "red"
dfred = pd.read_csv(csv_file_red, sep=";").assign(color="red")

# Preview the first 5 rows of the dataframe
dfred.head()

In [None]:
# Stack the white and the red wine rows into one dataframe with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0, where the old call raises AttributeError.
# NOTE: this cell rebinds `df`, so re-running it on its own stacks the red rows
# a second time; re-run from the white-wine load cell instead.
df = pd.concat([df, dfred], ignore_index=True)

In [None]:
# Get descriptive statistics for the merged (white + red) dataframe.
# Left as the last expression so the notebook renders the resulting table.
df.describe()

In [None]:
# Convert "color" to the pandas category dtype ahead of the dummy encoding done later on
df['color'] = df['color'].astype('category')
# Print column dtypes and non-null counts to check the dataset
df.info()

In [None]:
# Check the pairwise correlation between the numeric variables.
# The categorical "color" column is excluded explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on the "white"/"red" strings instead.
cor = df.select_dtypes(include="number").corr()
# Draw the annotated heatmap on an explicit axes rather than the implicit current figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cor, annot=True, cmap=plt.cm.Blues, ax=ax)
plt.show()

In [None]:
# Make 0/1 dummy columns for "color" so it can be used in a regression.
# dtype is pinned to uint8 (the pre-2.0 pandas default): pandas >= 2.0 would
# otherwise return booleans, which describe() leaves out of its summary and
# to_csv() writes as True/False.
dfDummies = pd.get_dummies(df['color'], prefix='color', dtype='uint8')
# Drop dummy columns left over from an earlier run of this cell first, so that
# re-running it does not create duplicate "color_*" columns.
df = pd.concat([df.drop(columns=dfDummies.columns, errors='ignore'), dfDummies], axis=1)

In [None]:
# Descriptive statistics of the dataframe after the "color" dummy columns were concatenated
df.describe()

In [None]:
# Get some idea of the data: fixed acidity per quality score, coloured by the
# "color_white" dummy.
# catplot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) only left an empty stray figure behind.
# The intended 10x6 size is passed through height/aspect instead.
sns.catplot(
    x="quality",
    y="fixed acidity",
    hue="color_white",
    kind="swarm",
    data=df,
    height=6,
    aspect=10 / 6,
)

In [None]:
# Swarm plot of volatile acidity against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="volatile acidity",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of citric acid against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="citric acid",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of residual sugar against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="residual sugar",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of chlorides against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="chlorides",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of free sulfur dioxide against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="free sulfur dioxide",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of total sulfur dioxide against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="total sulfur dioxide",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of density against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="density",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of pH against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="pH",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of sulphates against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="sulphates",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Swarm plot of alcohol against quality, coloured by the "color_white" dummy
sns.catplot(
    data=df,
    x="quality",
    y="alcohol",
    hue="color_white",
    kind="swarm",
)

In [None]:
# Write the combined dataframe to winedata.csv in the current working directory
# (to_csv's default index=True means the row index is written as the first column)
df.to_csv("winedata.csv")