Import Libraries

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
import statsmodels.api as sm

In [None]:
# Read the wine-quality CSV into the working DataFrame `data`.
# The path is relative, so the file must be in the current working directory.
data = pd.read_csv(filepath_or_buffer='WineQT.csv')

In [None]:
# Preview the top of the table; DataFrame.head() returns the first 5 rows
# by default. A single print with a newline separator emits the header and
# the preview on consecutive lines.
print("First few rows of the dataset:", data.head(), sep="\n")

In [None]:
# Summarize the schema of the dataset (column names, non-null counts, dtypes).
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print(); buf=None selects the default output stream (stdout).
print("\nDataset Info:")
data.info(buf=None)

In [None]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for
# the numeric columns of the dataset.
print("\nStatistical Summary of the Dataset:", data.describe(), sep="\n")

Data Cleaning and Preprocessing

In [None]:
# Count missing entries per column. `missing_values` is a Series indexed by
# column name; a value of 0 means the column has no NaN/None cells.
missing_values = data.isna().sum()
print("\nChecking for missing values:", missing_values, sep="\n")

In [None]:
# Count rows that are exact repeats of an earlier row. DataFrame.duplicated()
# flags every occurrence after the first, so summing the boolean mask gives
# the number of redundant rows.
duplicate_rows = data.duplicated().sum()
print(
    "\nChecking for duplicate rows:",
    f"Number of duplicate rows: {duplicate_rows}",
    sep="\n",
)

In [None]:
# Count the distinct values in the 'Id' column. If this equals the row count,
# 'Id' is a pure row identifier and carries no predictive information.
unique_id = data.loc[:, 'Id'].nunique()
print(f"Number of unique id: {unique_id}")

In [None]:
# Remove the 'Id' column: it only identifies rows and should not be analysed
# as a feature. drop() returns a new frame, which is rebound to `data`.
data = data.drop(labels='Id', axis='columns')

In [None]:
# Replace the column labels with snake_case names.
# The assignment is positional: the i-th name below is given to the i-th
# existing column, so the order must match the frame's column order, and the
# list length must equal the number of columns (pandas raises otherwise).
new_column_names = [
    'fixed_acidity',
    'volatile_acidity',
    'citric_acid',
    'residual_sugar',
    'chlorides',
    'free_sulfur_dioxide',
    'total_sulfur_dioxide',
    'density',
    'ph',
    'sulphates',
    'alcohol',
    'quality',
]
data.columns = new_column_names

In [None]:
# Draw one horizontal box plot per feature on a shared 3x4 grid so that
# outliers can be inspected visually.
# `data.columns[:-1]` skips the last column, which is 'quality' after the
# rename above.
plt.figure(figsize=(18, 10))
plt.subplots_adjust(hspace=0.5)  # extra vertical room so titles do not overlap
for position, column in enumerate(data.columns[:-1], start=1):
    grid_axes = plt.subplot(3, 4, position)
    sns.boxplot(x=data[column], ax=grid_axes)
    grid_axes.set_title(f'Box plot of {column}')
plt.show()

In [None]:
# Flag outliers with the IQR rule (Tukey's fences).
# Quartiles are computed per column over the whole frame, so every column
# currently in `data` (including 'quality') is checked.
Q1 = data.quantile(0.25)
Q3 = data.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR outside the
# interquartile range of its column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean frame with the same shape as `data`: True marks an outlying cell.
outliers_iqr = data.lt(lower_fence) | data.gt(upper_fence)

print("Outliers based on IQR method:")
print(outliers_iqr)


In [None]:
# Cap each feature at its own 1st and 99th percentiles: values beyond the
# caps are replaced by the cap, so no rows are dropped.
# `data.columns[:-1]` skips the last column, which is 'quality' at this point.
for column in data.columns[:-1]:
    lower_cap, upper_cap = data[column].quantile([0.01, 0.99]).values
    data[column] = data[column].clip(lower=lower_cap, upper=upper_cap)

Data Transformation

In [None]:
# Engineer 'free_sulfur_dioxide_ratio': the share of total SO2 that is free.
# The new column is appended at the end of the frame, i.e. after 'quality',
# so 'quality' is no longer the last column from here on.
data['free_sulfur_dioxide_ratio'] = data['free_sulfur_dioxide'].div(
    data['total_sulfur_dioxide']
)

Data Aggregation

In [None]:
# Average every feature within each quality rating. The result has one row
# per distinct 'quality' value and one column per remaining feature.
quality_summary = data.groupby(by='quality').mean()
print("\nData aggregated by wine quality:", quality_summary, sep="\n")

Data Visualization

In [None]:
# Plot the pairwise correlation matrix (DataFrame.corr() defaults to Pearson)
# of all columns as an annotated heatmap.
print("\nCorrelation Matrix:")
plt.figure(figsize=(12, 8))
corr_matrix = data.corr()
# sns.heatmap returns the Axes it drew on; the title is set on that object.
heatmap_axes = sns.heatmap(
    corr_matrix,
    annot=True,      # write the coefficient inside each cell
    fmt=".2f",       # two decimals keep the annotations readable
    cmap='coolwarm',
)
heatmap_axes.set_title('Correlation Matrix')
plt.show()

In [None]:
# Box plots for each feature against wine quality: one figure per feature,
# with the quality rating on the x-axis and the feature's values on the y-axis.
print("\nBox plots for each feature against wine quality:")

# Exclude the target by name, not by position. 'free_sulfur_dioxide_ratio' is
# appended after 'quality' earlier in the notebook, so `data.columns[:-1]`
# would drop that engineered feature and plot 'quality' against itself.
# Index.drop raises KeyError if 'quality' is missing.
for column in data.columns.drop('quality'):
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='quality', y=column, data=data)
    plt.title(f'{column} vs Quality')
    plt.show()

In [None]:
# Visualizing the distributions of the features: a grid of histograms with a
# KDE overlay, followed by a separate count plot of the quality ratings.
print("\nVisualizing the distributions of the features:")

# Exclude the target by name, not by position. 'free_sulfur_dioxide_ratio' is
# appended after 'quality' earlier in the notebook, so `data.columns[:-1]`
# would drop that engineered feature and include 'quality' instead.
feature_columns = data.columns.drop('quality')

# Four plots per row; derive the row count so the grid always has room for
# every feature (ceiling division without importing math).
n_cols = 4
n_rows = -(-len(feature_columns) // n_cols)

plt.figure(figsize=(18, 10))
plt.subplots_adjust(hspace=0.5)  # extra vertical room so titles do not overlap
for i, column in enumerate(feature_columns, 1):
    plt.subplot(n_rows, n_cols, i)
    sns.histplot(data[column], kde=True)
    plt.title(f'Distribution of {column}')
plt.show()

# Quality Distribution: how many wines received each rating.
# NOTE(review): newer seaborn releases warn when `palette` is passed without
# `hue`; confirm against the installed version before changing this call.
plt.figure(figsize=(10, 6))
sns.countplot(x='quality', data=data, palette="viridis")
plt.title('Distribution of Wine Quality Ratings')
plt.show()

Statistical Analysis

In [None]:
# One-way ANOVA for 'alcohol', 'sulphates', and 'volatile_acidity':
# does the mean of each feature differ across quality ratings?
# C(quality) makes the formula treat 'quality' as a categorical factor;
# typ=2 requests a Type II sums-of-squares table.
# Tables are kept in `anova_results`, keyed by feature name.
anova_results = {}
for feature in ('alcohol', 'sulphates', 'volatile_acidity'):
    fitted_model = ols(formula=f'{feature} ~ C(quality)', data=data).fit()
    anova_results[feature] = sm.stats.anova_lm(fitted_model, typ=2)
    print(f"ANOVA results for {feature}:\n", anova_results[feature], "\n")

In [None]:
# Tukey's HSD Test for 'alcohol', 'sulphates', and 'volatile_acidity':
# pairwise comparison of the feature's mean between every pair of quality
# ratings, at a family-wise significance level of 0.05.
# `pairwise_tukeyhsd` is not among the notebook's top-level imports, so it is
# imported here; without this the cell fails with NameError.
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Results are kept in `tukey_results`, keyed by feature name.
tukey_results = {}
for feature in ['alcohol', 'sulphates', 'volatile_acidity']:
    tukey = pairwise_tukeyhsd(endog=data[feature], groups=data['quality'], alpha=0.05)
    tukey_results[feature] = tukey
    print(f"Tukey's HSD test results for {feature}:\n", tukey.summary(), "\n")
