In [1]:
import numpy as np


In [2]:
# Load the semicolon-delimited CSV as a float32 array, skipping the one header row.
wine_data = np.genfromtxt(
    'winequality-red.csv',
    delimiter=';',
    skip_header=1,
    dtype=np.float32,
)

In [3]:
# Show the loaded array; NumPy abbreviates large arrays with "..." when printing.
print(wine_data)

[[ 7.4    0.7    0.    ...  0.56   9.4    5.   ]
 [ 7.8    0.88   0.    ...  0.68   9.8    5.   ]
 [ 7.8    0.76   0.04  ...  0.65   9.8    5.   ]
 ...
 [ 6.3    0.51   0.13  ...  0.75  11.     6.   ]
 [ 5.9    0.645  0.12  ...  0.71  10.2    5.   ]
 [ 6.     0.31   0.47  ...  0.66  11.     6.   ]]


In [4]:
# Array dimensions as a (rows, columns) tuple.
print(wine_data.shape)


(1599, 12)


In [5]:
# Total number of elements in the array (rows * columns).
print(wine_data.size)

19188


In [6]:
# Bytes occupied by the array's elements (element count * bytes per float32 element).
print(wine_data.nbytes)

76752


In [7]:
# Pick three rows by 0-based index (1, 6 and 11), keeping every column.
# Indexing with a list returns a copy, not a view of wine_data.
row_indices = [1, 6, 11]
rows = wine_data[row_indices]
print(rows)

[[7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00]
 [7.900e+00 6.000e-01 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01
  9.964e-01 3.300e+00 4.600e-01 9.400e+00 5.000e+00]
 [7.500e+00 5.000e-01 3.600e-01 6.100e+00 7.100e-02 1.700e+01 1.020e+02
  9.978e-01 3.350e+00 8.000e-01 1.050e+01 5.000e+00]]


In [8]:
# Keep only the rows in which every column holds a value (no NaN anywhere in the row).
complete_row = (~np.isnan(rows)).all(axis=1)
clean_rows = rows[complete_row]
print(clean_rows)

[[7.800e+00 8.800e-01 0.000e+00 2.600e+00 9.800e-02 2.500e+01 6.700e+01
  9.968e-01 3.200e+00 6.800e-01 9.800e+00 5.000e+00]
 [7.900e+00 6.000e-01 6.000e-02 1.600e+00 6.900e-02 1.500e+01 5.900e+01
  9.964e-01 3.300e+00 4.600e-01 9.400e+00 5.000e+00]
 [7.500e+00 5.000e-01 3.600e-01 6.100e+00 7.100e-02 1.700e+01 1.020e+02
  9.978e-01 3.350e+00 8.000e-01 1.050e+01 5.000e+00]]


In [9]:
# True when at least one entry of column index 10 is greater than 20.
alcohol_20 = (wine_data[:, 10] > 20).any()
print(alcohol_20)

False


In [10]:
# Mean of column index 10, with NaN entries left out of the average.
alcohol_col = wine_data[:, 10]
average_alcohol = np.nanmean(alcohol_col)
print(average_alcohol)

10.422984


In [11]:


# Summary statistics for column index 8, each rounded to two decimals.
pH_col = wine_data[:, 8]

# (label, reducer) pairs in display order; every reducer skips NaN entries.
summaries = [
    ('min', np.nanmin),
    ('max', np.nanmax),
    ('25%', lambda col: np.nanpercentile(col, 25)),
    ('50%', lambda col: np.nanpercentile(col, 50)),  # median
    ('75%', lambda col: np.nanpercentile(col, 75)),
    ('mean', np.nanmean),
]

# round() keeps the NumPy float32 scalar type, so the printed dict shows
# np.float32(...) wrappers around each value.
stats = {label: round(reduce_fn(pH_col), 2) for label, reduce_fn in summaries}
print("pH Statistics:", stats)



pH Statistics: {'min': np.float32(2.74), 'max': np.float32(4.01), '25%': np.float32(3.21), '50%': np.float32(3.31), '75%': np.float32(3.4), 'mean': np.float32(3.31)}


In [12]:
# Column index 9 is the 10th column; index 11 is the 12th (last) column.
sulphates, quality = wine_data[:, 9], wine_data[:, 11]

# Select the wines strictly below the 20th percentile of `sulphates`
# (NaN entries are ignored by the percentile and never pass the `<` test).
p20 = np.nanpercentile(sulphates, q=20)
mask = sulphates < p20

# Average quality over just those low-sulphate wines.
low_sulphur_quality_mean = np.nanmean(quality[mask])

print(f"Low sulphate avg quality: {low_sulphur_quality_mean:.1f}")

Low sulphate avg quality: 5.2


In [13]:


# Compare per-column averages of the wines at the top and bottom quality scores.
quality = wine_data[:, 11]
best_quality, worst_quality = np.nanmax(quality), np.nanmin(quality)

# Boolean row selectors for the wines sitting at each extreme.
best_mask, worst_mask = quality == best_quality, quality == worst_quality

# Column-wise means (axis=0) over the selected rows, ignoring NaN entries.
mean_best = np.nanmean(wine_data[best_mask], axis=0)
mean_worst = np.nanmean(wine_data[worst_mask], axis=0)

print("Mean for best quality wines:\n", mean_best)
print("Mean for worst quality wines:\n", mean_worst)



Mean for best quality wines:
 [ 8.566666    0.4233333   0.39111114  2.5777776   0.06844445 13.277778
 33.444443    0.99521226  3.2672222   0.76777774 12.094444    8.        ]
Mean for worst quality wines:
 [ 8.359999    0.8845      0.17099999  2.6350002   0.12249999 11.
 24.9         0.997464    3.398       0.57000005  9.955       3.        ]
