In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

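This data set describes red wines by their physicochemical measurements (acidity, residual sugar, chlorides, sulfur dioxide, density, pH, sulphates, and alcohol) together with a quality score.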

In [17]:
# Load the wine CSV into a DataFrame.
df = pd.read_csv('winequality-red.csv')

# Report the table's dimensions: number of records, then number of columns.
rows = len(df)
columns = len(df.columns)
print(f"There are {rows} rows")
print(f"There are {columns} columns")

There are 1599 rows
There are 12 columns


Each row represents a different wine.

In [20]:
# Show the column labels of the loaded frame.
column_labels = df.columns
print(column_labels)

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')


In [28]:
# Build a per-column overview: column name ("index"), pandas dtype (0), and a
# coarse Numerical/Categorical label ("Type").
df2 = df.dtypes.reset_index()
# Derive the label from each column's dtype instead of hard-coding one string
# per column, so the cell keeps working (and stays correct) if the set of
# columns or their dtypes change.
df2["Type"] = [
    "Numerical" if pd.api.types.is_numeric_dtype(dtype) else "Categorical"
    for dtype in df.dtypes
]
print(df2)

                   index        0       Type
0          fixed acidity  float64  Numerical
1       volatile acidity  float64  Numerical
2            citric acid  float64  Numerical
3         residual sugar  float64  Numerical
4              chlorides  float64  Numerical
5    free sulfur dioxide  float64  Numerical
6   total sulfur dioxide  float64  Numerical
7                density  float64  Numerical
8                     pH  float64  Numerical
9              sulphates  float64  Numerical
10               alcohol  float64  Numerical
11               quality    int64  Numerical


In [32]:
# Summarise every numeric column: mean, median, mode, standard deviation and
# quartiles, one output row per column.
# The column list is taken from the frame itself so it cannot drift out of
# sync with the data.
columns_to_analyze = df.select_dtypes(include='number').columns.tolist()
statistics_list = []
for column in columns_to_analyze:
    mean_value = df[column].mean()
    median_value = df[column].median()
    # mode() returns all tied modes in sorted order and is empty when the
    # column has no non-null values; report the first mode, or NaN if none.
    modes = df[column].mode()
    mode_value = modes.iloc[0] if not modes.empty else float('nan')
    std_dev_value = df[column].std()
    quartiles = df[column].quantile([0.25, 0.5, 0.75])
    statistics_list.append({
        'Column': column,
        'Mean': mean_value,
        'Median': median_value,
        'Mode': mode_value,
        'Standard Deviation': std_dev_value,
        '25th Percentile': quartiles[0.25],
        '50th Percentile': quartiles[0.5],
        '75th Percentile': quartiles[0.75]
    })
# Build the summary table once from the collected records.
statistics_df = pd.DataFrame(statistics_list)
print(statistics_df)

                  Column       Mean    Median     Mode  Standard Deviation  \
0          fixed acidity   8.319637   7.90000   7.2000            1.741096   
1       volatile acidity   0.527821   0.52000   0.6000            0.179060   
2            citric acid   0.270976   0.26000   0.0000            0.194801   
3         residual sugar   2.538806   2.20000   2.0000            1.409928   
4              chlorides   0.087467   0.07900   0.0800            0.047065   
5    free sulfur dioxide  15.874922  14.00000   6.0000           10.460157   
6   total sulfur dioxide  46.467792  38.00000  28.0000           32.895324   
7                density   0.996747   0.99675   0.9972            0.001887   
8                     pH   3.311113   3.31000   3.3000            0.154386   
9              sulphates   0.658149   0.62000   0.6000            0.169507   
10               alcohol  10.422983  10.20000   9.5000            1.065668   
11               quality   5.636023   6.00000   5.0000          

In [None]:
# There are no categorical columns; every column is numerical.

In [None]:
# Count missing values per column and express them as a share of all rows.
missing_counts = df.isnull().sum()
total_counts = df.shape[0]
missing_percentage = (missing_counts / total_counts) * 100

missing_summary_df = pd.DataFrame({
    'Missing Count': missing_counts,
    'Missing Percentage': missing_percentage
})
# Build the text summary from the original Series rather than a row-wise
# apply(): a row mixing the integer count with the float percentage is upcast
# to float, which would render the count as "0.0" instead of "0".
missing_summary_df['Summary'] = [
    f'nulls: {count}/{total_counts} ({percentage:.1f}%)'
    for count, percentage in zip(missing_counts, missing_percentage)
]
print(missing_summary_df)

# Bar chart of the missing-value count for each column.
plt.figure(figsize=(10, 6))
sns.barplot(x=missing_summary_df.index, y='Missing Count', data=missing_summary_df, hue=missing_summary_df.index, legend=False, palette='viridis')
plt.title('Missing Values Count in DataFrame')
plt.xlabel('Columns')
plt.ylabel('Missing Count')
plt.xticks(rotation=45)  # column names are long; tilt them so they stay legible
plt.show()