# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

- "%matplotlib inline" makes life easy by displaying plots directly below the cell, without needing to call plt.show() every time after each plot!

# Loading dataset

In [None]:
# Read the semicolon-delimited wine-quality CSV into a DataFrame and
# preview its first rows.
csv_path = 'winequality-white.csv'
df = pd.read_csv(csv_path, sep=';')
df.head()

- The original data is separated by the delimiter " ; " in the given dataset
- " .head() " returns first five observations of the dataset

# Data Insights

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape 

- The dataset comprises 4898 observations and 12 characteristics,
- of which one is the dependent variable and the remaining 11 are independent variables (physicochemical characteristics)

In [None]:
# Column labels of the frame, returned as a NumPy array.
df.columns.values 

- Label of each column

In [None]:
# Print a concise summary: per-column dtype and non-null count, plus memory usage.
df.info() 

- Data has only float and integer values
- No variable column has null/missing values

# Summary Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe() 

# Key Observations
- The mean is greater than the median (the 50% row, i.e. the 50th percentile) for most columns, which points to right-skewed distributions.
- There is a notably large difference between the 75th percentile and the max values of the predictors "residual sugar", "free sulfur dioxide" and "total sulfur dioxide".
- Thus observations 1 and 2 suggest that there are extreme values (outliers) in our dataset.

# Understanding Target variable

In [None]:
# Distinct scores that occur in the target column.
df['quality'].unique()

- The target (dependent) variable is discrete and categorical in nature.
- The "quality" score scale ranges from 1 to 10, where 1 is poor and 10 is the best.
- Ratings 1, 2 and 10 are not given to any observation; the only scores obtained are between 3 and 9.

In [None]:
# Number of observations per quality score, most frequent first.
df['quality'].value_counts()

- This tells us vote count of each quality score in descending order.
- "quality" has most values concentrated in the categories 5, 6 and 7.
- Only a few observations made for the categories 3 & 9 

# Data Visualization

# To check missing values

In [None]:
# Boolean mask of missing cells; any True entry would show up as a
# contrasting stripe in the heatmap.
missing_mask = df.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)

- The dataset has no missing values.
- If there were any, they would appear in the figure as a different colour shade on the purple background.
- Do try it out with another dataset that has missing values; you'll see the difference.
- For example, in the Titanic dataset you will find the "Age" and "Cabin" columns marked with different shades by this code.

# To check correlation

In [None]:
# Pairwise correlation of all columns, drawn as an un-annotated heatmap.
corr_matrix = df.corr()
plt.figure(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=False, cmap='Blues')

- Dark shades represent positive correlation while lighter shades represent negative correlation.
- If you set annot=True, the correlation value of each pair of features is printed in its grid cell.

In [None]:
# Quality correlation matrix: columns ordered by their correlation with
# "quality" (strongest first), then re-correlated in that order.
k = 12  # number of variables for heatmap
corr_all = df.corr()
cols = corr_all.nlargest(k, 'quality').index
cm = df[cols].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(cm, cmap='viridis', annot=True)

- Here we can infer that "density" has a strong positive correlation with "residual sugar", whereas it has a strong negative correlation with "alcohol".
- "free sulfur dioxide" and "citric acid" have almost no correlation with "quality".
- Since these correlations are close to zero, we can infer that there is no linear relationship between these two predictors and "quality". It is therefore safe to drop these features if you are applying a Linear Regression model to the dataset.

# To check Outliers

In [None]:
# Draw one vertical box plot per column, side by side, to spot outliers.
# `l`, `number_of_columns` and `number_of_rows` are kept as notebook-level
# names because the distribution cell further down reads them.
l = df.columns.values
number_of_columns = 12
# Integer ceiling division: rows needed to fit len(l) plots into the grid.
# (The previous `len(l)-1/number_of_columns` evaluated to a float ~11.92,
# which inflated the figure height and is not a valid subplot row count.)
number_of_rows = (len(l) + number_of_columns - 1) // number_of_columns

# Set the style before any axes exist so every subplot picks it up.
sns.set_style('whitegrid')
plt.figure(figsize=(number_of_columns, 5 * number_of_rows))
for i in range(len(l)):
    plt.subplot(number_of_rows, number_of_columns, i + 1)
    # Pass the data as `y=` so the box is vertical regardless of how the
    # installed seaborn version interprets a positional argument.
    sns.boxplot(y=df[l[i]], color='green')
# Lay out once, after all subplots have been created.
plt.tight_layout()


- Except for "alcohol", all other feature columns show outliers.
'''Color Codes : https://matplotlib.org/examples/color/colormaps_reference.html'''

# To check distribution-Skewness

In [None]:
# Plot the distribution (histogram + KDE curve) of every column to judge
# skewness. Relies on `l` and `number_of_columns` from the outlier cell.
# The row count is derived here with integer ceiling division because
# plt.subplot requires integer grid dimensions.
dist_rows = (len(l) + number_of_columns - 1) // number_of_columns
plt.figure(figsize=(2 * number_of_columns, 5 * dist_rows))
for i in range(len(l)):
    plt.subplot(dist_rows, number_of_columns, i + 1)
    # histplot with kde=True and a density-scaled histogram is the documented
    # replacement for the deprecated sns.distplot (needs seaborn >= 0.11).
    sns.histplot(df[l[i]], kde=True, stat='density')
plt.tight_layout()


- "pH" column appears to be normally distributed
- All the remaining independent variables are right-skewed (positively skewed).