In [1]:
import pandas as pd
import plotly.express as px
import plotly.io as pio

In [2]:
# Load the raw milk dataset from the project-relative raw data folder.
data=pd.read_csv("../data/raw/milknew.csv")
# Preview the first rows to check that the file parsed as expected.
data.head()

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade
0,6.6,35,1,0,1,0,254,high
1,6.6,36,0,1,0,1,253,high
2,8.5,70,1,1,1,1,246,low
3,9.5,34,1,1,0,1,255,low
4,6.6,37,0,0,0,0,255,medium


In [3]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1059 entries, 0 to 1058
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pH          1059 non-null   float64
 1   Temprature  1059 non-null   int64  
 2   Taste       1059 non-null   int64  
 3   Odor        1059 non-null   int64  
 4   Fat         1059 non-null   int64  
 5   Turbidity   1059 non-null   int64  
 6   Colour      1059 non-null   int64  
 7   Grade       1059 non-null   object 
dtypes: float64(1), int64(6), object(1)
memory usage: 66.3+ KB


## Missing values

In [4]:
# Number of missing entries in each column.
data.isnull().sum()

pH            0
Temprature    0
Taste         0
Odor          0
Fat           0
Turbidity     0
Colour        0
Grade         0
dtype: int64

## Check for duplicates

In [5]:
# Group on every column at once so each group is one distinct full row;
# `size` is then the number of times that exact row occurs (size > 1 means duplicates).
(
    data
    .groupby(list(data.columns), as_index=False)
    .size()
    .sort_values(by="size")
)

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade,size
27,6.6,36,0,1,0,1,253,high,1
37,6.6,40,1,0,1,1,255,high,1
35,6.6,40,0,0,0,0,255,medium,1
33,6.6,38,0,1,1,1,255,high,1
62,6.8,41,1,1,1,0,255,high,1
...,...,...,...,...,...,...,...,...,...
58,6.8,40,1,0,1,0,245,medium,33
68,6.8,45,0,1,1,1,255,high,34
79,8.6,55,0,1,1,1,255,low,35
80,9.0,43,1,0,1,1,250,low,38


In [6]:
# Keep only the first occurrence of each distinct row and renumber the index from 0.
data_dropped = data.drop_duplicates(keep="first", ignore_index=True)

# Confirm how many rows survive de-duplication.
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83 entries, 0 to 82
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   pH          83 non-null     float64
 1   Temprature  83 non-null     int64  
 2   Taste       83 non-null     int64  
 3   Odor        83 non-null     int64  
 4   Fat         83 non-null     int64  
 5   Turbidity   83 non-null     int64  
 6   Colour      83 non-null     int64  
 7   Grade       83 non-null     object 
dtypes: float64(1), int64(6), object(1)
memory usage: 5.3+ KB


In [7]:
# Show 10 random rows. The seed is fixed so the displayed sample is identical on
# every Restart & Run All instead of changing with each execution.
data_dropped.sample(10, random_state=42)

Unnamed: 0,pH,Temprature,Taste,Odor,Fat,Turbidity,Colour,Grade
54,6.8,36,0,1,1,0,253,high
33,6.7,41,1,0,0,0,247,medium
50,6.7,38,1,0,1,0,255,high
74,6.6,40,1,0,0,0,255,medium
57,6.8,41,0,0,1,0,255,medium
78,6.6,38,0,1,1,1,255,high
63,6.6,38,1,0,1,0,255,high
9,6.7,45,1,1,0,0,247,medium
16,4.7,38,1,0,1,0,255,low
0,6.6,35,1,0,1,0,254,high


## Target count

In [9]:
# Horizontal bar chart of the number of de-duplicated rows per Grade.
fig = px.bar(data_dropped['Grade'].value_counts(),
             orientation='h',
             labels={'value': 'count'},
             text_auto=True,
             template='plotly_dark',
             width=800)
# The bars and their text labels are raw counts, so the title says "count".
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
fig.update_layout(showlegend=False,
                  title='<b>Target count<br></b>')
fig.show()
# Export a static copy for the reports folder (file name intentionally left unchanged).
pio.write_image(fig,
                '../reports/target_proportion.png',
                scale=1)

## Numeric distribution

In [11]:
from plot_utils import plot_continuos_histogram_matrix

In [12]:
# Build the histogram grid with the project helper from plot_utils.
# NOTE(review): the second argument (2) is presumably a grid dimension such as the
# number of columns — confirm against plot_utils.
fig = plot_continuos_histogram_matrix(data_dropped, 2)
fig.show()

# Export a static copy for the reports folder.
pio.write_image(fig, '../reports/histogram_matrix.png', scale=1)

In [9]:
import os

In [None]:
# Write the de-duplicated frame only when no export exists yet;
# an existing file is reported and left untouched.
if not os.path.exists("../data/processed/data_milk.csv"):
    data_dropped.to_csv("../data/processed/data_milk.csv")
else:
    print("There is a file!")

In [5]:
# Reload the de-duplicated table from disk; index_col=0 turns the first CSV column
# (the index written out by to_csv) back into the DataFrame index.
# NOTE(review): this rebinds `data`, which held the raw frame until now, and the
# guarded save never overwrites an existing file, so this may read an older export.
data=pd.read_csv("../data/processed/data_milk.csv",index_col=0)

# Show the column labels exactly as read from the CSV.
print(data.columns)

Index(['pH', 'Temprature', 'Taste', 'Odor', 'Fat ', 'Turbidity', 'Colour',
       'Grade'],
      dtype='object')


In [6]:
# Shorten the misspelled 'Temprature' header to 'Temp'. Renaming by label (rather than
# assigning a positional list to data.columns) leaves every other header untouched,
# including 'Fat ', which carries a trailing space in the CSV. It also keeps the cell
# safe to re-run: a positional 8-name list fails once 'Grade_sparse' has been added.
data = data.rename(columns={'Temprature': 'Temp'})

# Integer encoding of the target: low=0, medium=1, high=2.
# Series.map yields NaN for any label that is not in the mapping.
data['Grade_sparse'] = data['Grade'].map({'low': 0, 'medium': 1, 'high': 2})

data.head()


Unnamed: 0,pH,Temp,Taste,Odor,Fat,Turbidity,Colour,Grade,Grade_sparse
0,6.6,35,1,0,1,0,254,high,2
1,6.6,36,0,1,0,1,253,high,2
2,8.5,70,1,1,1,1,246,low,0
3,9.5,34,1,1,0,1,255,low,0
4,6.6,37,0,0,0,0,255,medium,1


In [10]:
# Write the renamed/encoded frame only when no export exists yet;
# an existing file is reported and left untouched.
if not os.path.exists("../data/processed/data_milk_c.csv"):
    data.to_csv("../data/processed/data_milk_c.csv")
else:
    print("There is a file!")