In [137]:
import pandas as pd
import numpy as np

#feature engineering
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#train test split
from sklearn.model_selection import train_test_split


In [138]:
# Relative path of the input CSV that the loading cell passes to pd.read_csv.
filepath = "./data/sangam.csv"

In [139]:

# Load the CSV located at `filepath` into a DataFrame and preview its first rows.
data = pd.read_csv(filepath)
data.head()

Unnamed: 0,Date,DO,pH,ORP,Cond,Temp,WQI,Status
0,2019-01-12 15:33:16,9.494212,13.765934,0.148402,12.954404,17.830261,54.811988,Very Poor
1,2019-01-12 15:34:17,9.500406,13.337535,0.144504,8.547796,17.798553,51.48805,Very Poor
2,2019-01-12 15:35:18,9.487448,13.198463,0.134372,16.847918,17.86493,50.420702,Very Poor
3,2019-01-12 15:36:19,9.486121,12.732116,0.142709,16.884756,17.871735,46.901646,Very Poor
4,2019-01-12 15:37:20,9.485211,13.284467,0.137524,16.987082,17.876404,51.104655,Very Poor


In [140]:

# Report the dataset dimensions as (rows, columns).
dataset_shape = data.shape
print(f'Training Dataset (row, col): {dataset_shape}')

Training Dataset (row, col): (52340, 8)


In [141]:
# List every column with its non-null count and dtype.
data.info(verbose=True, show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52340 entries, 0 to 52339
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    52340 non-null  object 
 1   DO      52340 non-null  float64
 2   pH      52340 non-null  float64
 3   ORP     52340 non-null  float64
 4   Cond    52340 non-null  float64
 5   Temp    52340 non-null  float64
 6   WQI     52340 non-null  float64
 7   Status  52340 non-null  object 
dtypes: float64(6), object(2)
memory usage: 3.2+ MB


In [142]:

# Summary statistics covering both the text (object) and the numeric columns.
summary_dtypes = [object, np.number]
data.describe(include=summary_dtypes)

Unnamed: 0,Date,DO,pH,ORP,Cond,Temp,WQI,Status
count,52340,52340.0,52340.0,52340.0,52340.0,52340.0,52340.0,52340
unique,52340,,,,,,,5
top,2019-01-12 15:33:16,,,,,,,Fair
freq,1,,,,,,,23461
mean,,8.238503,9.749467,0.099259,438.236518,25.334457,28.046169,
std,,0.54145,1.242788,0.074333,230.929059,3.11941,10.962818,
min,,7.257284,6.905927,-0.353938,1.053773,15.919769,5.135599,
25%,,7.921901,9.012969,0.085407,355.66433,23.058387,21.295417,
50%,,8.050535,9.519855,0.106075,394.07358,26.400803,26.666709,
75%,,8.566198,10.309565,0.114249,581.31051,27.297622,32.376637,


In [143]:
# Count missing values per column (largest first), then express the same
# counts as a percentage of the total number of rows.
missing_counts = data.isnull().sum().sort_values(ascending=False)
missing_percent = missing_counts / data.shape[0] * 100

print('Data:\n')
print('Missing Values by Count: \n\n',
      missing_counts,
      '\n\nMissing Values by %:\n\n',
      missing_percent)

Data:

Missing Values by Count: 

 Date      0
DO        0
pH        0
ORP       0
Cond      0
Temp      0
WQI       0
Status    0
dtype: int64 

Missing Values by %:

 Date      0.0
DO        0.0
pH        0.0
ORP       0.0
Cond      0.0
Temp      0.0
WQI       0.0
Status    0.0
dtype: float64


In [144]:
# Parse the 'Date' strings into timestamps once, store them back, and derive
# one integer column per calendar component (Year, Month, Day).
timestamps = pd.to_datetime(data['Date'])
data['Date'] = timestamps
for column, component in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    data[column] = getattr(timestamps.dt, component)


In [145]:
# Year/Month/Day now exist as separate columns, so remove the timestamp column.
data.drop(columns=['Date'], inplace=True)
data.head()

Unnamed: 0,DO,pH,ORP,Cond,Temp,WQI,Status,Year,Month,Day
0,9.494212,13.765934,0.148402,12.954404,17.830261,54.811988,Very Poor,2019,1,12
1,9.500406,13.337535,0.144504,8.547796,17.798553,51.48805,Very Poor,2019,1,12
2,9.487448,13.198463,0.134372,16.847918,17.86493,50.420702,Very Poor,2019,1,12
3,9.486121,12.732116,0.142709,16.884756,17.871735,46.901646,Very Poor,2019,1,12
4,9.485211,13.284467,0.137524,16.987082,17.876404,51.104655,Very Poor,2019,1,12


In [146]:
# Disabled alternative, kept for reference only: one-hot encode 'Status' with a
# 'Status' prefix on the generated column names. Nothing in this cell executes.
# status_encoded = pd.get_dummies(data['Status'], prefix='Status')
# data = pd.concat([data, status_encoded], axis=1)
# data.drop('Status', axis=1, inplace=True)

In [147]:
# Preview the frame while 'Status' is still a single text column.
data.head()

Unnamed: 0,DO,pH,ORP,Cond,Temp,WQI,Status,Year,Month,Day
0,9.494212,13.765934,0.148402,12.954404,17.830261,54.811988,Very Poor,2019,1,12
1,9.500406,13.337535,0.144504,8.547796,17.798553,51.48805,Very Poor,2019,1,12
2,9.487448,13.198463,0.134372,16.847918,17.86493,50.420702,Very Poor,2019,1,12
3,9.486121,12.732116,0.142709,16.884756,17.871735,46.901646,Very Poor,2019,1,12
4,9.485211,13.284467,0.137524,16.987082,17.876404,51.104655,Very Poor,2019,1,12


In [148]:
# One-hot encode 'Status': take the column out of `data` and append one 0/1
# integer indicator column per category in its place.
one_hot_encoded = pd.get_dummies(data.pop('Status'), dtype=int)
data = pd.concat([data, one_hot_encoded], axis=1)


In [149]:
# Preview the frame after 'Status' has been replaced by its indicator columns.
data.head()

Unnamed: 0,DO,pH,ORP,Cond,Temp,WQI,Year,Month,Day,Excellent,Fair,Good,Poor,Very Poor
0,9.494212,13.765934,0.148402,12.954404,17.830261,54.811988,2019,1,12,0,0,0,0,1
1,9.500406,13.337535,0.144504,8.547796,17.798553,51.48805,2019,1,12,0,0,0,0,1
2,9.487448,13.198463,0.134372,16.847918,17.86493,50.420702,2019,1,12,0,0,0,0,1
3,9.486121,12.732116,0.142709,16.884756,17.871735,46.901646,2019,1,12,0,0,0,0,1
4,9.485211,13.284467,0.137524,16.987082,17.876404,51.104655,2019,1,12,0,0,0,0,1


In [150]:
# Decide which rows are train/test BEFORE fitting the scaler. Fitting
# StandardScaler on the full dataset lets the test rows influence the mean and
# standard deviation used to transform the training rows (data leakage).
# train_test_split chooses rows from the sample count, test_size and
# random_state only, so splitting the row positions selects the same rows a
# direct split of `data` with these arguments would.
row_positions = np.arange(len(data))
train_rows, test_rows = train_test_split(row_positions, test_size=0.3, random_state=42)

# Learn mean/std from the training rows only, then apply that one transform to
# every row. `data` stays a scaled NumPy array, as it was before this change.
# NOTE(review): every column is standardised, including the one-hot Status
# indicators and WQI, exactly as before -- confirm that this is intended.
features = np.asarray(data, dtype=float)
scaler = StandardScaler().fit(features[train_rows])
data = scaler.transform(features)

# Split the data into training and testing sets
df_train, df_test = data[train_rows], data[test_rows]


In [153]:
# Dimensions of the training partition as (rows, columns).
df_train.shape

(36638, 14)

In [154]:
# Dimensions of the testing partition as (rows, columns).
df_test.shape

(15702, 14)