In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler,LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

# Shows plots in jupyter notebook
%matplotlib inline

# Set plot style
sns.set(color_codes=True)

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Shows all columns
pd.set_option('display.max_columns', None)

In [31]:
df_2018 = pd.read_csv('ground_water_quality_2018_post.csv')
df_2019 = pd.read_csv('ground_water_quality_2019_post.csv')
df_2020 = pd.read_csv('ground_water_quality_2020_post.csv')

In [32]:
df_2018.head(1)

Unnamed: 0,sno,district,mandal,village,lat_gis,long_gis,gwl,season,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,Classification,RSC meq / L,Classification.1
0,1,ADILABAD,Adilabad,Adilabad,19.6683,78.5247,5.09,postmonsoon 2018,8.28,745,476.8,0,220.0,60,0.44,42.276818,46.0,49.0,4.0,48,38.896,279.934211,1.273328,C2S1,-1.198684,P.S.


In [33]:
df_2019.head(1)

Unnamed: 0,sno,district,mandal,village,lat_gis,long_gis,gwl,season,pH,EC,TDS,CO_-2,HCO_ -,Cl -,F -,NO3-,SO4-2,Na+,K+,Ca+2,Mg+2,T.H,SAR,Classification,RSC meq / L,Classification.1
0,1,ADILABAD,Adilabad,Adilabad,19.6683,78.5247,6.45,post monsoon 2019,8.32,2355,1507.2,30.0,320,340,0.64,66.435,377.0,273.0,113.0,80.0,82.654,539.860197,5.108509,C4S2,-3.797204,P.S.


In [34]:
df_2020.head(1)

Unnamed: 0,sno,district,mandal,village,lat_gis,long_gis,gwl,season,Unnamed: 8,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,Classification,RSC meq / L,Classification.1
0,1,ADILABAD,Adilabad,Adilabad,19.6683,78.5247,7.1,Post-monsoon 2020,,8.01,1477,945.28,0.0,390,220,0.65,6.442182,33.0,171.0,9.0,24.0,72.93,359.876645,3.919146,C3S1,0.602467,P.S.


In [35]:
column_name_mappings={'EC':'E.C','CO_-2 ':'CO3', 'HCO_ - ':'HCO3', 'Cl -':'Cl', 'F -':'F','NO3- ':'NO3 ', 'SO4-2':'SO4', 'Na+':'Na', 'K+':'K', 'Ca+2':'Ca', 'Mg+2':'Mg'
                     }
df_2019.rename(columns=column_name_mappings,inplace=True)

In [36]:
df_2019.head(1)

Unnamed: 0,sno,district,mandal,village,lat_gis,long_gis,gwl,season,pH,E.C,TDS,CO3,HCO3,Cl,F,NO3,SO4,Na,K,Ca,Mg,T.H,SAR,Classification,RSC meq / L,Classification.1
0,1,ADILABAD,Adilabad,Adilabad,19.6683,78.5247,6.45,post monsoon 2019,8.32,2355,1507.2,30.0,320,340,0.64,66.435,377.0,273.0,113.0,80.0,82.654,539.860197,5.108509,C4S2,-3.797204,P.S.


In [37]:
df_2020.drop('Unnamed: 8',axis=1,inplace=True)

In [38]:
df_2018.isnull().sum()

sno                 0
district            0
mandal              0
village             0
lat_gis             0
long_gis            0
gwl                 3
season              0
pH                  0
E.C                 0
TDS                 0
CO3                 0
HCO3                0
Cl                  0
F                   0
NO3                 0
SO4                 0
Na                  0
K                   0
Ca                  0
Mg                  0
T.H                 0
SAR                 0
Classification      0
RSC  meq  / L       0
Classification.1    0
dtype: int64

In [39]:
df_2019.isnull().sum()

sno                   0
district              0
mandal                0
village               0
lat_gis               0
long_gis              0
gwl                   5
season                0
pH                    0
E.C                   0
TDS                   0
CO3                 160
HCO3                  0
Cl                    0
F                     0
NO3                   0
SO4                   0
Na                    0
K                     0
Ca                    0
Mg                    0
T.H                   0
SAR                   0
Classification        0
RSC  meq  / L         0
Classification.1      0
dtype: int64

In [40]:
df_2020.isnull().sum()

sno                 0
district            0
mandal              0
village             0
lat_gis             0
long_gis            0
gwl                 3
season              0
pH                  0
E.C                 0
TDS                 0
CO3                 0
HCO3                0
Cl                  0
F                   0
NO3                 0
SO4                 0
Na                  0
K                   0
Ca                  0
Mg                  0
T.H                 0
SAR                 0
Classification      0
RSC  meq  / L       0
Classification.1    0
dtype: int64

In [41]:
median_gwl_by_district_2018 = df_2018.groupby('district')['gwl'].median()
median_gwl_by_district_2019 = df_2019.groupby('district')['gwl'].median()
median_gwl_by_district_2020 = df_2020.groupby('district')['gwl'].median()

In [43]:
df_2018 = df_2018.fillna(median_gwl_by_district_2018[df_2018['district']])
df_2018.isnull().sum()

sno                 0
district            0
mandal              0
village             0
lat_gis             0
long_gis            0
gwl                 3
season              0
pH                  0
E.C                 0
TDS                 0
CO3                 0
HCO3                0
Cl                  0
F                   0
NO3                 0
SO4                 0
Na                  0
K                   0
Ca                  0
Mg                  0
T.H                 0
SAR                 0
Classification      0
RSC  meq  / L       0
Classification.1    0
dtype: int64

In [25]:
def impute_missing_with_median(row):
    if pd.isnull(row['gwl']):
        return median_gwl_by_district_2019[row['district']]
    else:
        return row['gwl']


df_2018['gwl'] = df_2018.apply(impute_missing_with_median, axis=1)
df_2019['gwl'] = df_2019.apply(impute_missing_with_median, axis=1)
df_2020['gwl'] = df_2020.apply(impute_missing_with_median, axis=1)

sno                 0
district            0
mandal              0
village             0
lat_gis             0
long_gis            0
gwl                 0
season              0
pH                  0
E.C                 0
TDS                 0
CO3                 0
HCO3                0
Cl                  0
F                   0
NO3                 0
SO4                 0
Na                  0
K                   0
Ca                  0
Mg                  0
T.H                 0
SAR                 0
Classification      0
RSC  meq  / L       0
Classification.1    0
dtype: int64