In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head(10)

Unnamed: 0,station_id,Aluminium_2019,Aluminium_2020,Ammonium_2019,Ammonium_2020,Boron_2019,Boron_2020,Chloride_2019,Chloride_2020,Coli-like-bacteria-Colilert_2019,...,Taste-ball-units_2020,Taste-dilution-degree_2019,Taste-dilution-degree_2020,Turbidity-NTU_2019,Turbidity-NTU_2020,pH _2019,pH _2020,compliance_2019,compliance_2020,compliance_2021
0,487,,,0.05,0.05,,,,,,...,0.0,,,1.6,0.2,7.8,7.6,0,0,0
1,1555,,,0.05,0.05,,,,,,...,,1.0,1.0,1.0,1.0,7.3,7.1,0,0,0
2,205,,10.0,0.05,0.24,,0.1,,,,...,,1.0,1.0,1.0,1.0,7.8,7.7,0,0,0
3,1228,,,0.09,0.05,0.641,0.932,,,,...,1.0,,,0.5,0.5,7.44,7.58,0,0,0
4,470,,,0.06,0.05,,,,,,...,0.0,,,0.2,44.0,7.7,8.0,0,1,0
5,614,,,0.12,0.13,,,,,,...,1.0,,,0.5,0.5,8.64,8.45,0,0,1
6,1420,,,,,,,,,0.0,...,,1.0,2.0,1.0,1.0,7.5,8.0,0,0,0
7,188,,,0.23,,,,,,,...,0.0,,,0.5,0.5,7.45,7.4,1,1,0
8,1286,,50.0,0.2,0.23,,0.091,,6.5,,...,,,,0.72,0.5,7.5,7.7,1,0,1
9,736,,,0.05,,,,,,,...,1.0,,,0.5,0.5,7.53,7.26,0,0,0


In [4]:
# new features based on two prev results
data['prev_two_probas'] = np.where((data['compliance_2019'] & data['compliance_2020'])==1,1,0)

In [5]:
# corr_matrix = data.corr()
# corr_matrix['compliance_2021'].sort_values(ascending=False)
# data = data.drop('compliance_2020', axis=1)
# data = data.drop('compliance_2019', axis=1)

In [6]:
X = data.drop(['compliance_2021'],axis=1)
y = data['compliance_2021']

In [7]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 7)

In [8]:
from sklearn.impute import SimpleImputer
num_imp = SimpleImputer(strategy = 'median')
num_imp.fit(X_train)

X_imp_train = pd.DataFrame(num_imp.transform(X_train))
X_imp_test = pd.DataFrame(num_imp.transform(X_test))
print(X_imp_train.shape)
print(X_imp_test.shape)

(352, 58)
(88, 58)


In [9]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(X_imp_train)

X_train_std = std.transform(X_imp_train)
X_test_std = std.transform(X_imp_test)

In [10]:
from sklearn.utils import resample
print(data[data['compliance_2021']==0]['compliance_2021'].value_counts())
df_majority = data[data['compliance_2021']==1]
df_minority = data[data['compliance_2021']==0]

df_majority_downsampled = resample(df_majority, 
                                 replace=True,    # sample without replacement
                                 n_samples=374,     # to match minority class
                                 random_state=123) # reproducible results


# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
df_downsampled.sort_index(inplace=True)

y = df_downsampled['compliance_2021']
X = df_downsampled.drop('compliance_2021', axis=1)
print(y.value_counts())

0    374
Name: compliance_2021, dtype: int64
0    374
1    374
Name: compliance_2021, dtype: int64


In [11]:
# train_ratio = 0.8
# validation_ratio = 0.1
# test_ratio = 0.1

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)

# X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
#                                                 test_size=test_ratio/(test_ratio + validation_ratio), random_state=4) 


# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 7)

In [12]:
# replace nan values with median values by using simpleinputer 
num_imp = SimpleImputer(strategy = 'median')
num_imp.fit(X_train)

X_imp_train = pd.DataFrame(num_imp.transform(X_train))
X_imp_test = pd.DataFrame(num_imp.transform(X_test))
# X_imp_val = pd.DataFrame(num_imp.transform(X_val))

std = StandardScaler()
std.fit(X_imp_train)

X_train_std = std.transform(X_imp_train)
X_test_std = std.transform(X_imp_test)
# X_val_std = std.transform(X_imp_val)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
rfc = RandomForestClassifier(random_state=78, max_features = 1, max_depth = 20)
rfc.fit(X_train_std,y_train)
y_pred = rfc.predict(X_test_std)

confusion_matrix_result = confusion_matrix(y_test.values, y_pred)
print("Confusion matrix:\n%s" % confusion_matrix_result)
accuracy_score(rfc.predict(X_test_std), y_test)

print(f'Accuracy of model on the test data {accuracy_score(rfc.predict(X_test_std), y_test)}')
# print(f'Accuracy of model on the validation data {accuracy_score(rfc.predict(X_val_std), y_val)}')

Confusion matrix:
[[73  0]
 [ 0 77]]
Accuracy of model on the test data 1.0


In [23]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# best cv mean score
cv_dict = cross_val_score(RandomForestClassifier(random_state = 78, max_features = 1, max_depth = 20), X_train_std, y_train, cv=10)
# cv_dict_2 = cross_val_score(RandomForestClassifier(random_state = 92), X_test_std, y_test, cv=5)
# cv_dict = cross_val_score(RandomForestClassifier(random_state=28, max_features=2, max_depth=24, n_estimators = 105), X, Y, cv=5)



In [24]:
cv_dict.mean()

0.9766101694915253

In [16]:
# print(val)

In [28]:
best = 0.9715254237288136
ran = 0
for i in range(50, 100):
    cv_dict = cross_val_score(RandomForestClassifier(random_state = 78, max_features = 1, max_depth=i), X_train_std, y_train, cv=10)
    val = cv_dict.mean()
    if val > best:
        ran = i
        best = val
        print(best, i)

0.9749152542372881 50


In [18]:
# ran
ran

20

In [19]:
test_data = pd.read_csv('test.csv')
test_data.head()
test_data['prev_two_probas'] = test_data['compliance_2019'] + test_data['compliance_2020']
test_data.head()

Unnamed: 0,station_id,Aluminium_2019,Aluminium_2020,Ammonium_2019,Ammonium_2020,Boron_2019,Boron_2020,Chloride_2019,Chloride_2020,Coli-like-bacteria-Colilert_2019,...,Taste-ball-units_2020,Taste-dilution-degree_2019,Taste-dilution-degree_2020,Turbidity-NTU_2019,Turbidity-NTU_2020,pH _2019,pH _2020,compliance_2019,compliance_2020,prev_two_probas
0,163,5.0,5.0,0.08,0.08,0.071,0.062,130.0,102.0,,...,,1.0,1.0,1.18,1.9,8.17,8.12,0,0,0
1,167,,,0.08,0.08,,,,,,...,,1.0,3.0,3.9,1.54,7.83,7.81,1,0,1
2,171,,,,,,,112.0,90.0,,...,,2.0,1.0,1.4,1.5,7.5,7.6,0,1,1
3,174,5.0,5.0,0.05,0.09,0.072,0.075,248.0,243.0,0.0,...,,2.0,8.0,1.0,1.0,7.7,7.8,0,0,0
4,178,,,0.05,,,,,,,...,,1.0,1.0,1.0,1.0,7.5,7.4,0,0,0


In [25]:
# test_data = test_data.drop('compliance_2020', axis=1)
# test_data = test_data.drop('compliance_2019', axis=1)
test_data_imp = SimpleImputer(strategy = 'median')
test_data_imp.fit(test_data)

X_imp_test_data = pd.DataFrame(test_data_imp.transform(test_data))

std = StandardScaler()
std.fit(X_imp_test_data)

X_test_data = std.transform(X_imp_test_data)

In [26]:
y_pred = rfc.predict(X_test_data)
print(y_pred)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [27]:
df = pd.DataFrame()
df['station_id'] = test_data.station_id
df['compliance_2021'] = y_pred
df.to_csv('water.csv', index=False)

In [38]:
X = data[['compliance_2020', 'compliance_2019']]
y = data['compliance_2021']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
rfc = RandomForestClassifier(random_state=78)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

confusion_matrix_result = confusion_matrix(y_test.values, y_pred)
print("Confusion matrix:\n%s" % confusion_matrix_result)
accuracy_score(rfc.predict(X_test), y_test)

print(f'Accuracy of model on the test data {accuracy_score(rfc.predict(X_test), y_test)}')

Confusion matrix:
[[75  0]
 [13  0]]
Accuracy of model on the test data 0.8522727272727273
