In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

In [3]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [4]:
df_train.head()

Unnamed: 0,ID,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity,class
0,0,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
1,1,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
2,2,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
3,3,66,Male,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,No,Yes,Yes,No,No,Positive
4,4,67,Male,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,Yes,Yes,No,Yes,Positive


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 364 entries, 0 to 363
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ID                  364 non-null    int64 
 1   age                 364 non-null    int64 
 2   gender              364 non-null    object
 3   polyuria            364 non-null    object
 4   polydipsia          364 non-null    object
 5   sudden_weight_loss  364 non-null    object
 6   weakness            364 non-null    object
 7   polyphagia          364 non-null    object
 8   genital_thrush      364 non-null    object
 9   visual_blurring     364 non-null    object
 10  itching             364 non-null    object
 11  irritability        364 non-null    object
 12  delayed_healing     364 non-null    object
 13  partial_paresis     364 non-null    object
 14  muscle_stiffness    364 non-null    object
 15  alopecia            364 non-null    object
 16  obesity             364 no

In [6]:
print(df_train.isnull().sum())

ID                    0
age                   0
gender                0
polyuria              0
polydipsia            0
sudden_weight_loss    0
weakness              0
polyphagia            0
genital_thrush        0
visual_blurring       0
itching               0
irritability          0
delayed_healing       0
partial_paresis       0
muscle_stiffness      0
alopecia              0
obesity               0
class                 0
dtype: int64


In [7]:
norminal_transformer = Pipeline(steps=[
    ('Encoder',OrdinalEncoder())
])
numerical_scaler = Pipeline(steps=[
    ('Scaler',MinMaxScaler())
])

In [8]:
len(df_train.columns)

18

In [9]:
class FeatureEngineering:
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            df = X.copy()
            df = df.drop(columns=['ID'])
            
            return df
        else:
            return df

In [10]:
class FeatureEngineering2:
    def __init__(self, add_attributes=True):
        
        self.add_attributes = add_attributes
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        if self.add_attributes:
            df = X.copy()
            
   
            S = pd.DataFrame(df, columns=df_train.drop(columns=['ID','class']).columns)
            
            return S
        else:
            return S

In [11]:
FeatureEngineering2 = FeatureEngineering2(add_attributes = True)

In [12]:
FeatureEngineering = FeatureEngineering(add_attributes = True)

In [13]:
norminal_features = df_train.select_dtypes(exclude='number').columns.to_list()

In [14]:
preprocess = ColumnTransformer([
    ('numerical', numerical_scaler, ['age']),
    ('norminal', norminal_transformer, ['gender','polyuria','polydipsia','sudden_weight_loss','weakness','polyphagia','genital_thrush','visual_blurring','itching','irritability','delayed_healing','partial_paresis','muscle_stiffness','alopecia','obesity'])
])

In [15]:
pipeline = make_pipeline(FeatureEngineering, preprocess,FeatureEngineering2)

In [16]:
pipeline

In [17]:
X = pipeline.fit_transform(df_train)

In [18]:
X


Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity
0,0.337838,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
1,0.391892,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.594595,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.675676,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,0.689189,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359,0.621622,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
360,0.310811,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
361,0.432432,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
362,0.567568,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


In [19]:
labelen = LabelEncoder()

In [20]:
y = labelen.fit_transform(df_train['class'])

In [21]:
pd.DataFrame(y)

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
359,1
360,1
361,1
362,1


In [22]:
test_set = pipeline.fit_transform(df_test)

In [23]:
test_set

Unnamed: 0,age,gender,polyuria,polydipsia,sudden_weight_loss,weakness,polyphagia,genital_thrush,visual_blurring,itching,irritability,delayed_healing,partial_paresis,muscle_stiffness,alopecia,obesity
0,0.476190,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.444444,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.047619,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.285714,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
4,0.317460,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,0.047619,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
152,0.174603,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153,0.158730,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
154,0.365079,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [24]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0,stratify=y)

**_Logistic Regression_**

In [25]:
LR_model = LogisticRegression()
LR_model.fit(X = X_train, y = y_train)

In [26]:
lr_predict = LR_model.predict(X_valid)
acc_lr = accuracy_score(y_valid, lr_predict)
acc_lr

0.9315068493150684

**_XGBoost_**

In [27]:
import xgboost as xgb

In [28]:
XG_model = xgb.XGBClassifier()
XG_model.fit(X = X_train, y = y_train)

In [29]:
xg_predict = XG_model.predict(X_valid)
acc_xg = accuracy_score(y_true = y_valid,y_pred = xg_predict)
acc_xg

1.0

**_Random Forest_**

In [30]:
from sklearn.ensemble import RandomForestClassifier

In [31]:
RF_model = RandomForestClassifier(n_estimators=50, random_state=0)
RF_model.fit(X = X_train, y = y_train)

In [32]:
rf_predict = RF_model.predict(X_valid)
acc_rf = accuracy_score(y_true = y_valid,y_pred = rf_predict)
acc_rf

0.9863013698630136

**_CatBoost_**

In [33]:
import catboost as cat

In [34]:
CB_model = cat.CatBoostClassifier()
CB_model.fit(X = X_train, y = y_train,verbose=0)

<catboost.core.CatBoostClassifier at 0x1bc4e189450>

In [35]:
cb_predict = CB_model.predict(X_valid)
acc_cb = accuracy_score(y_true = y_valid,y_pred = cb_predict)
acc_cb

0.9863013698630136

**_K-nearest_**

In [36]:
from sklearn.neighbors import KNeighborsClassifier

In [37]:
kn_model = KNeighborsClassifier()
kn_model.fit(X = X_train, y = y_train)

In [38]:
kn_predict = kn_model.predict(X = X_valid)
acc_kn = accuracy_score(y_true = y_valid, y_pred = kn_predict)
acc_kn

0.9041095890410958

**_Voting Classification_**

In [39]:
from sklearn.ensemble import VotingClassifier

In [40]:
estimator = [
    ('logistic regression', LR_model),
    ('XGBoost', XG_model),
    ('RandomForest', RF_model),
    ('CatBoost',CB_model),
    ('KNeighbors',kn_model)

]

In [41]:
voting_model = VotingClassifier(estimators=estimator, voting='hard',verbose=0)

In [42]:
voting_model.fit(X=X_train, y=y_train)

Learning rate set to 0.006082
0:	learn: 0.6876074	total: 2.28ms	remaining: 2.27s
1:	learn: 0.6807894	total: 3.67ms	remaining: 1.83s
2:	learn: 0.6745673	total: 5.62ms	remaining: 1.87s
3:	learn: 0.6681918	total: 7.61ms	remaining: 1.9s
4:	learn: 0.6622061	total: 9.36ms	remaining: 1.86s
5:	learn: 0.6553732	total: 10.7ms	remaining: 1.77s
6:	learn: 0.6485648	total: 12ms	remaining: 1.7s
7:	learn: 0.6423371	total: 13ms	remaining: 1.62s
8:	learn: 0.6348453	total: 14.6ms	remaining: 1.61s
9:	learn: 0.6282148	total: 16.2ms	remaining: 1.61s
10:	learn: 0.6219786	total: 17.7ms	remaining: 1.59s
11:	learn: 0.6166707	total: 19.2ms	remaining: 1.58s
12:	learn: 0.6097597	total: 21.4ms	remaining: 1.63s
13:	learn: 0.6025725	total: 22.7ms	remaining: 1.6s
14:	learn: 0.5973711	total: 24.1ms	remaining: 1.58s
15:	learn: 0.5918434	total: 25.5ms	remaining: 1.57s
16:	learn: 0.5857558	total: 26.9ms	remaining: 1.56s
17:	learn: 0.5809961	total: 28.3ms	remaining: 1.55s
18:	learn: 0.5758968	total: 29.9ms	remaining: 1.54s

In [43]:
voting_pred = voting_model.predict(X = X_valid)

In [44]:
final_acc = accuracy_score(y_true=y_valid,y_pred=voting_pred)
final_acc

0.9863013698630136

In [45]:
from sklearn.ensemble import StackingClassifier

In [46]:
estimator2 = [
    ('XGBoost', XG_model),
    ('RandomForest', RF_model),
    ('CatBoost',CB_model),
    ('KNeighbors',kn_model)

]

In [47]:
stacked = StackingClassifier(estimators=estimator, final_estimator=LR_model)

In [48]:
stacked.fit(X=X_train, y=y_train)

Learning rate set to 0.006082
0:	learn: 0.6876074	total: 1.59ms	remaining: 1.59s
1:	learn: 0.6807894	total: 2.76ms	remaining: 1.38s
2:	learn: 0.6745673	total: 4.05ms	remaining: 1.35s
3:	learn: 0.6681918	total: 7.02ms	remaining: 1.75s
4:	learn: 0.6622061	total: 8.53ms	remaining: 1.7s
5:	learn: 0.6553732	total: 9.8ms	remaining: 1.62s
6:	learn: 0.6485648	total: 11.1ms	remaining: 1.57s
7:	learn: 0.6423371	total: 12ms	remaining: 1.49s
8:	learn: 0.6348453	total: 13.3ms	remaining: 1.46s
9:	learn: 0.6282148	total: 15.1ms	remaining: 1.5s
10:	learn: 0.6219786	total: 17.3ms	remaining: 1.55s
11:	learn: 0.6166707	total: 18.6ms	remaining: 1.53s
12:	learn: 0.6097597	total: 19.7ms	remaining: 1.5s
13:	learn: 0.6025725	total: 21.3ms	remaining: 1.5s
14:	learn: 0.5973711	total: 22.9ms	remaining: 1.51s
15:	learn: 0.5918434	total: 24.4ms	remaining: 1.5s
16:	learn: 0.5857558	total: 25.6ms	remaining: 1.48s
17:	learn: 0.5809961	total: 26.7ms	remaining: 1.46s
18:	learn: 0.5758968	total: 27.7ms	remaining: 1.43s


In [49]:
stacked_pred = stacked.predict(X = X_valid)

In [50]:
stacked_acc = accuracy_score(y_true=y_valid,y_pred=stacked_pred)
stacked_acc

1.0

In [70]:
print('LogisticRegression',acc_lr)
print('RandomForestClassifier',acc_rf)
print('XGBoost',acc_xg)
print('KNeighborsClassifier',acc_kn)
print('CatBoost',acc_cb)
print('VotingClassifier', final_acc)
print('StackClassifier', stacked_acc)

LogisticRegression 0.9315068493150684
RandomForestClassifier 0.9863013698630136
XGBoost 1.0
KNeighborsClassifier 0.9041095890410958
CatBoost 0.9863013698630136
VotingClassifier 0.9863013698630136
StackClassifier 1.0


**Without feature engineering**
    - LogisticRegression 0.9041095890410958
    - RandomForestClassifier 0.958904109589041
    - XGBoost 0.958904109589041
    - KNeighborsClassifier 0.9041095890410958
    - CatBoost 0.9452054794520548
    - VotingClassifier 0.9452054794520548

In [71]:
y_pred = stacked.predict(test_set)

In [65]:
df_submission = pd.read_csv('sample_submission.csv')

In [66]:
mapping = {1:'Positive',0:"Negative"}
prediction = pd.DataFrame(y_pred)

In [67]:
prediction = prediction.replace(mapping)

In [68]:
df_submission['class'] = prediction

In [69]:
df_submission
df_submission.to_csv('submission.csv',index=False)