In [3]:
import pandas as pd
import numpy as np
import warnings
from sklearn.metrics import accuracy_score
import time
warnings.filterwarnings("ignore")

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Mapping


In [5]:
prognosis_dict={"Malaria":0,"Lyme_disease":1,
               "Plague":2,"Zika":3,"Yellow_Fever":4,
               "Dengue":5,"Chikungunya":6,"Rift_Valley_fever":7,
               "Tungiasis":8,"Japanese_encephalitis":9,
               "West_Nile_fever":10}
train["prognosis"] = train["prognosis"].map(prognosis_dict)

# Train test split

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [47]:
sc = StandardScaler()
y=train["prognosis"]
X=train.drop(["prognosis","id"],axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.2)
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [8]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
#Grid search 
def grid(model,params):
    grid = GridSearchCV(estimator=model,cv=5,refit=True,n_jobs=-1,param_grid=params,error_score='raise')
    grid.fit(X_train,y_train)
    return grid.best_params_
def randomized(model,params):
    randomized = RandomizedSearchCV(estimator=model, param_distributions=params, random_state=42, n_iter=200, cv=3, verbose=0, n_jobs=1, return_train_score=True)
    randomized.fit(X_train,y_train)
    return randomized.best_params_

In [9]:
def model(model):
    model.fit(X_train,y_train)
    pred = model.predict(X_test)
    return accuracy_score(y_test,pred)

# Best Model

**Logistic Reg**

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
LogisticRegression().get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [10]:
log_reg_params = {"penalty" : ['l2'],
                 "C":np.logspace(-4, 4, 20),
                 "fit_intercept":[True,False],
                 "max_iter":[2000]}

In [11]:
grid(LogisticRegression(),log_reg_params)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

{'C': 0.03359818286283781,
 'fit_intercept': True,
 'max_iter': 2000,
 'penalty': 'l2'}

In [21]:
lr = LogisticRegression(C= 0.03359818286283781,
                    fit_intercept=True,
                    max_iter= 2000,
                    penalty= 'l2')

In [22]:
lr_score = model(lr)

**SVM**

In [23]:
from sklearn.svm import SVC

In [15]:
SVC().get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [16]:
param_svc = {'C': [0.1,1, 10, 100,1000], 'gamma': [1000,100,10,1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid(SVC(),param_svc)

{'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}

In [28]:
svc = SVC(C=1,
   gamma=0.01,
   kernel="rbf")
svc_score=model(svc)

**KNN**

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [20]:
params_knn = { 'n_neighbors' : np.arange(1,50,1),
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid(KNeighborsClassifier(),params_knn)

{'metric': 'manhattan', 'n_neighbors': 19, 'weights': 'uniform'}

In [27]:
knn=KNeighborsClassifier(metric="manhattan",
                    n_neighbors=19,
                    weights="uniform")
knn_score=model(knn)

**XGBoost**

In [16]:
from xgboost import XGBClassifier

In [17]:
XGBClassifier().get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [18]:
xgb_params = {
    'learning_rate': [0.01, 0.1, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5]
}
randomized(XGBClassifier(),xgb_params)

{'subsample': 0.5,
 'n_estimators': 100,
 'max_depth': 3,
 'learning_rate': 0.01,
 'gamma': 0.1,
 'colsample_bytree': 0.5}

In [20]:
xgb = XGBClassifier(subsample=0.5,n_estimators=100,
             learning_rate=0.01,max_depth=3,
             gamma=0.1,colsample_bytree=0.5)
xgb_score = model(xgb)

**RandomForest**

In [13]:
from sklearn.ensemble import RandomForestClassifier

In [14]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [15]:
random_params={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':[1,3,5,7],
              'min_samples_leaf':np.arange(1,10,1),
              'min_samples_split':np.arange(1,10,1)}
randomized(RandomForestClassifier(random_state=1),random_params)

KeyboardInterrupt: 

In [29]:
rfc = RandomForestClassifier(n_estimators=200,
                      min_samples_split=3,
                      min_samples_leaf=5,
                      max_features=7,
                      max_depth=10,
                      random_state=11)
rfc_score = model(rfc)

**Decision Tree**

In [31]:
from sklearn.tree import DecisionTreeClassifier

In [32]:
DecisionTreeClassifier().get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [51]:
dtc_params= {"max_depth":[3,4,5,None,6,7,8,9,10],
            "max_features":np.arange(1,20,1),
            "min_samples_leaf":np.arange(1,30,1),
            "min_samples_split":np.arange(1,30,1)}
randomized(DecisionTreeClassifier(random_state=1),dtc_params)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


{'min_samples_split': 28,
 'min_samples_leaf': 4,
 'max_features': 6,
 'max_depth': None}

In [33]:
dtc_score = model(DecisionTreeClassifier(random_state=1))

**Naive Bayes**

In [35]:
from sklearn.naive_bayes import GaussianNB

In [36]:
nb_score = model(GaussianNB())

In [37]:
model_dict={"Logistic Regression":lr_score,
           "SVM":svc_score,
           "KNN":knn_score,
           "XGBoost":xgb_score,
           "Random Forest":rfc_score,
           "Decission Tree":dtc_score,
           "Naive Bayes":nb_score}

In [39]:
pd.DataFrame([model_dict])

Unnamed: 0,Logistic Regression,SVM,KNN,XGBoost,Random Forest,Decission Tree,Naive Bayes
0,0.288732,0.309859,0.239437,0.288732,0.316901,0.147887,0.15493


In [41]:
X_test_test = sc.fit_transform(test.drop(["id"],axis=1))

In [49]:
rfc_f = RandomForestClassifier(n_estimators=200,
                      min_samples_split=3,
                      min_samples_leaf=5,
                      max_features=7,
                      max_depth=10,
                      random_state=11)

In [55]:
all_pred = rfc_f.fit(X_train,y_train).predict(X_test_test)

In [57]:
sub_dict ={ "id":test["id"],"prognosis":all_pred }

In [59]:
sub_df = pd.DataFrame(sub_dict)

In [61]:
prognosis_dict_re={0:"Malaria",1:"Lyme_disease",
               2:"Plague",3:"Zika",4:"Yellow_Fever",
               5:"Dengue",6:"Chikungunya",7:"Rift_Valley_fever",
               8:"Tungiasis",9:"Japanese_encephalitis",
               10:"West_Nile_fever"}
sub_df["prognosis"] = sub_df["prognosis"].map(prognosis_dict_re)

In [64]:
sub_df.to_csv("submission.csv",index=False)