In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score

In [2]:
## simple svm
## checked the highest train accuracy through a graph and then noted the highest value of k 
## using the values of k and we select k best features and stores that in x_new 
## we have used GridSearchCV
## after estimating all the parameters we have finally used the model to do the prognosis
## accuracy and prediction 

In [3]:
##loading the db
data = pd.read_csv("Training.csv")

In [4]:
data.head()

Unnamed: 0,itching,skin_rash,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,muscle_wasting,vomiting,...,rusty_sputum,receiving_blood_transfusion,receiving_unsterile_injections,coma,stomach_bleeding,distention_of_abdomen,blood_in_sputum,palpitations,painful_walking,prognosis
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [5]:
#normalizing the x variables 
x_norm = data.drop(["prognosis"],axis=1)


In [6]:
x = preprocessing.normalize(x_norm)

In [7]:
y = data["prognosis"].values

In [8]:
### Training 
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2,random_state=1)

In [9]:
##classification ################################################################################

In [10]:
## without using grim 
from sklearn.metrics import confusion_matrix, accuracy_score
k = [1,10,5]

for i in k:
    svm=SVC(C=i,kernel = 'linear',random_state=0)
    svm.fit(x_train,y_train)
    y_pred = svm.predict(x_test)
    a=accuracy_score(y_test,y_pred)
    m = confusion_matrix(y_test, y_pred)
    print("i=",i)
    print("accuracy=",a)
    print("confusion matrix=",m)
    print("=================================================================")

i= 1
accuracy= 0.9939024390243902
confusion matrix= [[33  0  0 ...  0  0  0]
 [ 0 23  0 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]
 ...
 [ 0  0  0 ... 22  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 34]]
i= 10
accuracy= 0.9939024390243902
confusion matrix= [[33  0  0 ...  0  0  0]
 [ 0 23  0 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]
 ...
 [ 0  0  0 ... 22  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 34]]
i= 5
accuracy= 0.9939024390243902
confusion matrix= [[33  0  0 ...  0  0  0]
 [ 0 23  0 ...  0  0  0]
 [ 0  0 23 ...  0  0  0]
 ...
 [ 0  0  0 ... 22  0  0]
 [ 0  0  0 ...  0 23  0]
 [ 0  0  0 ...  0  0 34]]


In [11]:
##feature selection using annova computation
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import matplotlib.pyplot as plt

accuracy_list_train = []
k=np.arange(1,131,1)
for each in k:
    x_new = SelectKBest(f_classif, k=each).fit_transform(x_train, y_train)
    svm.fit(x_new,y_train)
    accuracy_list_train.append(svm.score(x_new,y_train))   
    
plt.plot(k,accuracy_list_train,color="green",label="train")
plt.xlabel("k values")
plt.ylabel("train accuracy")
plt.legend()
plt.show()

ValueError: k should be >=0, <= n_features = 100; got 101. Use k='all' to return all features.

In [None]:
### we can observe that with the inc in the value of the k the train accuracy also increases.

In [12]:
## sample 
d = {'best features number': k, 'train_score': accuracy_list_train}
df = pd.DataFrame(data=d)
print("max accuracy:",df["train_score"].max())
print("max accuracy id:",df["train_score"].idxmax())
print(" max accuracy values: \n", df.iloc[97])

ValueError: arrays must all be same length

In [13]:
## selecting the features  using best feature number 
selector = SelectKBest(f_classif, k = 98)
x_new = selector.fit_transform(x_train, y_train)
x_new_test=selector.fit_transform(x_test,y_test)


In [14]:
#GridSearchCV implements a “fit” and a “score” method. It also implements “score_samples”, “predict”, “predict_proba”, “decision_function”, “transform” and “inverse_transform” if they are implemented in the estimator used.
#C is a hypermeter which is set before the training model and used to control error and Gamma is also a hypermeter which is set before the training model and used to give curvature weight of the decision boundary.

In [14]:
from sklearn.model_selection import GridSearchCV

C=[1,5,10,15,20,25]
kernel=["linear","rbf"]
gamma=["auto",0.01,0.001,0.0001,1]
decision_function_shape=["ovo","ovr"]

In [15]:
svm_new=SVC(random_state=1)
grid_svm=GridSearchCV(estimator=svm_new,cv=5,param_grid=dict(kernel=kernel,C=C, gamma=gamma, decision_function_shape=decision_function_shape))
grid_svm.fit(x_new,y_train)
print("best score: ", grid_svm.best_score_)
print("best param: ", grid_svm.best_params_)

best score:  0.9883126181154418
best param:  {'C': 1, 'decision_function_shape': 'ovo', 'gamma': 'auto', 'kernel': 'linear'}


In [67]:
###############building model

In [16]:
svm_model=SVC(C=1,decision_function_shape="ovo",gamma="auto",kernel="linear",random_state=1)
svm_model.fit(x_new,y_train)

SVC(C=1, decision_function_shape='ovo', gamma='auto', kernel='linear',
    random_state=1)

In [17]:
print("train_accuracy:",svm_model.score(x_new,y_train))
print("test_accuracy: ", svm_model.score(x_new_test,y_test))

train_accuracy: 0.9883130081300813
test_accuracy:  0.9034552845528455


In [111]:
## prediction ########
y_pred=svm_model.predict(x_new_test)
y_pred

array(['Malaria', 'Chicken pox', 'Bronchial Asthma',
       'Chronic cholestasis', 'Fungal infection', 'Typhoid',
       'Tuberculosis', 'Fungal infection', 'Gastroenteritis',
       'Fungal infection', 'Fungal infection', 'Fungal infection',
       'Fungal infection', 'Fungal infection', 'Hypoglycemia',
       'Fungal infection', 'Gastroenteritis', 'Fungal infection',
       'Fungal infection', 'Gastroenteritis', 'Drug Reaction',
       'Gastroenteritis', 'Chicken pox', 'Cervical spondylosis',
       'Fungal infection', 'Typhoid', 'Malaria', 'Fungal infection',
       'Varicose veins', 'Gastroenteritis', 'Malaria', 'Fungal infection',
       'Malaria', 'Cervical spondylosis', 'Fungal infection',
       'Tuberculosis', 'Fungal infection', 'Allergy',
       'Cervical spondylosis', 'Fungal infection', 'Allergy',
       'Gastroenteritis', 'Varicose veins', 'Fungal infection', 'Dengue',
       'Cervical spondylosis', 'Fungal infection', 'Gastroenteritis',
       'Fungal infection', 'Fungal

In [112]:
##################
############
#################

###fetching the names of the columns  