# Acute Inflammations Data Set
## Written by Abiola Obembe
### Date: 13-06-2020

In [96]:
# import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
print('libraries installed!')

libraries installed!


In [97]:
# import dataset
df = pd.read_csv('diagnosis_data.csv')
df.head()

Unnamed: 0,Temperature of patient,Occurrence of nausea,Lumbar pain,Urine pushing (continuous need for urination),Micturition pains,Burning of urethra. itch. swelling of urethra outlet,Inflammation of urinary bladder,Nephritis of renal pelvis origin
0,35.5,no,yes,no,no,no,no,no
1,35.9,no,no,yes,yes,yes,yes,no
2,35.9,no,yes,no,no,no,no,no
3,36.0,no,no,yes,yes,yes,yes,no
4,36.0,no,yes,no,no,no,no,no


## Data Cleaning and preprocessing

In [98]:
# Check for missing values

missing_values = df.isnull().sum()
print("The number of missing values in the dataframe is:",missing_values.sum())

The number of missing values in the dataframe is: 0


In [99]:
# List the datatypes or object on dataframe
df.dtypes

 Temperature of patient                                  float64
Occurrence of nausea                                      object
Lumbar pain                                               object
Urine pushing (continuous need for urination)             object
Micturition pains                                         object
 Burning of urethra. itch. swelling of urethra outlet     object
Inflammation of urinary bladder                           object
Nephritis of renal pelvis origin                          object
dtype: object

In [100]:
# Number of rows and columns
print("The number of rows in the raw dataframe is", df.shape[0], "and columns is", df.shape[1])

The number of rows in the raw dataframe is 120 and columns is 8


In [101]:
# Split dataframe to X and y variables
X = df.iloc[:,:-2].values
y = df.iloc[:,-2:].values

In [102]:
# Print first three rows of X
print(X[0:3])
print("The shape of X before encoding is:", np.shape(X))

[[35.5 'no' 'yes' 'no' 'no' 'no']
 [35.9 'no' 'no' 'yes' 'yes' 'yes']
 [35.9 'no' 'yes' 'no' 'no' 'no']]
The shape of X before encoding is: (120, 6)


In [103]:
# Print first three rows of y
print(y[0:3])
print("The shape of y before encoding is:", np.shape(y))

[['no' 'no']
 ['yes' 'no']
 ['no' 'no']]
The shape of y before encoding is: (120, 2)


In [104]:
# Encoding categorical data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,MultiLabelBinarizer,LabelEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [1,2,3,4,5])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X[0:3])
print("The new shape of X is:", np.shape(X))

[[1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 35.5]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 35.9]
 [1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 1.0 0.0 35.9]]
The new shape of X is: (120, 11)


In [105]:
# Encoding the target variables (Inflammation of urinary bladder and Nephritis of renal pelvis origin)
#lb  = LabelEncoder()
#y[:,0] = lb.fit_transform(y[:,0])
#y[:,1] = lb.fit_transform(y[:,1])
#print(y[0:3])
#print("The new shape of y is:", np.shape(y))
#lb.classes_

In [106]:
# Encoding the target variables (Inflammation of urinary bladder and Nephritis of renal pelvis origin)
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(y)
print(y[0:3])
print("The new shape of y is:", np.shape(y))
mlb.classes_

[[1 0]
 [1 1]
 [1 0]]
The new shape of y is: (120, 2)


array(['no', 'yes'], dtype=object)

In [107]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

print('train set shape:', X_train.shape, y_train.shape)
print('test set shape:', X_test.shape, y_test.shape)

train set shape: (96, 11) (96, 2)
test set shape: (24, 11) (24, 2)


In [108]:
# Scale the last column of X_train and X_test
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,-1:] = sc.fit_transform(X_train[:,-1:])
X_test[:,-1:] = sc.transform(X_test[:,-1:])


In [109]:
# print first three rows of scaled X_train 
print(X_train[0:3])


[[0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 1.3153554620019605]
 [1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 1.0 -1.1703556472930816]
 [1.0 0.0 1.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0 -0.5627373761320744]]


In [110]:
# print first three rows of scaled X_test
print(X_test[0:3])

[[1.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 -0.6179754007830722]
 [1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.039165338746956]
 [0.0 1.0 0.0 1.0 1.0 0.0 0.0 1.0 1.0 0.0 1.039165338746956]]


## Model training and testing

### Model 1: KNN algorthim

In [111]:
# Train with KNN 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
clf = MultiOutputClassifier(KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2))
clf.fit(X_train, y_train)


# Predicting the Test set results
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),2), y_test.reshape(len(y_test),2)),1))


[[1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [0 1 0 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


In [112]:
# Checking for accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)


1.0

#### Model 1 accuracy (KNN algorthim) is 100%

### Model 2: Logistic regression

In [113]:
# Training the Logistic Regression model on the Training set
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
clf = MultiOutputClassifier(LogisticRegression(random_state = 0))
clf.fit(X_train, y_train)

# Predicting the Test set results
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),2), y_test.reshape(len(y_test),2)),1))

[[1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [0 1 0 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


In [114]:
# Checking for accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

1.0

#### Model 2 accuracy for (Logistic Regression algorthim) is 100%

### Model 3 : Random Forrest Classifier

In [115]:
# Training the random classifier on the Training set
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, criterion='entropy', max_depth=None, random_state=0)
clf.fit(X_train, y_train)

# Predicting the Test set results
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),2), y_test.reshape(len(y_test),2)),1))

[[1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [0 1 0 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


In [116]:
# Checking for accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

1.0

#### Model 3 accuracy  (random forest classifier) is 100%

## Model 4: Naive-Bayes Classifier

In [117]:
# Training the Naive-Bayes classifier on the Training set
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import GaussianNB
clf = MultiOutputClassifier(GaussianNB())
clf.fit(X_train, y_train)

# Predicting the Test set results
y_pred = clf.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),2), y_test.reshape(len(y_test),2)),1))

[[1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [0 1 0 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 0 1 0]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]


In [118]:
# Checking for accuracy
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

1.0

### Model 4 accuracy (Naive- Bayes) is 100%