 ## Importing necessary libraries

In [74]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder 
from sklearn.ensemble import RandomForestRegressor,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report,confusion_matrix

## Loading our dataset

In [75]:
df=pd.read_csv("COVID_19.csv")

## Analysing our dataset

In [76]:
df.head()

Unnamed: 0,age,gender,Region1,Region2,detected_state,nationality,Travel_hist,Disease_hist,Symptom,Label
0,23,Female,Bebusarai,Bebusarai,Bihar,India,India,Null,Null,Negative
1,41,Female,Balasore,Balasore,Orissa,India,India,Diabetes,Null,Negative
2,21,Female,Erode,Erode,Tamil Nadu,India,India,Null,Fever,Negative
3,55,Male,Gurugram,Gurugram,Haryana,Italy,Italy,BP,Fever,Positive
4,41,Female,Bhilwara,Bhilwara,Rajasthan,India,India,BP,Dry Cough,Positive


In [77]:
df['age'].describe()

count    974.000000
mean      40.962012
std       13.438513
min        2.000000
25%       34.000000
50%       41.000000
75%       41.000000
max       96.000000
Name: age, dtype: float64

In [78]:
df.describe()

Unnamed: 0,age
count,974.0
mean,40.962012
std,13.438513
min,2.0
25%,34.0
50%,41.0
75%,41.0
max,96.0


In [79]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 974 entries, 0 to 973
Data columns (total 10 columns):
age               974 non-null int64
gender            974 non-null object
Region1           974 non-null object
Region2           974 non-null object
detected_state    974 non-null object
nationality       974 non-null object
Travel_hist       974 non-null object
Disease_hist      974 non-null object
Symptom           974 non-null object
Label             974 non-null object
dtypes: int64(1), object(9)
memory usage: 76.2+ KB


## Modifying our dataset according to our needs

In [80]:
for i in range(len(df)):
    if(df['age'][i]>0 and df['age'][i]<20):
        df['age'][i]='young'
    elif(df['age'][i]>=20 and df['age'][i]<50):  
        df['age'][i]='middle'
    else:
        df['age'][i]='old'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [81]:
for j in range(len(df)):
    if(df['Travel_hist'][j]=='India'):
        df['Travel_hist'][j]='twi' #twi=travelled within India
    else:
        df['Travel_hist'][j]='ta'  #ta=travelled abroad

In [82]:
df.head()

Unnamed: 0,age,gender,Region1,Region2,detected_state,nationality,Travel_hist,Disease_hist,Symptom,Label
0,middle,Female,Bebusarai,Bebusarai,Bihar,India,twi,Null,Null,Negative
1,middle,Female,Balasore,Balasore,Orissa,India,twi,Diabetes,Null,Negative
2,middle,Female,Erode,Erode,Tamil Nadu,India,twi,Null,Fever,Negative
3,old,Male,Gurugram,Gurugram,Haryana,Italy,ta,BP,Fever,Positive
4,middle,Female,Bhilwara,Bhilwara,Rajasthan,India,twi,BP,Dry Cough,Positive


## Cleaning our dataset

In [83]:
df.replace("Null",np.nan,inplace=True)
df=df.dropna()
df.head()


Unnamed: 0,age,gender,Region1,Region2,detected_state,nationality,Travel_hist,Disease_hist,Symptom,Label
3,old,Male,Gurugram,Gurugram,Haryana,Italy,ta,BP,Fever,Positive
4,middle,Female,Bhilwara,Bhilwara,Rajasthan,India,twi,BP,Dry Cough,Positive
7,middle,Female,Hyderabad,Hyderabad,Telangana,Indonesia,twi,BP,Dry Cough,Positive
8,middle,Female,Satna,Satna,Madhya Pradesh,India,twi,Diabetes,Sore Throat,Negative
10,middle,Male,Pune,Pune,Maharashtra,India,twi,BP,Fever,Positive


In [84]:
df.drop(df.columns[[2,3,4,5]],axis=1,inplace=True)
df.head()

Unnamed: 0,age,gender,Travel_hist,Disease_hist,Symptom,Label
3,old,Male,ta,BP,Fever,Positive
4,middle,Female,twi,BP,Dry Cough,Positive
7,middle,Female,twi,BP,Dry Cough,Positive
8,middle,Female,twi,Diabetes,Sore Throat,Negative
10,middle,Male,twi,BP,Fever,Positive


## Analysing our dataset after cleaning

In [85]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 3 to 970
Data columns (total 6 columns):
age             506 non-null object
gender          506 non-null object
Travel_hist     506 non-null object
Disease_hist    506 non-null object
Symptom         506 non-null object
Label           506 non-null object
dtypes: object(6)
memory usage: 27.7+ KB


In [86]:
df.describe()

Unnamed: 0,age,gender,Travel_hist,Disease_hist,Symptom,Label
count,506,506,506,506,506,506
unique,2,2,2,2,4,2
top,middle,Male,ta,BP,Dry Cough,Positive
freq,365,261,301,318,280,452


## Final Dataset

In [87]:
df.head()

Unnamed: 0,age,gender,Travel_hist,Disease_hist,Symptom,Label
3,old,Male,ta,BP,Fever,Positive
4,middle,Female,twi,BP,Dry Cough,Positive
7,middle,Female,twi,BP,Dry Cough,Positive
8,middle,Female,twi,Diabetes,Sore Throat,Negative
10,middle,Male,twi,BP,Fever,Positive


## Spliting the dataset into features and label

In [88]:
df=df.values
x=df[:,0:5]
x.shape

(506, 5)

In [89]:
y=df[:,5]

## Encoding our data

In [90]:
ohe=OneHotEncoder(sparse=False)
le=LabelEncoder()
x0=ohe.fit_transform(x[:,0].reshape(-1,1))
x1=ohe.fit_transform(x[:,1].reshape(-1,1))
x2=ohe.fit_transform(x[:,2].reshape(-1,1))
x3=ohe.fit_transform(x[:,3].reshape(-1,1))
x4=ohe.fit_transform(x[:,4].reshape(-1,1))
y=le.fit_transform(y.reshape(-1,1))

X=np.hstack((x0,x1,x2,x3,x4))

X.shape


  y = column_or_1d(y, warn=True)


(506, 12)

## Splitting the data into training and testing data

In [91]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

## Feeding the data into different machine learning algoritms

## Random Forest Regressor

In [92]:
rf = RandomForestRegressor(n_estimators = 150) 
rf.fit(X_train, y_train) 
rf.score(X_test,y_test)

0.878338344773593

## Logistic Regression

In [93]:
lr=LogisticRegression(penalty='l2',random_state=42)
lr.fit(X_train,y_train)
lr.score(X_test,y_test)



0.9640718562874252

## Linear Support Vector Machine

In [94]:
svml=SVC(kernel='linear')
svml.fit(X_train,y_train)
svml.score(X_test,y_test)

0.9520958083832335

## Non-Linear Support Vector Machine

In [95]:
svmnl=SVC(kernel='rbf')
svmnl.fit(X_train,y_train)
svmnl.score(X_test,y_test)



0.9880239520958084

## Decision Tree

In [96]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
dt.score(X_test,y_test)

0.9880239520958084

## K Neighbors Classifier

In [97]:
model=KNeighborsClassifier(n_neighbors=1)
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.9880239520958084

##  AdaBoost Classifier

In [98]:
ad = AdaBoostClassifier(n_estimators=100)
ad.fit(X_train,y_train)
ad.score(X_test,y_test)


0.9880239520958084

In [99]:
y_predict=ad.predict(X_test)

## We are getting maximum accuracies for algoritms like SVM , AdaBoost , Decision Tree and KNN.

## Classification Report for AdaBoost algorithm

In [100]:
report=classification_report(y_test,y_predict,target_names=['Positive','Negative'])
print(report)

              precision    recall  f1-score   support

    Positive       0.95      0.95      0.95        20
    Negative       0.99      0.99      0.99       147

   micro avg       0.99      0.99      0.99       167
   macro avg       0.97      0.97      0.97       167
weighted avg       0.99      0.99      0.99       167



## Confusion Matrix for AdaBoost algorithm

In [101]:
cf=confusion_matrix(y_test,y_predict,labels=[0,1])
print(cf)

[[ 19   1]
 [  1 146]]
