In [2]:
import pandas as pd

In [3]:
# Load the training split from the working directory and preview the first
# five rows (the default for `head`).
train_df = pd.read_csv(filepath_or_buffer='train_data.csv')
train_df.head()

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,Age_10-19,Age_20-24,Age_25-59,Age_60+,Gender_Male,Gender_Transgender,Country,Target
0,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,mild
1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,mild
2,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,moderate
3,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,moderate
4,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,moderate


In [4]:
# (rows, columns) of the training frame as loaded from the CSV.
train_df.shape

(253440, 17)

In [5]:
# Frequency of each label in the training target column.
train_df['Target'].value_counts()

severe      63455
mild        63395
moderate    63314
none        63276
Name: Target, dtype: int64

In [6]:
# Load the held-out test split and preview the first five rows
# (the default for `head`).
test_df = pd.read_csv(filepath_or_buffer='test_data.csv')
test_df.head()

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,Age_10-19,Age_20-24,Age_25-59,Age_60+,Gender_Male,Gender_Transgender,Country,Target
0,0,1,1,1,1,1,1,1,1,1,0,0,0,1,0,0.0,mild
1,0,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0.222222,none
2,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0.555556,severe
3,1,1,1,1,1,0,1,1,0,0,1,0,0,1,0,0.444444,severe
4,1,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0.222222,none


In [7]:
# Collapse the three severity labels into a single 'risk' class, leaving 'none'
# untouched, so the task becomes binary.
# Assign the result back to the column explicitly: calling `replace(inplace=True)`
# on `train_df.Target` is an inplace operation on a chained access, which warns in
# recent pandas and does not modify `train_df` once Copy-on-Write is enabled.
train_df['Target'] = train_df['Target'].replace(
    {'mild': 'risk', 'moderate': 'risk', 'severe': 'risk'}
)

In [8]:
# Apply the same severity -> 'risk' relabelling to the test frame.
# Assign the result back to the column explicitly: calling `replace(inplace=True)`
# on `test_df.Target` is an inplace operation on a chained access, which warns in
# recent pandas and does not modify `test_df` once Copy-on-Write is enabled.
test_df['Target'] = test_df['Target'].replace(
    {'mild': 'risk', 'moderate': 'risk', 'severe': 'risk'}
)

In [9]:
# Label frequencies in the training target, with missing values counted too.
train_df['Target'].value_counts(dropna=False)

risk    190164
none     63276
Name: Target, dtype: int64

In [10]:
# Label frequencies in the test target, with missing values counted too.
test_df['Target'].value_counts(dropna=False)

risk    47436
none    15924
Name: Target, dtype: int64

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both labels have the same number of rows.
# SMOTE is stochastic, so the seed is pinned (same value as the train/test split
# cell) to make Restart & Run All reproduce the same resampled data.
# NOTE(review): the feature matrix still contains the `Unnamed: 0` column shown in
# the `head()` output; it looks like a saved row index - confirm whether it should
# be used as a feature.
Xt, yt = SMOTE(random_state=101).fit_resample(
    train_df.drop(columns='Target'),
    train_df['Target'],
)

In [12]:
# Label frequencies of the target returned by the SMOTE resampling step.
yt.value_counts()

none    190164
risk    190164
Name: Target, dtype: int64

In [13]:
# Put the resampled features and target back side by side in a single frame,
# then preview the first two rows.
smote_data = pd.concat(objs=[Xt, yt], axis='columns')
smote_data.head(n=2)

Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,Age_10-19,Age_20-24,Age_25-59,Age_60+,Gender_Male,Gender_Transgender,Country,Target
0,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,risk
1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,0,0.0,risk


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression as LR
from sklearn.tree import DecisionTreeClassifier as DT
from sklearn.ensemble import RandomForestClassifier as RF
from xgboost import XGBClassifier as XGB
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.ensemble import GradientBoostingClassifier as GBC

In [15]:
# Hold out 20% of the SMOTE-resampled frame for validation; the seed is fixed so
# the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    smote_data.drop('Target', axis=1),
    smote_data['Target'],
    random_state=101,
    test_size=0.2,
)

In [16]:
# Logistic regression with default hyper-parameters, scored on the hold-out
# slice of the resampled data.
model_lr = LR()
model_lr.fit(X_train, y_train)
predictions = model_lr.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[17081 21114]
 [16951 20920]]
              precision    recall  f1-score   support

        none       0.50      0.45      0.47     38195
        risk       0.50      0.55      0.52     37871

    accuracy                           0.50     76066
   macro avg       0.50      0.50      0.50     76066
weighted avg       0.50      0.50      0.50     76066



In [17]:
# Gaussian naive Bayes with default settings, scored on the hold-out slice of
# the resampled data.
model_nb = GNB()
model_nb.fit(X_train, y_train)
predictions = model_nb.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[18501 19694]
 [18237 19634]]
              precision    recall  f1-score   support

        none       0.50      0.48      0.49     38195
        risk       0.50      0.52      0.51     37871

    accuracy                           0.50     76066
   macro avg       0.50      0.50      0.50     76066
weighted avg       0.50      0.50      0.50     76066



In [18]:
# Decision tree with default settings (no random_state is passed), scored on the
# hold-out slice of the resampled data.
model_dt = DT()
model_dt.fit(X_train, y_train)
predictions = model_dt.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[20864 17331]
 [ 9096 28775]]
              precision    recall  f1-score   support

        none       0.70      0.55      0.61     38195
        risk       0.62      0.76      0.69     37871

    accuracy                           0.65     76066
   macro avg       0.66      0.65      0.65     76066
weighted avg       0.66      0.65      0.65     76066



In [19]:
# Random forest with default settings (no random_state is passed), scored on the
# hold-out slice of the resampled data.
model_rf = RF()
model_rf.fit(X_train, y_train)
predictions = model_rf.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[17415 20780]
 [ 7177 30694]]
              precision    recall  f1-score   support

        none       0.71      0.46      0.55     38195
        risk       0.60      0.81      0.69     37871

    accuracy                           0.63     76066
   macro avg       0.65      0.63      0.62     76066
weighted avg       0.65      0.63      0.62     76066



In [20]:
# XGBoost classifier with default settings, scored on the hold-out slice of the
# resampled data.
# NOTE(review): y_train holds the string labels 'none'/'risk'; newer xgboost
# releases may require integer-encoded labels - confirm against the installed
# version.
model_xgb = XGB()
model_xgb.fit(X_train, y_train)
predictions = model_xgb.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))



[[17170 21025]
 [    9 37862]]
              precision    recall  f1-score   support

        none       1.00      0.45      0.62     38195
        risk       0.64      1.00      0.78     37871

    accuracy                           0.72     76066
   macro avg       0.82      0.72      0.70     76066
weighted avg       0.82      0.72      0.70     76066



In [21]:
# Gradient boosting with default settings, scored on the hold-out slice of the
# resampled data.
model_gbc = GBC()
model_gbc.fit(X_train, y_train)
predictions = model_gbc.predict(X_test)
print(confusion_matrix(y_true=y_test, y_pred=predictions))
print(classification_report(y_true=y_test, y_pred=predictions))

[[17166 21029]
 [    0 37871]]
              precision    recall  f1-score   support

        none       1.00      0.45      0.62     38195
        risk       0.64      1.00      0.78     37871

    accuracy                           0.72     76066
   macro avg       0.82      0.72      0.70     76066
weighted avg       0.82      0.72      0.70     76066



### Applying the models to the actual test dataset

In [28]:
# Score the logistic-regression model on the test frame (not resampled).
pred = model_lr.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.63663424 0.31867683]


array([[26307, 21129],
       [ 8901,  7023]], dtype=int64)

In [29]:
# Score the naive-Bayes model on the test frame (not resampled).
pred = model_nb.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.61557374 0.32970316]


array([[24787, 22649],
       [ 8310,  7614]], dtype=int64)

In [30]:
# Score the decision-tree model on the test frame (not resampled).
pred = model_dt.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.72623932 0.03554984]


array([[35841, 11595],
       [15426,   498]], dtype=int64)

In [31]:
# Score the random-forest model on the test frame (not resampled).
pred = model_rf.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.75502183 0.02576388]


array([[38226,  9210],
       [15596,   328]], dtype=int64)

In [32]:
# Score the XGBoost model on the test frame (not resampled).
pred = model_xgb.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.85608023 0.        ]


array([[47417,    19],
       [15924,     0]], dtype=int64)

In [33]:
# Score the gradient-boosting model on the test frame (not resampled).
pred = model_gbc.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.8562764 0.       ]


array([[47436,     0],
       [15924,     0]], dtype=int64)

In [36]:
# List the hyper-parameters of the fitted logistic regression.
model_lr.get_params(deep=True)

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [46]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over regularisation strength (C) and iteration budget for a
# multinomial, L2-penalised logistic regression. No `cv` or `scoring` argument is
# passed, so GridSearchCV's defaults apply.
grid_s_cv = GridSearchCV(
    estimator=LR(),
    param_grid={
        'C': [1, 10, 0.0001, 0.00001],
        'multi_class': ['multinomial'],
        'penalty': ['l2'],
        'max_iter': [20, 50, 80, 100, 120],
    },
)
grid_s_cv = grid_s_cv.fit(X_train, y_train)

In [47]:
# Tabulate the searched parameter combinations next to their mean CV score.
pd.DataFrame(grid_s_cv.cv_results_).loc[
    :,
    ['param_C', 'param_multi_class', 'param_penalty', 'param_max_iter', 'mean_test_score'],
]

Unnamed: 0,param_C,param_multi_class,param_penalty,param_max_iter,mean_test_score
0,1.0,multinomial,l2,20,0.49881
1,1.0,multinomial,l2,50,0.49881
2,1.0,multinomial,l2,80,0.49881
3,1.0,multinomial,l2,100,0.49881
4,1.0,multinomial,l2,120,0.49881
5,10.0,multinomial,l2,20,0.498823
6,10.0,multinomial,l2,50,0.498823
7,10.0,multinomial,l2,80,0.498823
8,10.0,multinomial,l2,100,0.498823
9,10.0,multinomial,l2,120,0.498823


In [48]:
# Parameter combination selected by the grid search.
grid_s_cv.best_params_

{'C': 1e-05, 'max_iter': 20, 'multi_class': 'multinomial', 'penalty': 'l2'}

In [50]:
# Refit logistic regression with hard-coded hyper-parameters (the values shown
# as the grid search's best parameters).
model_final = LR(
    penalty='l2',
    multi_class='multinomial',
    max_iter=20,
    C=1e-05,
)
model_final = model_final.fit(X_train, y_train)

In [53]:
# Confusion matrix of the tuned model on the hold-out slice of the resampled
# data; rows and columns are ordered ['risk', 'none'].
pred = model_final.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=pred, labels=['risk','none'])

array([[28175,  9696],
       [28475,  9720]], dtype=int64)

In [54]:
# Score the tuned logistic regression on the test frame (not resampled).
pred = model_final.predict(test_df.drop('Target', axis=1))

# Per-class F1 in the order ['risk', 'none']; the confusion matrix rows and
# columns follow the same label order.
print(f1_score(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'], average=None))
confusion_matrix(y_true=test_df['Target'], y_pred=pred, labels=['risk','none'])

[0.74824402 0.247237  ]


array([[35527, 11909],
       [11998,  3926]], dtype=int64)