In [89]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import precision_score,recall_score,f1_score,confusion_matrix,roc_auc_score

In [90]:
# Read the raw dataset from CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
df = pd.read_csv(filepath_or_buffer=r"D:\covid_prediction\data\covid.csv")

In [91]:
# Separate the target column from the features: `y` is the "Has_Covid"
# column, `x` is every other column of `df`.
y = df["Has_Covid"]
x = df.drop(columns="Has_Covid")

In [92]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set. The split is stratified on `y` and
# seeded so that re-running the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [93]:
# Load the preprocessing artifacts saved to disk by an earlier step.
# Below they are only applied (`transform` / `reindex`), never re-fitted.
# NOTE(review): joblib files are pickle-based, so only load them from a
# trusted location. The paths are also absolute and machine-specific.
oe = joblib.load(r"D:\covid_prediction\models\cough_encoder.pkl")          # used on the "Cough" column
le_gender = joblib.load(r"D:\covid_prediction\models\gender_encoder.pkl")  # used on the "Gender" column
le_target = joblib.load(r"D:\covid_prediction\models\target_encoder.pkl")  # used on the target labels
scaler = joblib.load(r"D:\covid_prediction\models\scaler.pkl")             # used on the numeric columns
city_columns = joblib.load(r"D:\covid_prediction\models\city_column.pkl")  # column list for the City dummies
feature_names = joblib.load(r"D:\covid_prediction\models\feature_names.pkl")  # final column list / order

In [94]:
# Work on copies so the raw splits stay untouched by the preprocessing below.
X_train_processed, X_test_processed = X_train.copy(), X_test.copy()

In [95]:
# Overwrite "Cough" with the output of the loaded encoder. The encoder is fed
# a one-column DataFrame (double brackets), not a Series.
X_train_processed["Cough"] = oe.transform(X_train[["Cough"]])
X_test_processed["Cough"] = oe.transform(X_test[["Cough"]])

In [96]:
# Overwrite "Gender" with the output of the loaded gender encoder, which is
# fed the raw column as a Series.
X_train_processed["Gender"] = le_gender.transform(X_train["Gender"])
X_test_processed["Gender"] = le_gender.transform(X_test["Gender"])

In [97]:
# Encode the train and test labels with the loaded target encoder.
y_train_encoded, y_test_encoded = (
    le_target.transform(labels) for labels in (y_train, y_test)
)

In [98]:
# One-hot encode "City" separately for each split. The resulting columns are
# named "City_<value>" and may differ between the two splits at this point.
X_train_city, X_test_city = (
    pd.get_dummies(split["City"], prefix="City") for split in (X_train, X_test)
)

In [99]:
# Align both dummy frames to the saved city column list. Columns absent from a
# split are added and filled with 0; columns not in the list are dropped.
X_train_city, X_test_city = (
    dummies.reindex(columns=city_columns, fill_value=0)
    for dummies in (X_train_city, X_test_city)
)

In [100]:
# The raw "City" column is replaced by its dummy columns, so remove it.
X_train_processed = X_train_processed.drop(columns="City")
X_test_processed = X_test_processed.drop(columns="City")

In [101]:
# Append the aligned City dummy columns to the right of the other features.
X_train_processed = pd.concat(
    [X_train_processed, X_train_city],
    axis="columns",
)
X_test_processed = pd.concat(
    [X_test_processed, X_test_city],
    axis="columns",
)

In [102]:
# Columns passed through the loaded scaler. It is only applied (`transform`)
# here, so both splits are scaled by the same scaler object.
num_cols = ["Age", "Fever"]

X_train_processed[num_cols] = scaler.transform(X_train_processed[num_cols])
X_test_processed[num_cols] = scaler.transform(X_test_processed[num_cols])

In [103]:
# Cast every column to float so both feature frames have one numeric dtype.
X_train_processed, X_test_processed = (
    frame.astype(float) for frame in (X_train_processed, X_test_processed)
)

In [104]:
# If a column label appears more than once, keep only its first occurrence.
X_train_processed = X_train_processed.loc[
    :, ~X_train_processed.columns.duplicated(keep="first")
]
X_test_processed = X_test_processed.loc[
    :, ~X_test_processed.columns.duplicated(keep="first")
]

In [105]:
# Force both frames into the saved feature list and column order.
# NOTE(review): any expected feature that is missing is silently created and
# filled with 0, so an upstream naming mismatch would not raise here.
X_train_processed, X_test_processed = (
    frame.reindex(columns=feature_names, fill_value=0)
    for frame in (X_train_processed, X_test_processed)
)

In [106]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with library defaults and a fixed seed.
lr = LogisticRegression(random_state=42)

In [107]:
# Fit the logistic regression on the processed training features and encoded
# labels. As the cell's last expression, the fitted estimator is displayed.
lr.fit(X=X_train_processed, y=y_train_encoded)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [108]:
# Predict class labels for the held-out test features.
y_pred_lr = lr.predict(X_test_processed)

In [109]:
# Precision, recall and F1 of the logistic-regression predictions, each
# computed against the encoded test labels.
prec_lr, rec_lr, f1_lr = (
    metric(y_test_encoded, y_pred_lr)
    for metric in (precision_score, recall_score, f1_score)
)

In [110]:
# Show (precision, recall, F1) for logistic regression.
(prec_lr, rec_lr, f1_lr)

(0.4722222222222222, 0.3541666666666667, 0.40476190476190477)

In [111]:
from sklearn.ensemble import RandomForestClassifier

In [112]:
# Random forest of 300 trees, each limited to depth 3. The seed is fixed, and
# n_jobs=-1 asks scikit-learn to use all available cores.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=3,
    n_jobs=-1,
    random_state=42,
)

In [113]:
# Fit the random forest on the processed training features and encoded
# labels. As the cell's last expression, the fitted estimator is displayed.
rf.fit(X=X_train_processed, y=y_train_encoded)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [114]:
# Predict class labels for the held-out test features.
y_pred_rf = rf.predict(X_test_processed)

In [115]:
# Precision, recall and F1 of the random-forest predictions, each computed
# against the encoded test labels.
prec_rf, rec_rf, f1_rf = (
    metric(y_test_encoded, y_pred_rf)
    for metric in (precision_score, recall_score, f1_score)
)

In [116]:
# Show (precision, recall, F1) for the random forest.
(prec_rf, rec_rf, f1_rf)

(0.48717948717948717, 0.3958333333333333, 0.4367816091954023)

In [117]:
from xgboost import XGBClassifier

In [189]:
# Gradient-boosted tree classifier.
# NOTE(review): the execution counter jumps from 117 to 189 at this cell, so
# these values were presumably adjusted over many re-runs. If they were chosen
# by looking at test-set scores, the reported metrics are optimistic; confirm.
xgb = XGBClassifier(
    # ensemble size and step size
    n_estimators=500,
    learning_rate=0.05,
    # individual tree complexity
    max_depth=4,
    min_child_weight=1,
    # row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # extra weight on the positive class (the other two models use none)
    scale_pos_weight=2.0,
    # evaluation metric, reproducibility, parallelism
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1,
)

In [190]:
# Fit the XGBoost classifier on the processed training features and encoded
# labels. As the cell's last expression, the fitted estimator is displayed.
xgb.fit(X=X_train_processed, y=y_train_encoded)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [191]:
# Predict class labels for the held-out test features.
y_pred_xgb = xgb.predict(X_test_processed)

In [192]:
# Precision, recall and F1 of the XGBoost predictions, each computed against
# the encoded test labels.
prec_xgb, rec_xgb, f1_xgb = (
    metric(y_test_encoded, y_pred_xgb)
    for metric in (precision_score, recall_score, f1_score)
)

In [193]:
# Show (precision, recall, F1) for XGBoost.
(prec_xgb, rec_xgb, f1_xgb)

(0.47540983606557374, 0.6041666666666666, 0.5321100917431193)

In [194]:
# Persist the fitted XGBoost model with joblib. The call returns the list of
# written file names, which the cell displays.
# NOTE(review): the destination is an absolute, machine-specific path.
model_path = r"D:\covid_prediction\models\xgb_model.pkl"
joblib.dump(value=xgb, filename=model_path)

['D:\\covid_prediction\\models\\xgb_model.pkl']