## Data import & explanation

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate
from sklearn.metrics import  f1_score, make_scorer
import seaborn as sns

In [None]:
# Load the raw patient records; the relative path means the CSV must sit in the notebook's working directory.
df = pd.read_csv('Covid Data.csv')
df

## Data explanation:

    usmr: Indicates whether the patient was treated in a medical unit of the first, second or third level.
    medical unit: type of institution of the National Health System that provided the care.
    sex: female or male
    date_died: date on which the patient died; the placeholder 9999-99-99 means the patient did not die.
    patient type: hospitalized or not hospitalized.
    intubed: whether the patient was connected to the ventilator.
    pneumonia: whether the patient already has air sac inflammation (pneumonia) or not.
    age: of the patient.
    pregnancy: whether the patient is pregnant or not.
    diabetes: whether the patient has diabetes or not.
    copd: Indicates whether the patient has Chronic obstructive pulmonary disease or not.
    asthma: whether the patient has asthma or not.
    inmsupr: whether the patient is immunosuppressed or not.
    hypertension: whether the patient has hypertension or not.
    other disease: whether the patient has other disease or not.
    cardiovascular: whether the patient has heart or blood vessels related disease.
    obesity: whether the patient is obese or not.
    renal chronic: whether the patient has chronic renal disease or not.
    tobacco: whether the patient is a tobacco user.
    classification: covid test findings. Values 1-3 mean that the patient was diagnosed with covid in different degrees. 4 or higher means that the patient is not a carrier of covid or that the test is inconclusive.
    icu: Indicates whether the patient had been admitted to an Intensive Care Unit.
    
##### The dataset was provided by the Mexican government. It contains a large amount of anonymized patient-related information, including pre-existing conditions. In the Boolean features, 1 means "yes" and 2 means "no". Values such as 97, 98 and 99 denote missing data.

In [None]:
# Summarize each column's dtype and non-null count.
df.info()
# All columns are correctly identified as int except the Date_died column, which is an object (string) column.

In [None]:
df.describe()

In [None]:
df.describe(include= 'O')

In [None]:
# Show every row that is an exact repeat of an earlier row.
df[df.duplicated()]

In [None]:
# Duplicates check: count the rows that exactly repeat an earlier row.
df.duplicated().sum()
# The duplicates were preserved as they were statistically relevant.

In [None]:
df.isna().sum()

In [None]:
df.isnull().sum()

In [None]:
# Draw one interactive histogram per column for a first look at each distribution.
for column_name in df.columns:
    px.histogram(df, x=column_name).show()

## Data Pre-processing

In [None]:
# Normalize the column headers to lower case so later cells can use one spelling.
df.columns = df.columns.str.lower()
df.columns

In [None]:
# date_died is of no benefit as a date because there are no other time variables,
# so it is reduced to a survival flag: died = 1 for death, 2 for alive.
# The placeholder date "9999-99-99" marks patients who did not die.
df['died'] = np.where(df.date_died == "9999-99-99", 2, 1)
df

In [None]:
# The date column has been replaced by the 'died' flag, so remove it.
df.drop(columns=['date_died'], inplace=True)

In [None]:
df

In [None]:
# Men cannot be pregnant: rows with sex == 2 (treated as male here) get pregnant = 2 ("no").
is_male = df['sex'] == 2
df.loc[is_male, 'pregnant'] = 2

In [None]:
df[df['pregnant'] == 98]

In [None]:
# Drop the remaining rows whose pregnancy status is unknown (code 98).
unknown_pregnancy = df.index[df['pregnant'] == 98]
df.drop(index=unknown_pregnancy, inplace=True)

In [None]:
df

In [None]:
df[(df['patient_type'] == 2) & (df['icu'] == 1)]

In [None]:
# To preserve as much data as possible, patients with patient_type == 1 are assumed never to have
# been admitted to the ICU or intubated: both flags are set to 2 ("no"), overwriting whatever value
# they held (presumably a missing-data code).
# NOTE(review): this assumes patient_type == 1 means "not hospitalized" — confirm against the data dictionary.
df.loc[df.patient_type == 1, 'icu'] = 2
df.loc[df.patient_type == 1, 'intubed'] = 2

In [None]:
df[(df['intubed'] == 99) & (df['icu'] == 99)]

In [None]:
df.columns

In [None]:
# Remove the remaining missing values: keep only the rows in which every yes/no
# column holds a valid answer (1 or 2); any other code counts as missing.
binary_columns = [
    'intubed', 'pneumonia', 'pregnant', 'diabetes', 'copd', 'asthma', 'inmsupr',
    'hipertension', 'other_disease', 'cardiovascular', 'obesity',
    'renal_chronic', 'tobacco', 'icu',
]
has_valid_answers = df[binary_columns].isin([1, 2]).all(axis=1)
df = df[has_valid_answers]

In [None]:
df.shape

In [None]:
# Print the frequency of every distinct value, column by column.
for column_name in df.columns:
    counts = df[column_name].value_counts()
    print(f'Column: {column_name}\nValue Counts:\n{counts}\n')


## Data Analysis

In [None]:
df.columns

#### Smoking impact on classification

In [None]:
# Tobacco users (tobacco == 1) counted per final classification, largest group first.
ana1 = (
    df[df['tobacco'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='tobacco')
)
px.bar(ana1, x='clasiffication_final', y='tobacco')

In [None]:
# Non-users of tobacco (tobacco == 2) counted per final classification, largest group first.
ana1 = (
    df[df['tobacco'] == 2]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='tobacco')
)
px.bar(ana1, x='clasiffication_final', y='tobacco')

### Pneumonia as a symptom of covid

In [None]:
# Patients with pneumonia (pneumonia == 1) counted per final classification, largest group first.
ana1 = (
    df[df['pneumonia'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='pneumonia')
)
px.bar(ana1, x='clasiffication_final', y='pneumonia')

### Impact of obesity on covid classification

In [None]:
# Obese patients (obesity == 1) counted per final classification, largest group first.
ana1 = (
    df[df['obesity'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='obesity')
)
px.bar(ana1, x='clasiffication_final', y='obesity')

In [None]:
# Non-obese patients (obesity == 2) counted per final classification, largest group first.
ana1 = (
    df[df['obesity'] == 2]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='obesity')
)
px.bar(ana1, x='clasiffication_final', y='obesity')

### Impact of COPD on covid classification

In [None]:
# COPD patients (copd == 1) counted per final classification, largest group first.
ana1 = (
    df[df['copd'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='copd')
)
px.bar(ana1, x='clasiffication_final', y='copd')

### Impact of ICU on covid classification

In [None]:
# ICU patients (icu == 1) counted per final classification, largest group first.
ana1 = (
    df[df['icu'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='icu')
)
px.bar(ana1, x='clasiffication_final', y='icu')

### Number of patients (patient_type == 2) admitted to the ICU

In [None]:
# Among patients with patient_type == 2, count how many fall under each icu value.
ana1 = (
    df[df['patient_type'] == 2]
    .groupby('icu')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='patient_type')
)
px.bar(ana1, x='icu', y='patient_type')

### Impact of cardiovascular on covid classification

In [None]:
# Patients with cardiovascular disease (cardiovascular == 1) counted per final classification.
ana1 = (
    df[df['cardiovascular'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='cardiovascular')
)
px.bar(ana1, x='clasiffication_final', y='cardiovascular')

### Impact of asthma on covid classification

In [None]:
# Asthma patients (asthma == 1) counted per final classification, largest group first.
ana1 = (
    df[df['asthma'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='asthma')
)
px.bar(ana1, x='clasiffication_final', y='asthma')

### Impact of Immuno suppressed on covid classification

In [None]:
# Immunosuppressed patients (inmsupr == 1) counted per final classification, largest group first.
ana1 = (
    df[df['inmsupr'] == 1]
    .groupby('clasiffication_final')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='inmsupr')
)
px.bar(ana1, x='clasiffication_final', y='inmsupr')

### Number of asthma patients by ICU admission status

In [None]:
# Asthma patients (asthma == 1) counted per icu value, largest group first.
ana1 = (
    df[df['asthma'] == 1]
    .groupby('icu')
    .size()
    .sort_values(ascending=False)
    .reset_index(name='asthma')
)
px.bar(ana1, x='icu', y='asthma')

### ICU patients with other parameters (Parameter set to 1)

In [None]:
# For testing and backup. A plain assignment would only create a second name for the
# same object, so later in-place edits to df would also change the "backup"; copy instead.
df2 = df.copy()

In [None]:
# For every yes/no column, count how many ICU patients (icu == 1) have the value 1.
# 'age' is deliberately not listed: it is a number, not a yes/no flag, so comparing it
# with 1 would only count patients whose age is exactly 1.
# 'died' is intentionally left off the chart (unchanged behaviour).
# NOTE(review): for 'sex' the bar counts rows coded 1, which is a category rather than a condition.
conditions = ['sex', 'intubed', 'pneumonia',
              'pregnant', 'diabetes', 'copd', 'asthma', 'inmsupr',
              'hipertension', 'other_disease', 'cardiovascular', 'obesity',
              'renal_chronic', 'tobacco']
colors = ['#FF0000', '#00FF00', '#0000FF', '#FFFF00', '#FF00FF', '#00FFFF', '#800000', '#008000', '#000080', '#808000', '#800080', '#008080', '#400000', '#004000', '#000040']

# One vectorized pass: restrict to ICU rows, then count the 1s in each condition column.
icu_patients = df[df['icu'] == 1]
icu_condition_counts = (icu_patients[conditions] == 1).sum()

fig, ax = plt.subplots(figsize=(10, 6))
# Cycle through the palette so the cell keeps working if more conditions than colours are listed.
bar_colors = [colors[i % len(colors)] for i in range(len(conditions))]
ax.bar(icu_condition_counts.index, icu_condition_counts.values, color=bar_colors)

ax.set_title('Frequency of Conditions in ICU Groups')
ax.set_xlabel('Condition')
ax.set_ylabel('Frequency')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

## Data preprocessing

In [None]:
# Simplify the target: instead of ordinal values 1-7, classifications 1-3 are treated as
# high risk of covid-19 infection (1) and all other values as low risk (0).
is_high_risk = df['clasiffication_final'].between(1, 3)
df['high_risk'] = np.where(is_high_risk, 1, 0)
df

In [None]:
# The original classification is now encoded in 'high_risk', so remove it.
df.drop(columns=['clasiffication_final'], inplace=True)

In [None]:
df

In [None]:
# Recode "no" from 2 to 0 in every yes/no column, so the flags become 0/1.
paramaters = ['sex', 'intubed', 'pneumonia', 'patient_type',
              'pregnant', 'diabetes', 'copd', 'asthma', 'inmsupr',
              'hipertension', 'other_disease', 'cardiovascular', 'obesity',
              'renal_chronic', 'tobacco', 'icu', 'died']
df[paramaters] = df[paramaters].replace(2, 0)


In [None]:
df

In [None]:
# Separate the feature matrix from the target column.
target_column = 'high_risk'
x = df.drop(columns=target_column)
y = df[target_column]

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio equal in both splits.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=28,
    stratify=y,
)
# Class shares of the training target, in percent.
y_train.value_counts(normalize=True).mul(100)

In [None]:
y_test.value_counts(normalize= True) * 100

In [None]:
y_train

In [None]:
# Standardize the only continuous feature. The scaler is fitted on the training split
# alone and then applied to both splits, so no test statistics leak into training.
scaling_cols = ['age']
sc = StandardScaler().fit(x_train[scaling_cols])

x_train[scaling_cols] = sc.transform(x_train[scaling_cols])
x_test[scaling_cols] = sc.transform(x_test[scaling_cols])

In [None]:
x_train

In [None]:
# One-hot encode medical_unit, dropping the first category to avoid a redundant column.
# handle_unknown='ignore' keeps transform() from raising if the test split contains a
# medical_unit value that never occurred in the training split; such rows are encoded
# as all zeros instead.
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# The categories are learned from the training split only and reused for the test split.
x_train_ohe = ohe.fit_transform(x_train[['medical_unit']])

x_test_ohe = ohe.transform(x_test[['medical_unit']])

In [None]:
# Wrap the encoded arrays in DataFrames labelled with the encoder's generated column names.
ohe_columns = ohe.get_feature_names_out()
x_train_ohe = pd.DataFrame(x_train_ohe, columns=ohe_columns)
x_test_ohe = pd.DataFrame(x_test_ohe, columns=ohe_columns)

In [None]:
# Give both splits a fresh 0..n-1 index so they line up row by row with the
# encoded frames, which carry a default index.
x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)


In [None]:
# Replace the raw medical_unit column with its one-hot encoded columns in both splits.
x_train = pd.concat([x_train.drop(columns='medical_unit'), x_train_ohe], axis=1)
x_test = pd.concat([x_test.drop(columns='medical_unit'), x_test_ohe], axis=1)

## Machine Learning

In [None]:
# Baseline logistic regression. 'saga' is a stochastic solver, so the seed is fixed
# (same value as the train/test split) to make the fit reproducible across runs.
lr = LogisticRegression(max_iter=1000, solver='saga', random_state=28)

lr.fit(x_train, y_train)

# Report precision/recall/F1 on both splits to expose over- or under-fitting.
print('Training Score : \n', classification_report(y_train, lr.predict(x_train)))

print('Test Score : \n', classification_report(y_test, lr.predict(x_test)))

In [None]:
# k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)

# Report precision/recall/F1 on both splits.
train_report = classification_report(y_train, knn.predict(x_train))
print('Training Score : \n', train_report)

test_report = classification_report(y_test, knn.predict(x_test))
print('Test Score : \n', test_report)

In [None]:
# Shallow random forest. Bootstrapping and feature sampling are random, so the seed is
# fixed (same value as the train/test split) to make the reported scores reproducible.
rf = RandomForestClassifier(max_depth=6, random_state=28)

rf.fit(x_train, y_train)

# Report precision/recall/F1 on both splits to expose over- or under-fitting.
print('Training Score : \n', classification_report(y_train, rf.predict(x_train)))

print('Test Score : \n', classification_report(y_test, rf.predict(x_test)))

In [None]:
y_test

In [None]:
# Gradient-boosted trees with default settings.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# Report precision/recall/F1 on both splits.
train_report = classification_report(y_train, xgb.predict(x_train))
print('Training Score : \n', train_report)

test_report = classification_report(y_test, xgb.predict(x_test))
print('Test Score : \n', test_report)

In [None]:
# f1_score and make_scorer are already imported in the notebook's import cell.
# Macro averaging takes the unweighted mean of the per-class F1 scores.
custom_scorer = make_scorer(f1_score, average='macro')
# 10-fold cross-validation of a default XGBClassifier on the training split,
# keeping the train-fold scores as well so the two can be compared.
cv = cross_validate(
    XGBClassifier(),
    x_train,
    y_train,
    scoring={'f1_macro': custom_scorer},
    cv=10,
    return_train_score=True,
)

In [None]:
cv.items()

In [None]:
# Per-fold macro-F1 in percent: validation folds first, then training folds.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'] * 100)

In [None]:
# Mean macro-F1 across folds in percent: validation first, then training.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'].mean() * 100)

#### This low score may be attributable to class imbalance, which I will attempt to address using SMOTE from the imbalanced-learn library

In [None]:
# Oversample the minority class of the training split only; the test split is left as is.
# A fixed seed (same value as the train/test split) makes the synthetic samples reproducible.
# NOTE(review): x_train/y_train are overwritten here, so any later cross-validation on them
# puts synthetic points into validation folds alongside the real rows they were interpolated
# from, which inflates CV scores. Resampling inside each fold (e.g. imblearn.pipeline.Pipeline)
# avoids this.
smote = SMOTE(random_state=28)

x_train, y_train = smote.fit_resample(x_train, y_train)

In [None]:
x_train

In [None]:
y_train

In [None]:
# Refit the default gradient-boosted model on the current training data.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)

# Report precision/recall/F1 on both splits.
train_report = classification_report(y_train, xgb.predict(x_train))
print('Training Score : \n', train_report)

test_report = classification_report(y_test, xgb.predict(x_test))
print('Test Score : \n', test_report)

In [None]:
# 10-fold cross-validation of a default XGBClassifier, scored with macro-averaged F1;
# train-fold scores are kept as well.
# NOTE(review): if x_train/y_train have already been SMOTE-resampled when this cell runs,
# the validation folds contain synthetic samples interpolated from training-fold rows, so
# these scores are likely optimistic compared with the untouched test split.
custom_scorer = make_scorer(f1_score, average = 'macro')
cv = cross_validate(XGBClassifier(), x_train, y_train,  scoring={'f1_macro': custom_scorer}, cv= 10, return_train_score= True)

In [None]:
# Mean macro-F1 across folds in percent: validation first, then training.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'].mean() * 100)

In [None]:
# 10-fold cross-validation of a shallow random forest, scored with macro-averaged F1.
# The forest is seeded (same value as the train/test split) so repeated runs give the same scores.
cv = cross_validate(
    RandomForestClassifier(max_depth=6, random_state=28),
    x_train,
    y_train,
    scoring={'f1_macro': custom_scorer},
    cv=10,
    return_train_score=True,
)

In [None]:
# Mean macro-F1 across folds in percent: validation first, then training.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'].mean() * 100)


### Conclusion:

Results show that low macro-F1 scores were obtained from these models. One way to improve them is to check each feature's correlation with the high_risk column
and remove the irrelevant columns.

In [None]:
# Pairwise correlations between all columns of df, drawn as an annotated heatmap.
correlation_matrix = df.corr()

plt.figure(figsize=(14, 9))
sns.heatmap(correlation_matrix, annot=True, cmap="viridis")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
x_train.columns

In [None]:
# Columns whose correlation with high_risk is below 0.1 are assumed to have low impact and
# are dropped, except those whose correlation with the died column is higher than 0.1.
columns_dropped = ['sex', 'pregnant', 'asthma', 'inmsupr', 'other_disease', 'cardiovascular', 'obesity', 'tobacco']
# DataFrame.drop returns a new frame and leaves the original untouched, so the result
# must be assigned back; otherwise the models below are trained on the unchanged features.
x_train = x_train.drop(columns=columns_dropped)
x_test = x_test.drop(columns=columns_dropped)
# Show the remaining feature names to confirm the columns are gone.
x_train.columns

In [None]:
# 10-fold cross-validation of a shallow random forest, scored with macro-averaged F1.
# The forest is seeded (same value as the train/test split) so that any score difference
# comes from the feature set rather than from random variation between runs.
cv = cross_validate(
    RandomForestClassifier(max_depth=6, random_state=28),
    x_train,
    y_train,
    scoring={'f1_macro': custom_scorer},
    cv=10,
    return_train_score=True,
)

In [None]:
# Mean macro-F1 across folds in percent: validation first, then training.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'].mean() * 100)

In [None]:
# 10-fold cross-validation of logistic regression, scored with macro-averaged F1.
# 'saga' is a stochastic solver, so it is seeded (same value as the train/test split)
# to keep the scores reproducible.
cv = cross_validate(
    LogisticRegression(max_iter=1000, solver='saga', random_state=28),
    x_train,
    y_train,
    scoring={'f1_macro': custom_scorer},
    cv=10,
    return_train_score=True,
)

In [None]:
# Mean macro-F1 across folds in percent: validation first, then training.
for split in ('test', 'train'):
    print(cv[f'{split}_f1_macro'].mean() * 100)

### The scores reached are the best obtained for this dataset using the above methods. Removing some columns with low correlation didn't result in significant improvements.