# Tuberculosis Prediction

Variables: name (string), gender (m/f), age range, country_of_birth, smoking (0-4), alcohol (0-4), family history (y/n), close contact (y/n), health conditions (y/n), occupational risks (y/n), physical activity (0-4), diet (y/n), air pollution (0-4), long-term cough (y/n), chest pain (y/n), appetite loss (y/n), weight loss (y/n), chills (y/n), fatigue (y/n), night sweats (y/n), coughing blood (y/n), fevers (y/n), no_range (nitric oxide level)


### Conditions

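- gender - male
- age range - 15-34 or 70-79
- country of birth - India, Indonesia, China, Philippines, Pakistan, Nigeria, Bangladesh, DR Congo, South Africa, Myanmar
- no_range (nitric oxide level) - >25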

In [93]:
import pandas as pd

In [94]:
# Read the input CSV into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="./dataset/tbdata.csv")

In [95]:
# Create a list of conditions

#symptomatic
# Three per-row boolean masks: age inside one of the two bands (inclusive),
# NitricOxide above 25, and the CoughBlood flag set to 1.
in_age_band = df['Age'].between(15, 34) | df['Age'].between(70, 79)
high_nitric_oxide = df['NitricOxide'].gt(25)
coughs_blood = df['CoughBlood'] == 1

conditions11 = [in_age_band, high_nitric_oxide, coughs_blood]

# One boolean mask per flag column; a mask is True where the column equals 1.
# The list order matches the column order below.
conditions12 = [
    df[column].eq(1)
    for column in (
        'FamilyHistory',
        'CloseContact',
        'HealthConditions',
        'OccupationalRisks',
        'LongTermCough',
        'ChestPain',
        'AppetiteLoss',
        'WeightLoss',
        'Chills',
        'Fatigue',
        'NightSweat',
        'Fevers',
    )
]


#habitual
# Six per-row boolean masks covering demographics, birth country and habits.
conditions2 = [
    # Gender flag equal to 1 (presumably male, per the Conditions notes -- confirm encoding).
    df['Gender'] == 1,
    # Age inside either inclusive band.
    df['Age'].between(15, 34) | df['Age'].between(70, 79),
    df['CoughBlood'] == 1,
    # Birth country is one of the ten listed countries.
    df['CountryofBirth'].isin([
        'India', 'Indonesia', 'China', 'Philippines', 'Pakistan',
        'Nigeria', 'Bangladesh', 'DR Congo', 'South Africa', 'Myanmar',
    ]),
    # Smoking / alcohol scores strictly above 2.
    df['Smoking'].gt(2),
    df['Alcohol'].gt(2),
]

#check for nitric oxide
# Six per-row boolean masks centred on the NitricOxide reading.
conditions3 = [
    # Age inside either inclusive band.
    (df['Age'].between(15, 34) | df['Age'].between(70, 79)),
    (df['NitricOxide'] > 25),
    (df['CoughBlood'].eq(1)),
    # Low physical-activity score.
    (df['PhysicalActivity'] < 2),
    # NOTE(review): this counts AirPollution values *below* 3 as a hit --
    # confirm that lower values mean worse exposure on this scale.
    (df['AirPollution'] < 3),
    # Plain column comparison; selecting a one-column DataFrame and reducing
    # it with .any(axis=1) yields the same mask in a roundabout way.
    (df['OccupationalRisks'].eq(1)),
]
#all conditions
# Nine per-row boolean masks: demographics, birth country, NitricOxide,
# habits, activity and air pollution.
age_column = df['Age']
listed_countries = [
    'India', 'Indonesia', 'China', 'Philippines', 'Pakistan',
    'Nigeria', 'Bangladesh', 'DR Congo', 'South Africa', 'Myanmar',
]

conditions4 = [
    df['Gender'] == 1,
    age_column.between(15, 34) | age_column.between(70, 79),
    df['CoughBlood'] == 1,
    df['CountryofBirth'].isin(listed_countries),
    df['NitricOxide'].gt(25),
    df['Smoking'].gt(2),
    df['Alcohol'].gt(2),
    df['PhysicalActivity'].lt(2),
    df['AirPollution'].lt(3),
]


# Age-band mask followed by one "equals 1" mask per listed flag column,
# in the order given.
conditions5 = [
    df['Age'].between(15, 34) | df['Age'].between(70, 79),
    *(
        df[column].eq(1)
        for column in (
            'FamilyHistory',
            'HealthConditions',
            'LongTermCough',
            'CoughBlood',
            'ChestPain',
        )
    ),
]

# AppetiteLoss flag, NitricOxide above 25, then one "equals 1" mask per
# remaining symptom column, in the order given.
conditions6 = [
    df['AppetiteLoss'] == 1,
    df['NitricOxide'].gt(25),
    *[
        df[column] == 1
        for column in ('WeightLoss', 'Chills', 'Fatigue', 'NightSweat', 'Fevers')
    ],
]

### PRED1

### PRED2

In [109]:
# Create a new column tb_pred: 1 where at least one scoring rule fires, else 0.
# sum(<list of boolean Series>) adds the masks element-wise, giving the number
# of conditions from that list that hold for each row.
many_flags = sum(conditions12) > 5

rule_symptomatic = (sum(conditions11) > 2) & many_flags
# NOTE(review): every mask in conditions2 also appears in conditions4, so
# whenever this rule fires rule_all fires too; kept to mirror the original
# expression.
rule_habitual = (sum(conditions2) > 4) & many_flags
rule_nitric_oxide = sum(conditions3) > 4
rule_all = (sum(conditions4) > 4) & many_flags
rule_5_and_6 = (sum(conditions5) > 4) & (sum(conditions6) > 3)

df['tb_pred'] = (
    rule_symptomatic
    | rule_habitual
    | rule_nitric_oxide
    | rule_all
    | rule_5_and_6
).astype(int)


# Display the updated DataFrame
print(df)

                 Name  Gender  Age                    CountryofBirth  Smoking  \
0     Cassandra Smith       0   41                      Cook Islands        2   
1       Kristin Burns       0   86                       Afghanistan        3   
2     Edward Morrison       0   46                           Bermuda        4   
3        Julie Murphy       0   33                           Morocco        4   
4       David Simmons       0   41                           Namibia        1   
...               ...     ...  ...                               ...      ...   
1995     Amber Gibson       0   77                            Bhutan        4   
1996  Danielle Murray       0   91                           Vietnam        0   
1997  Preston Vaughan       1   27                             Haiti        4   
1998    Melissa Smith       1   93                              Niue        4   
1999     Jaime Dodson       0   74  Saint Vincent and the Grenadines        3   

      Alcohol  FamilyHistor

### PRED3

### JUSTIFY PERCENTAGES OF POSITIVES

In [110]:
# Share of rows predicted 1 and predicted 0, as percentages of all rows.
predictions = df['tb_pred']
row_count = len(predictions)
percentage_true = predictions.eq(1).sum() / row_count * 100
percentage_false = predictions.eq(0).sum() / row_count * 100

# Report both shares to two decimal places.
print(f"Percentage of True Predictions: {percentage_true:.2f}%")
print(f"Percentage of False Predictions: {percentage_false:.2f}%")

Percentage of True Predictions: 24.05%
Percentage of False Predictions: 75.95%


In [111]:
# Write the DataFrame (including tb_pred) to CSV without the index column.
df.to_csv(path_or_buf='./dataset/tbpred.csv', index=False)