In [1]:
import numpy as np
import pandas as pd


In [3]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [20]:
# Paste in Colab. Creates cold_dataset.csv (10,000 rows) with missing values in fever column.

import random
import numpy as np
import pandas as pd

# Seed both generators so the synthetic dataset is reproducible.
random.seed(42)
np.random.seed(42)

# Categorical vocabularies. Element order matters: the seeded draws pick by position.
cities = ["Delhi", "Mumbai", "Bengaluru", "Kolkata", "Chennai", "Lucknow"]
genders = ["Male", "Female"]
cough_levels = ["low", "mild", "high"]


def generate_row():
    """Generate one synthetic patient record.

    Draws age, gender, city, fever and cough level from the ``random``
    module (in that order), then samples the ``has_cold`` label with a
    probability that rises with fever, cough severity and age extremes.

    Returns:
        tuple: ``(age, gender, fever, cough, city, has_cold)`` where
        ``has_cold`` is 0 or 1.
    """
    # Feature draws -- keep this order so seeded runs stay reproducible.
    age = random.randint(1, 80)
    gender = random.choice(genders)
    city = random.choice(cities)
    fever = round(random.uniform(97.0, 103.5), 1)
    (cough,) = random.choices(cough_levels, weights=[45, 35, 20], k=1)

    # Per-feature contributions to the probability of having a cold.
    fever_bonus = 0.35 if fever >= 100.4 else (0.18 if fever >= 99.5 else 0.0)
    cough_bonus = {"high": 0.35, "mild": 0.18}.get(cough, 0.05)
    age_bonus = 0.0 if 10 < age < 60 else 0.07  # children and seniors get a bump

    # Base rate plus bonuses, accumulated in a fixed order, then clamped to [0, 0.95].
    p = 0.08
    p += fever_bonus
    p += cough_bonus
    p += age_bonus
    p = min(0.95, p)
    p = max(0.0, p)

    # Correlated (not purely random) label.
    has_cold = int(random.random() < p)

    return age, gender, fever, cough, city, has_cold

# Build the synthetic dataset: one generate_row() call per record.
rows = 10000
data = []
for _ in range(rows):
    data.append(generate_row())
df = pd.DataFrame(
    data,
    columns=["age", "gender", "fever", "cough", "city", "has_cold"],
)

# Blank out a random ~8% of the fever readings to simulate missing data.
missing_rate = 0.08
mask = np.random.rand(len(df)) < missing_rate
df.loc[mask, "fever"] = np.nan

# Persist the dataset (without the index column), then report quick sanity checks.
df.to_csv("cold_dataset.csv", index=False)

print("Saved cold_dataset.csv")
print("Total rows:", len(df))
print("Missing fever count:", df["fever"].isna().sum())
df.head()


Saved cold_dataset.csv
Total rows: 10000
Missing fever count: 830


Unnamed: 0,age,gender,fever,cough,city,has_cold
0,15,Male,98.8,low,Lucknow,0
1,70,Male,99.7,low,Chennai,1
2,65,Male,98.3,mild,Chennai,0
3,29,Female,98.8,high,Chennai,0
4,21,Female,98.8,low,Bengaluru,0


In [21]:
# Number of rows per city, most frequent first.
df['city'].value_counts()

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Kolkata,1716
Lucknow,1700
Mumbai,1687
Delhi,1683
Chennai,1614
Bengaluru,1600


In [22]:
# Missing-value count per column.
df.isna().sum()

Unnamed: 0,0
age,0
gender,0
fever,830
cough,0
city,0
has_cold,0


In [23]:
# Features are every column except the label; the label is has_cold (the last column).
x = df.drop(columns=["has_cold"])
y = df["has_cold"]

In [24]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=41,
)

In [25]:
# Preview the training features (the label column has already been split off).
x_train

Unnamed: 0,age,gender,fever,cough,city
5556,26,Female,102.9,low,Lucknow
6922,56,Female,102.2,low,Chennai
2933,39,Female,102.1,mild,Kolkata
1642,65,Male,101.9,mild,Bengaluru
8184,9,Female,101.2,mild,Kolkata
...,...,...,...,...,...
8513,7,Male,98.1,high,Delhi
5200,14,Female,100.6,low,Lucknow
4066,70,Male,102.8,low,Delhi
931,20,Male,103.0,mild,Bengaluru


# **aam zindagi** (the manual way: preprocess each column separately, then stitch the pieces together)

In [26]:
# Impute the fever column: missing values are replaced with the column mean
# (SimpleImputer's default strategy).

si = SimpleImputer()
# Learn the mean from the training split only.
x_train_fever = si.fit_transform(x_train[['fever']])

# Apply the mean learned on the training split to the test split.
# Calling fit_transform here would refit on test data (leakage) and fill the
# two splits with different values.
x_test_fever = si.transform(x_test[['fever']])
x_train_fever.shape

(7500, 1)

In [27]:
# Ordinal-encode cough with an explicit order: low -> 0, mild -> 1, high -> 2.
oe = OrdinalEncoder(categories = [['low', 'mild', 'high']])
x_train_cough = oe.fit_transform(x_train[['cough']])

# Reuse the encoder fitted on the training split; never refit on test data.
x_test_cough = oe.transform(x_test[['cough']])
x_train_cough.shape

(7500, 1)

In [28]:
# Inspect the ordinal-encoded cough column for the test split (low=0, mild=1, high=2).
x_test_cough

array([[0.],
       [2.],
       [1.],
       ...,
       [2.],
       [0.],
       [0.]])

In [30]:
# One-hot encode gender and city. drop='first' removes one dummy column per
# feature; sparse_output=False returns a dense NumPy array.

ohe = OneHotEncoder(drop = 'first', sparse_output = False)
# Learn the category set from the training split only.
x_train_gender_city = ohe.fit_transform(x_train[['gender', 'city']])

# Encode the test split with the categories learned from training.
# Refitting here would derive the columns from the test data, so a category
# missing from the test split would change the column count/order and
# misalign the train and test matrices.
x_test_gender_city = ohe.transform(x_test[['gender', 'city']])

x_train_gender_city.shape

(7500, 6)

In [31]:
# age needs no preprocessing: pull it out as a 2-D (n_rows, 1) array so it can
# be stacked next to the other transformed blocks.
x_train_age = x_train[['age']].to_numpy()

x_test_age = x_test[['age']].to_numpy()
x_train_age.shape

(7500, 1)

In [33]:
# Stack the per-column pieces side by side: age, fever, one-hot gender/city, cough.
x_train_transformed = np.hstack(
    [x_train_age, x_train_fever, x_train_gender_city, x_train_cough]
)

# Same column order for the test split.
x_test_transformed = np.hstack(
    [x_test_age, x_test_fever, x_test_gender_city, x_test_cough]
)

x_train_transformed.shape

(7500, 9)

In [34]:
# Inspect the assembled training matrix (columns: age, fever, one-hot gender/city, cough).
x_train_transformed

array([[ 26. , 102.9,   0. , ...,   1. ,   0. ,   0. ],
       [ 56. , 102.2,   0. , ...,   0. ,   0. ,   0. ],
       [ 39. , 102.1,   0. , ...,   0. ,   0. ,   1. ],
       ...,
       [ 70. , 102.8,   1. , ...,   0. ,   0. ,   0. ],
       [ 20. , 103. ,   1. , ...,   0. ,   0. ,   1. ],
       [ 79. , 102.4,   1. , ...,   0. ,   0. ,   2. ]])

# **mentos zindagi** (the easy way: a single ColumnTransformer handles every column)

In [35]:
from sklearn.compose import ColumnTransformer

In [37]:
# A single ColumnTransformer bundles all per-column preprocessing steps.
transformer = ColumnTransformer(
    transformers=[
        # fever: fill missing values (SimpleImputer default strategy)
        ('tnf1', SimpleImputer(), ['fever']),
        # cough: ordinal codes in the order low < mild < high
        ('tnf2', OrdinalEncoder(categories=[['low', 'mild', 'high']]), ['cough']),
        # gender, city: dense one-hot encoding with the first category dropped
        ('tnf3', OneHotEncoder(drop='first', sparse_output=False), ['gender', 'city']),
    ],
    # columns not listed above (age) are passed through unchanged
    remainder='passthrough',
)

In [38]:
# Fit every step on the training split and return the transformed training matrix.
transformer.fit_transform(x_train)

array([[102.9,   0. ,   0. , ...,   1. ,   0. ,  26. ],
       [102.2,   0. ,   0. , ...,   0. ,   0. ,  56. ],
       [102.1,   1. ,   0. , ...,   0. ,   0. ,  39. ],
       ...,
       [102.8,   0. ,   1. , ...,   0. ,   0. ,  70. ],
       [103. ,   1. ,   1. , ...,   0. ,   0. ,  20. ],
       [102.4,   2. ,   1. , ...,   0. ,   0. ,  79. ]])

In [39]:
# Apply the steps fitted on the training split to the test split (transform only, no refit).
transformer.transform(x_test)

array([[ 98.1,   0. ,   1. , ...,   0. ,   0. ,  51. ],
       [100.4,   2. ,   1. , ...,   0. ,   0. ,  76. ],
       [103.1,   1. ,   1. , ...,   0. ,   0. ,  31. ],
       ...,
       [ 99.5,   2. ,   0. , ...,   0. ,   0. ,  62. ],
       [100.2,   0. ,   1. , ...,   0. ,   0. ,  18. ],
       [100.9,   0. ,   1. , ...,   0. ,   0. ,  10. ]])