### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

### Importing Data
### Cleaning the data

In [2]:
X = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
X = X.drop(["id","screen_time_hours_per_day","ethnicity","education_level","income_level","employment_status","diagnosed_diabetes"],axis=1)
y = X.iloc[:,-1]

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 19 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   age                                 700000 non-null  int64  
 1   alcohol_consumption_per_week        700000 non-null  int64  
 2   physical_activity_minutes_per_week  700000 non-null  int64  
 3   diet_score                          700000 non-null  float64
 4   sleep_hours_per_day                 700000 non-null  float64
 5   bmi                                 700000 non-null  float64
 6   waist_to_hip_ratio                  700000 non-null  float64
 7   systolic_bp                         700000 non-null  int64  
 8   diastolic_bp                        700000 non-null  int64  
 9   heart_rate                          700000 non-null  int64  
 10  cholesterol_total                   700000 non-null  int64  
 11  hdl_cholesterol           

In [4]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 700000 entries, 0 to 699999
Series name: cardiovascular_history
Non-Null Count   Dtype
--------------   -----
700000 non-null  int64
dtypes: int64(1)
memory usage: 5.3 MB


In [5]:
X

Unnamed: 0,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,diastolic_bp,heart_rate,cholesterol_total,hdl_cholesterol,ldl_cholesterol,triglycerides,gender,smoking_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,31,1,45,7.7,6.8,33.4,0.93,112,70,62,199,58,114,102,Female,Current,0,0,0
1,50,2,73,5.7,6.5,23.8,0.83,120,77,71,199,50,121,124,Female,Never,0,0,0
2,32,3,158,8.5,7.4,24.1,0.83,95,89,73,188,59,114,108,Male,Never,0,0,0
3,54,3,77,4.6,7.0,26.6,0.83,121,69,74,182,54,85,123,Female,Current,0,1,0
4,54,1,55,5.7,6.2,28.8,0.90,108,60,85,206,49,131,124,Male,Never,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699995,29,1,59,6.9,5.2,26.1,0.88,133,57,69,163,58,90,126,Female,Former,0,0,0
699996,46,2,72,7.7,7.7,25.5,0.85,106,85,65,188,45,107,119,Female,Former,0,0,1
699997,35,1,50,5.6,6.1,26.9,0.88,127,84,63,168,59,77,166,Female,Never,0,0,0
699998,49,2,70,5.7,6.9,25.2,0.86,116,67,69,198,55,108,133,Female,Never,0,0,0


In [6]:
y

0         0
1         0
2         0
3         0
4         0
         ..
699995    0
699996    1
699997    0
699998    0
699999    0
Name: cardiovascular_history, Length: 700000, dtype: int64

### Feature Engineering

In [7]:
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder , StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.linear_model import LogisticRegression

num_cols = ["age","alcohol_consumption_per_week","physical_activity_minutes_per_week","diet_score",
            'sleep_hours_per_day',
            "bmi","waist_to_hip_ratio","systolic_bp","diastolic_bp",
            "heart_rate","cholesterol_total","hdl_cholesterol","ldl_cholesterol","triglycerides"]
cat_cols = [
    'gender',
    'smoking_status'
]
bin_cols = ['family_history_diabetes','hypertension_history','cardiovascular_history']




preprocess = ColumnTransformer(
    transformers=[
        ("num",StandardScaler(),num_cols),
        ("cat",OneHotEncoder(),cat_cols),
        ("bin","passthrough",bin_cols)
    ]
)

model = Pipeline(
    steps=[
        ("pre",preprocess),
        ("log_reg",LogisticRegression(max_iter=1000,class_weight="balanced"))
    ]
)
model.fit(X,y)

0,1,2
,steps,"[('pre', ...), ('log_reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [8]:
model.predict(test)

array([0, 0, 0, ..., 0, 0, 0], shape=(300000,))

In [9]:
sample = pd.read_csv("sample_submission.csv")
sample.iloc[:,-1] = model.predict(test)
sample

Unnamed: 0,id,diagnosed_diabetes
0,700000,0
1,700001,0
2,700002,0
3,700003,0
4,700004,0
...,...,...
299995,999995,0
299996,999996,0
299997,999997,0
299998,999998,0


In [10]:
sample.to_csv("sample_submission.csv",index=False)