In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [13]:
train = pd.read_csv('Data/train.csv') 
test = pd.read_csv('Data/test.csv')
# train['smoking_status'].value_counts()
train.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,Male,50.0,0,0,Yes,Private,Rural,88.1,29.1,formerly smoked,0
1,2,Female,52.0,0,0,Yes,Private,Rural,80.07,38.9,smokes,0
2,3,Female,26.0,0,0,No,Govt_job,Urban,89.11,23.3,smokes,0
3,4,Female,37.0,0,0,Yes,Private,Rural,81.36,36.1,never smoked,0
4,5,Male,59.0,0,0,Yes,Private,Rural,82.59,29.6,never smoked,1
5,6,Male,57.0,0,0,Yes,Private,Rural,92.04,33.5,never smoked,0
6,7,Male,3.0,0,0,No,children,Rural,97.32,16.9,Unknown,0
7,8,Female,32.0,0,0,Yes,Private,Urban,85.04,41.2,smokes,0
8,9,Male,24.0,0,0,No,Private,Rural,98.23,22.3,smokes,0
9,10,Female,60.0,0,0,Yes,Self-employed,Urban,57.57,35.9,smokes,0


In [14]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12243 entries, 0 to 12242
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 12243 non-null  int64  
 1   gender             12243 non-null  object 
 2   age                12243 non-null  float64
 3   hypertension       12243 non-null  int64  
 4   heart_disease      12243 non-null  int64  
 5   ever_married       12243 non-null  object 
 6   work_type          12243 non-null  object 
 7   Residence_type     12243 non-null  object 
 8   avg_glucose_level  12243 non-null  float64
 9   bmi                12243 non-null  float64
 10  smoking_status     12243 non-null  object 
 11  stroke             12243 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 1.1+ MB


In [15]:
le = LabelEncoder()
for col in ['gender', 'ever_married', 'Residence_type']:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
train.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,1,50.0,0,0,1,Private,0,88.1,29.1,formerly smoked,0
1,2,0,52.0,0,0,1,Private,0,80.07,38.9,smokes,0
2,3,0,26.0,0,0,0,Govt_job,1,89.11,23.3,smokes,0
3,4,0,37.0,0,0,1,Private,0,81.36,36.1,never smoked,0
4,5,1,59.0,0,0,1,Private,0,82.59,29.6,never smoked,1


In [16]:
train = pd.get_dummies(train, columns=['work_type', 'smoking_status'], drop_first=True)
test = pd.get_dummies(test, columns=['work_type', 'smoking_status'], drop_first=True)
train.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,1,50.0,0,0,1,0,88.1,29.1,0,False,True,False,False,True,False,False
1,2,0,52.0,0,0,1,0,80.07,38.9,0,False,True,False,False,False,False,True
2,3,0,26.0,0,0,0,1,89.11,23.3,0,False,False,False,False,False,False,True
3,4,0,37.0,0,0,1,0,81.36,36.1,0,False,True,False,False,False,True,False
4,5,1,59.0,0,0,1,0,82.59,29.6,1,False,True,False,False,False,True,False


In [17]:
numeric_cols = ['age', 'bmi', 'avg_glucose_level']
scaler = StandardScaler()
train[numeric_cols] = scaler.fit_transform(train[numeric_cols])
test[numeric_cols] = scaler.transform(test[numeric_cols])
train.head(5)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,1,0.402231,0,0,1,0,-0.028882,0.152901,0,False,True,False,False,True,False,False
1,2,0,0.495179,0,0,1,0,-0.350636,1.613926,0,False,True,False,False,False,False,True
2,3,0,-0.713135,0,0,0,1,0.011588,-0.711788,0,False,False,False,False,False,False,True
3,4,0,-0.201926,0,0,1,0,-0.298947,1.19649,0,False,True,False,False,False,True,False
4,5,1,0.820494,0,0,1,0,-0.249662,0.227443,1,False,True,False,False,False,True,False


In [18]:
X = train.drop('stroke', axis=1) 
y = train['stroke']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
X

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,1,0.402231,0,0,1,0,-0.028882,0.152901,False,True,False,False,True,False,False
1,2,0,0.495179,0,0,1,0,-0.350636,1.613926,False,True,False,False,False,False,True
2,3,0,-0.713135,0,0,0,1,0.011588,-0.711788,False,False,False,False,False,False,True
3,4,0,-0.201926,0,0,1,0,-0.298947,1.196490,False,True,False,False,False,True,False
4,5,1,0.820494,0,0,1,0,-0.249662,0.227443,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12238,12239,0,-0.434294,0,0,1,1,-0.153096,-0.339077,False,True,False,False,False,False,False
12239,12240,0,-0.480767,0,0,1,1,0.127387,0.301985,False,False,False,False,False,False,False
12240,12241,1,1.099336,0,0,1,0,0.104949,0.421252,False,True,False,False,False,False,False
12241,12242,0,1.564072,0,0,1,1,-0.633122,0.301985,False,True,False,False,True,False,False


In [19]:
model = LogisticRegression(random_state=42, max_iter=10000)
model.fit(X_train, y_train)

In [20]:
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("F1-Score:", f1_score(y_val, y_pred))




Accuracy: 0.953858717844018
F1-Score: 0.06611570247933884


In [21]:
predictions = model.predict(test)
result = pd.DataFrame({'id': test['id'], 'stroke': predictions})
result

Unnamed: 0,id,stroke
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
3056,3057,0
3057,3058,0
3058,3059,0
3059,3060,0
