In [293]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [294]:
train_data = pd.read_csv('train.csv')
test_data= pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv')

In [295]:
train_data.info

<bound method DataFrame.info of         id surgery    age  hospital_number  rectal_temp  pulse  \
0        0     yes  adult           530001         38.1  132.0   
1        1     yes  adult           533836         37.5   88.0   
2        2     yes  adult           529812         38.3  120.0   
3        3     yes  adult          5262541         37.1   72.0   
4        4      no  adult          5299629         38.0   52.0   
...    ...     ...    ...              ...          ...    ...   
1230  1230     yes  adult           535246         38.5  129.0   
1231  1231     yes  adult           528570         37.5   60.0   
1232  1232     yes  young           529685         37.5   84.0   
1233  1233     yes  adult           534784         38.1   70.0   
1234  1234     yes  adult           528548         38.1   54.0   

      respiratory_rate temp_of_extremities peripheral_pulse mucous_membrane  \
0                 24.0                cool          reduced   dark_cyanotic   
1                

1. Data Cleaning, Preproessing and Wrangling

In [296]:
missing = train_data.isnull().sum() / len(train_data) * 100
missing = missing[missing > 0].sort_values(ascending=False)
print(missing)

abdomen                  17.246964
rectal_exam_feces        15.384615
nasogastric_tube          6.477733
peripheral_pulse          4.858300
abdomo_appearance         3.886640
pain                      3.562753
temp_of_extremities       3.157895
abdominal_distention      1.862348
mucous_membrane           1.700405
nasogastric_reflux        1.700405
peristalsis               1.619433
capillary_refill_time     0.485830
dtype: float64


In [297]:
drop_col = missing[missing > 10].index.tolist()
train_data = train_data.drop(columns = drop_col)
test_data = test_data.drop(columns= drop_col)
train_data = train_data.drop(columns= 'id')
test_data = test_data.drop(columns= 'id')

In [298]:
numeric_features = train_data.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = train_data.select_dtypes(exclude=[np.number]).columns.tolist()

In [251]:
scaler = StandardScaler()
train_data[numeric_features] = scaler.fit_transform(train_data[numeric_features])
test_data[numeric_features] = scaler.fit_transform(test_data[numeric_features])

In [299]:
for feature in numeric_features:
    if feature != 'outcome':
        median_value = train_data[feature].median()
        train_data[feature].fillna(median_value, inplace=True)
        test_data[feature].fillna(median_value, inplace=True)
for feature in categorical_features:
    if feature != 'outcome':
      mode_value = train_data[feature].mode()[0]
      train_data[feature].fillna(mode_value, inplace=True)
      test_data[feature].fillna(mode_value, inplace=True)

In [300]:
X = train_data
test_X = test_data

In [301]:
df_1 = pd.get_dummies(X['mucous_membrane'])
df_1 = df_1.astype(int)
X.drop('mucous_membrane', axis = 1, inplace= True)
X = pd.concat([X, df_1], axis=1)

le = LabelEncoder()   
columns_for_le = ['surgical_lesion', 'surgery', 'age','cp_data']
for col in columns_for_le :
    X[col] = le.fit_transform(X[col])

In [302]:
X['temp_of_extremities'] = X['temp_of_extremities'].map({'cold':0,'cool':1,'normal' : 2, 'warm' : 3})
X['peripheral_pulse'] = X['peripheral_pulse'].map({'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3})
X['capillary_refill_time'] = X['capillary_refill_time'].map({'less_3_sec':0,'3':1,'more_3_sec':2})
X['pain'] = X['pain'].map({'alert': 0, 'depressed': 1, 'slight': 1, 'mild_pain': 2, 'severe_pain': 3, 'extreme_pain': 4})
X['peristalsis'] = X['peristalsis'].map({'hypermotile': 0, 'normal': 1, 'distend_small':1, 'hypomotile': 2, 'absent': 3})
X['abdominal_distention'] = X['abdominal_distention'].map({'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3})
X['nasogastric_reflux'] = X['nasogastric_reflux'].map({'less_1_liter': 0, 'none': 1, 'slight':1,'more_1_liter': 2})
X['nasogastric_tube'] = X['nasogastric_tube'].map({'none': 0, 'slight': 1, 'significant': 2})
X['abdomo_appearance'] = X['abdomo_appearance'].map({'clear': 0, 'cloudy': 1, 'serosanguious': 2})

In [303]:
df_2 = pd.get_dummies(test_X['mucous_membrane'])
df_2 = df_2.astype(int)
test_X.drop('mucous_membrane', axis = 1, inplace= True)
test_X = pd.concat([test_X, df_2], axis=1)

le = LabelEncoder()   
columns_for_le = ['surgical_lesion', 'surgery', 'age','cp_data']
for col in columns_for_le :
    test_X[col] = le.fit_transform(test_X[col])

In [304]:
test_X['temp_of_extremities'] = test_X['temp_of_extremities'].map({'cold':0,'cool':1,'normal' : 2, 'warm' : 3})
test_X['peripheral_pulse'] = test_X['peripheral_pulse'].map({'absent': 0, 'reduced': 1, 'normal': 2, 'increased': 3})
test_X['capillary_refill_time'] = test_X['capillary_refill_time'].map({'less_3_sec':0,'3':1,'more_3_sec':2})
test_X['pain'] = test_X['pain'].map({'alert': 0, 'depressed': 1, 'moderate': 1, 'mild_pain': 2, 'severe_pain': 3, 'extreme_pain': 4})
test_X['peristalsis'] = test_X['peristalsis'].map({'hypermotile': 0, 'normal': 1, 'distend_small':1, 'hypomotile': 2, 'absent': 3})
test_X['abdominal_distention'] = test_X['abdominal_distention'].map({'none': 0, 'slight': 1, 'moderate': 2, 'severe': 3})
test_X['nasogastric_tube'] = test_X['nasogastric_tube'].map({'none': 0, 'slight': 1, 'significant': 2})
test_X['nasogastric_reflux'] = test_X['nasogastric_reflux'].map({'less_1_liter': 0, 'none': 1, 'slight':1,'more_1_liter': 2})
test_X['abdomo_appearance'] = test_X['abdomo_appearance'].map({'clear': 0, 'cloudy': 1, 'serosanguious': 2})


In [305]:
X = X.drop('lesion_3', axis=1)
test_X = test_X.drop('lesion_3', axis=1)

In [306]:
import xgboost as xgb
from sklearn.model_selection import cross_val_score

X['outcome'] = le.fit_transform(X['outcome'])

xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42).fit(X.drop('outcome',axis=1),X['outcome'])

xgb_pred = xgb_model.predict(test_X)


In [307]:
k = 5 
scores = cross_val_score(xgb_model, X.drop('outcome', axis=1), X['outcome'], cv=k, scoring='accuracy')

for fold, score in enumerate(scores, start=1):
    print(f"Fold {fold} Accuracy: {score:.2f}")
    
mean_accuracy = scores.mean()
print(f"Mean Accuracy: {mean_accuracy:.2f}")
xgb_model.fit(X.drop('outcome', axis=1), X['outcome'])
xgb_pred = xgb_model.predict(test_X)

class_mapping = {0: 'died', 1: 'euthanized', 2: 'lived'}
xgb_pred_str = [class_mapping[pred] for pred in xgb_pred]
xgb_pred_str

Fold 1 Accuracy: 0.64
Fold 2 Accuracy: 0.72
Fold 3 Accuracy: 0.76
Fold 4 Accuracy: 0.64
Fold 5 Accuracy: 0.71
Mean Accuracy: 0.69


['lived',
 'died',
 'lived',
 'euthanized',
 'lived',
 'died',
 'died',
 'died',
 'lived',
 'lived',
 'died',
 'lived',
 'lived',
 'euthanized',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'died',
 'died',
 'died',
 'lived',
 'lived',
 'died',
 'lived',
 'died',
 'euthanized',
 'lived',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'lived',
 'euthanized',
 'lived',
 'lived',
 'died',
 'lived',
 'died',
 'euthanized',
 'died',
 'euthanized',
 'lived',
 'lived',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'lived',
 'died',
 'died',
 'died',
 'lived',
 'died',
 'died',
 'euthanized',
 'died',
 'died',
 'euthanized',
 'lived',
 'died',
 'died',
 'lived',
 'euthanized',
 'died',
 'lived',
 'died',
 'died',
 'died',
 'lived',
 'lived',
 'euthanized',
 'died',
 'euthanized',
 'euthanized',
 'died',
 'euthanized',
 'died',
 'euthanized',
 'lived',
 'euthanized',
 'lived',
 'lived',
 'lived',
 'euthanized',
 'euthanized',
 'lived',
 'euthanized',
 'lived',
 'died',
 'euthanized',
 'eu

In [308]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgbm_model = HistGradientBoostingClassifier().fit(X.drop('outcome',axis=1),X['outcome'])
hgbm_pred = hgbm_model.predict(test_X)


In [309]:
k = 5 
scores = cross_val_score(hgbm_model, X.drop('outcome', axis=1), X['outcome'], cv=k, scoring='accuracy')

for fold, score in enumerate(scores, start=1):
    print(f"Fold {fold} Accuracy: {score:.2f}")
    
mean_accuracy = scores.mean()
print(f"Mean Accuracy: {mean_accuracy:.2f}")
hgbm_model.fit(X.drop('outcome', axis=1), X['outcome'])
hgbm_pred = hgbm_model.predict(test_X)

class_mapping = {0: 'died', 1: 'euthanized', 2: 'lived'}
hgbm_pred_str = [class_mapping[pred] for pred in hgbm_pred]
hgbm_pred_str

Fold 1 Accuracy: 0.64
Fold 2 Accuracy: 0.75
Fold 3 Accuracy: 0.74
Fold 4 Accuracy: 0.65
Fold 5 Accuracy: 0.68
Mean Accuracy: 0.69


['lived',
 'died',
 'lived',
 'euthanized',
 'lived',
 'died',
 'lived',
 'died',
 'lived',
 'lived',
 'died',
 'lived',
 'euthanized',
 'euthanized',
 'died',
 'lived',
 'died',
 'died',
 'died',
 'died',
 'died',
 'died',
 'lived',
 'lived',
 'died',
 'lived',
 'died',
 'euthanized',
 'lived',
 'died',
 'lived',
 'died',
 'died',
 'died',
 'lived',
 'died',
 'lived',
 'lived',
 'died',
 'lived',
 'died',
 'euthanized',
 'died',
 'lived',
 'lived',
 'lived',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'lived',
 'died',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'euthanized',
 'died',
 'died',
 'euthanized',
 'lived',
 'died',
 'died',
 'lived',
 'euthanized',
 'died',
 'lived',
 'lived',
 'died',
 'died',
 'lived',
 'lived',
 'euthanized',
 'died',
 'lived',
 'died',
 'died',
 'euthanized',
 'died',
 'euthanized',
 'lived',
 'euthanized',
 'lived',
 'lived',
 'lived',
 'euthanized',
 'euthanized',
 'lived',
 'euthanized',
 'lived',
 'died',
 'euthanized',
 'euthanized',
 'eut

In [291]:
sample_submission['outcome'] = hgbm_pred_str
sample_submission['outcome'] = sample_submission['outcome']
sample_submission.to_csv('submission.csv',index=False)