In [1]:

import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectFromModel

In [3]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
sub_df = pd.read_csv('sample_submission.csv')
sub_df.head()


Unnamed: 0,id,outcome
0,1235,lived
1,1236,lived
2,1237,lived
3,1238,lived
4,1239,lived


In [4]:
train1_df = pd.read_csv('train.csv')
train1_df.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [5]:
train_df.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [6]:
train1_df['outcome'].unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [7]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1235 entries, 0 to 1234
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     1235 non-null   int64  
 1   surgery                1235 non-null   object 
 2   age                    1235 non-null   object 
 3   hospital_number        1235 non-null   int64  
 4   rectal_temp            1235 non-null   float64
 5   pulse                  1235 non-null   float64
 6   respiratory_rate       1235 non-null   float64
 7   temp_of_extremities    1235 non-null   object 
 8   peripheral_pulse       1235 non-null   object 
 9   mucous_membrane        1235 non-null   object 
 10  capillary_refill_time  1235 non-null   object 
 11  pain                   1235 non-null   object 
 12  peristalsis            1235 non-null   object 
 13  abdominal_distention   1235 non-null   object 
 14  nasogastric_tube       1235 non-null   object 
 15  naso

In [8]:
train_df.isnull().sum()

id                       0
surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
outcome                  0
dtype: int64

In [9]:

test_df.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,1235,no,adult,534053,38.6,40.0,20.0,normal,normal,normal_pink,...,distend_small,42.0,7.5,clear,2.3,no,0,0,0,no
1,1236,yes,adult,528469,38.2,112.0,48.0,cool,reduced,bright_pink,...,distend_small,44.0,6.0,serosanguious,2.6,no,2208,0,0,yes
2,1237,yes,adult,528178,37.7,66.0,12.0,cool,normal,bright_red,...,distend_small,31.5,6.0,cloudy,1.6,yes,2205,0,0,yes
3,1238,no,adult,534784,37.1,88.0,20.0,cool,reduced,pale_cyanotic,...,distend_large,75.0,81.0,,1.0,yes,1400,0,0,no
4,1239,yes,adult,529840,38.3,50.0,12.0,,normal,bright_pink,...,distend_small,37.0,6.8,cloudy,2.6,yes,2208,0,0,yes


In [10]:

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 824 entries, 0 to 823
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     824 non-null    int64  
 1   surgery                824 non-null    object 
 2   age                    824 non-null    object 
 3   hospital_number        824 non-null    int64  
 4   rectal_temp            824 non-null    float64
 5   pulse                  824 non-null    float64
 6   respiratory_rate       824 non-null    float64
 7   temp_of_extremities    824 non-null    object 
 8   peripheral_pulse       824 non-null    object 
 9   mucous_membrane        824 non-null    object 
 10  capillary_refill_time  824 non-null    object 
 11  pain                   824 non-null    object 
 12  peristalsis            824 non-null    object 
 13  abdominal_distention   824 non-null    object 
 14  nasogastric_tube       824 non-null    object 
 15  nasoga

In [11]:
train_df.loc[train_df['outcome']=='lived',['outcome','age']]


Unnamed: 0,outcome,age
2,lived,adult
3,lived,adult
4,lived,adult
5,lived,adult
8,lived,adult
...,...,...
1229,lived,adult
1230,lived,adult
1232,lived,young
1233,lived,adult


In [12]:
def label_encoder(data):
    encoded= data.copy()
    cat_var = encoded.select_dtypes(include=np.object).columns

    encoder = LabelEncoder()
    for col in cat_var:
        encoded[col] = encoder.fit_transform(encoded[col])
    return encoded, encoder

In [13]:
train_encoded, encoder = label_encoder(train_df)
print(train_encoded,encoder)

        id  surgery  age  hospital_number  rectal_temp  pulse  \
0        0        1    0           530001         38.1  132.0   
1        1        1    0           533836         37.5   88.0   
2        2        1    0           529812         38.3  120.0   
3        3        1    0          5262541         37.1   72.0   
4        4        0    0          5299629         38.0   52.0   
...    ...      ...  ...              ...          ...    ...   
1230  1230        1    0           535246         38.5  129.0   
1231  1231        1    0           528570         37.5   60.0   
1232  1232        1    1           529685         37.5   84.0   
1233  1233        1    0           534784         38.1   70.0   
1234  1234        1    0           528548         38.1   54.0   

      respiratory_rate  temp_of_extremities  peripheral_pulse  \
0                 24.0                    2                 4   
1                 12.0                    2                 3   
2                 28.0  

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cat_var = encoded.select_dtypes(include=np.object).columns


In [14]:
def label_encoder2(data2):
    encoded2= data2.copy()
    cat_var2 = encoded2.select_dtypes(include=np.object).columns

    encoder2 = LabelEncoder()
    for col in cat_var2:
        encoded2[col] = encoder2.fit_transform(encoded2[col])
    return encoded2, encoder2

In [15]:
test_encoded, encoder2 = label_encoder2(test_df)
print(test_encoded,encoder2)

       id  surgery  age  hospital_number  rectal_temp  pulse  \
0    1235        0    0           534053         38.6   40.0   
1    1236        1    0           528469         38.2  112.0   
2    1237        1    0           528178         37.7   66.0   
3    1238        0    0           534784         37.1   88.0   
4    1239        1    0           529840         38.3   50.0   
..    ...      ...  ...              ...          ...    ...   
819  2054        0    0           529461         40.3  114.0   
820  2055        1    0           535338         37.2  100.0   
821  2056        1    0           529640         39.2  132.0   
822  2057        0    0          5287179         38.3   54.0   
823  2058        1    0           528461         38.1   66.0   

     respiratory_rate  temp_of_extremities  peripheral_pulse  mucous_membrane  \
0                20.0                    3                 3                4   
1                48.0                    2                 4         

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  cat_var2 = encoded2.select_dtypes(include=np.object).columns


In [16]:
train_df.columns

Index(['id', 'surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3',
       'cp_data', 'outcome'],
      dtype='object')

In [17]:
train_df= train_encoded.fillna(method='bfill')
x_df= train_df.drop(columns='outcome', axis=1)
y= train_df['outcome']
random_forest= RandomForestClassifier().fit(x_df, y)
fs = random_forest.feature_importances_
selected_columns=pd.DataFrame({'Feature':x_df.columns, 'Importance':fs})
selected_columns=selected_columns.sort_values(by= 'Importance',ascending= False)
twenty_df= selected_columns['Feature'][:20].tolist()

In [18]:
test_df=test_encoded.fillna(method='bfill')

In [19]:
random_classifire= RandomForestClassifier(random_state = 42).fit(x_df,y)

In [20]:
test_df.head()

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,1235,0,0,534053,38.6,40.0,20.0,3,3,4,...,2,42.0,7.5,1,2.3,0,0,0,0,0
1,1236,1,0,528469,38.2,112.0,48.0,2,4,1,...,2,44.0,6.0,3,2.6,0,2208,0,0,1
2,1237,1,0,528178,37.7,66.0,12.0,2,3,2,...,2,31.5,6.0,2,1.6,1,2205,0,0,1
3,1238,0,0,534784,37.1,88.0,20.0,2,4,5,...,1,75.0,81.0,0,1.0,1,1400,0,0,0
4,1239,1,0,529840,38.3,50.0,12.0,0,3,1,...,2,37.0,6.8,2,2.6,1,2208,0,0,1


In [None]:
label_mapping = {
    0: 'died',
    1: 'euthanized',
    2: 'lived'
}


In [18]:
y_predictions = random_classifire.predict(test_df)

In [19]:
y_predictions

array([2, 0, 2, 1, 2, 0, 0, 0, 2, 2, 0, 2, 1, 2, 0, 2, 2, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 1, 2, 0, 2, 0, 0, 2, 2, 0, 2, 2, 0, 2, 2, 1, 0, 2,
       0, 2, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       1, 0, 2, 2, 0, 2, 2, 2, 1, 0, 1, 2, 0, 0, 0, 1, 2, 1, 2, 2, 2, 1,
       1, 2, 1, 2, 0, 1, 2, 1, 2, 0, 0, 0, 0, 2, 2, 1, 1, 1, 0, 1, 2, 2,
       0, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 2, 2,
       0, 2, 0, 2, 2, 1, 2, 0, 2, 0, 0, 0, 2, 2, 1, 2, 0, 2, 2, 2, 2, 0,
       0, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2, 2, 0, 2, 0, 0, 2,
       0, 0, 0, 1, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2,
       0, 1, 1, 0, 0, 2, 0, 2, 0, 2, 2, 2, 0, 2, 1, 2, 2, 0, 0, 0, 2, 0,
       2, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 1, 0, 0, 2, 0, 2, 2, 1, 0, 0, 0,
       0, 2, 2, 0, 2, 2, 2, 0, 1, 2, 2, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0,
       2, 0, 2, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 2,

In [21]:
sample_df = pd.read_csv('test.csv')
sample1_df=sample_df['id']

In [None]:
label_mapping = {
    0: 'died',
    1: 'euthanized',
    2: 'lived'
}
predicted_labels = [label_mapping[prediction] for prediction in y_predictions]


In [24]:
outcome=y_predictions
output_df= pd.DataFrame({'id': sample1_df, 'outcome': outcome})
output_df.to_csv('output.csv', index=False)