In [None]:
import pandas as pd
# Load the competition training table and preview its first rows.
train = pd.read_csv(
    '/kaggle/input/icr-identify-age-related-conditions/train.csv'
)
train.head()

In [None]:
# Load the competition test table and preview its first rows.
test = pd.read_csv(
    '/kaggle/input/icr-identify-age-related-conditions/test.csv'
)
test.head()

In [None]:
# Remove the 'Id' column from the training frame.
# errors='ignore' keeps this cell re-runnable: once 'Id' has been dropped, a
# second execution is a no-op instead of raising KeyError.
train = train.drop(columns=['Id'], errors='ignore')
train.head()

In [None]:
# Remove the 'Id' column from the test frame.
# errors='ignore' keeps this cell re-runnable: once 'Id' has been dropped, a
# second execution is a no-op instead of raising KeyError.
test = test.drop(columns=['Id'], errors='ignore')
test.head()

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Every NumPy signed-integer, unsigned-integer or float column is inspected
    and, when its observed minimum and maximum fit in a narrower dtype of the
    same kind, converted to that dtype. Columns are never widened, and columns
    of any other dtype (object, bool, datetime, pandas extension dtypes such
    as category or nullable integers) are left untouched. The memory footprint
    before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to be narrowed to ``float16``. Only the value
        range is checked, so values are rounded to the precision of the target
        dtype; pass ``False`` to stop narrowing at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    ``np`` (NumPy) is looked up when the function is called, so it must have
    been imported by then.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes, narrowest first, for each NumPy dtype kind.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    if use_float16:
        float_types = (np.float16, np.float32, np.float64)
    else:
        float_types = (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # pandas extension dtypes are not np.dtype instances; leave them alone.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits_of = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits_of = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits_of = float_types, np.finfo
        else:
            # bool, object, datetime, ...: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no usable range; keep their dtype.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        for candidate in candidates:
            # Stop as soon as the candidate is no narrower than the current
            # dtype, so a column is never widened.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Avoid a 0/0 (nan) report when the frame occupies no memory to begin with.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
import numpy as np
# Downcast the training frame's column dtypes, then print its schema summary.
train = reduce_mem_usage(df=train)
train.info()

In [None]:
# Downcast the test frame's column dtypes, then print its schema summary.
test = reduce_mem_usage(df=test)
test.info()

In [None]:
# Feature names: the columns of `train` with the 'Class' entry removed.
col = train.columns.tolist()
del col[col.index('Class')]
col[:5]

In [None]:
# Features are the columns listed in `col`; the target is kept as a
# one-column DataFrame holding 'Class'.
X = train[col]
y = train[['Class']]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
# Hyperparameter record. No other cell shown here reads `params`; the
# classifier cell passes its values as literals.
params = dict(
    n_estimators=97,
    num_leaves=33,
    min_child_samples=8,
    learning_rate=0.3848454051107216,
    log_max_bin=9,
    colsample_bytree=1.0,
    reg_alpha=0.002010966076190542,
    reg_lambda=0.3138652985500363,
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance: fitted on X_train['EJ'] in the next cell, then
# reused to transform X_test['EJ'] and test['EJ'].
label_encoder = LabelEncoder()

In [None]:
# Fit the encoder on the training split's 'EJ' values and store the encoded
# column back on X_train.
ej_train_codes = label_encoder.fit_transform(X_train['EJ'])
X_train['EJ'] = ej_train_codes
X_train.head()

In [None]:
# Encode the hold-out split's 'EJ' values with the already-fitted encoder.
ej_holdout_codes = label_encoder.transform(X_test['EJ'])
X_test['EJ'] = ej_holdout_codes
X_test.head()

In [None]:
# Encode the competition test frame's 'EJ' values with the already-fitted encoder.
ej_test_codes = label_encoder.transform(test['EJ'])
test['EJ'] = ej_test_codes
test.head()

In [None]:
import lightgbm as lgbm
# LightGBM classifier with fixed hyperparameters.
model = lgbm.LGBMClassifier(
    learning_rate=0.3848454051107216,
    max_bin=511,
    min_child_samples=8,
    n_estimators=97,
    num_leaves=33,
    reg_alpha=0.002010966076190542,
    reg_lambda=0.3138652985500363,
    verbose=-1,
)
# y_train comes from train[['Class']], i.e. a one-column DataFrame. Flatten it
# to 1-D so the label encoding inside fit() does not emit a
# DataConversionWarning about a column-vector target.
model.fit(X_train, y_train.to_numpy().ravel())

In [None]:
from sklearn.metrics import classification_report
# In-sample report: predictions on the rows the model was fitted on.
train_predictions = model.predict(X_train)
print(classification_report(y_train, train_predictions))

In [None]:
# Hold-out report: predictions on the split the model did not see in fit().
holdout_predictions = model.predict(X_test)
print(classification_report(y_test, holdout_predictions))

In [None]:
# Load the submission template and preview its first rows.
sample_submission = pd.read_csv(
    '/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv'
)
sample_submission.head()

In [None]:
# Display the predicted probabilities for the test rows; column index 1 of
# this array is what the next cell writes to 'class_1'.
model.predict_proba(test)

In [None]:
# Score the test rows once; both output columns are derived from the same array.
# NOTE(review): values are assigned positionally, so this assumes the rows of
# `test` are in the same order as the rows of `sample_submission` - confirm.
test_proba = model.predict_proba(test)
sample_submission['class_1'] = test_proba[:, 1]
sample_submission['class_0'] = 1 - test_proba[:, 1]
sample_submission.to_csv("./submission.csv", index=False)
sample_submission.head()