Using GradientBoosting classifier

In [2]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score

def read_data_from_csv(path):
    """Load a dataset from a CSV file.

    Args:
        path (str): Path to the CSV file.

    Returns:
        X (np.ndarray): Features of samples, one row per sample.
        y (np.ndarray): Integer labels of samples. Only returned when the
            file has a 'Label' column (the public dataset); without that
            column the function returns X alone rather than a tuple.

    Raises:
        FileNotFoundError: If `path` does not exist.
        ValueError: If `path` does not have a '.csv' extension.
    """
    # Raise explicitly instead of using `assert`: assertions are removed when
    # Python runs with -O, which would silently disable these checks.
    if not os.path.exists(path):
        raise FileNotFoundError(f'File not found: {path}!')
    extension = os.path.splitext(path)[-1]
    # Compare case-insensitively so that e.g. 'DATA.CSV' is accepted as well.
    if extension.lower() != '.csv':
        raise ValueError(f'Unsupported file type {extension}!')

    data = pd.read_csv(path)
    feature_columns = data.columns.values.tolist()

    if 'Label' not in feature_columns:
        # for the private dataset, label column is not provided.
        return data[feature_columns].values

    # for the public dataset, label column is provided.
    feature_columns.remove('Label')
    X = data[feature_columns].values
    y = data['Label'].astype('int').values
    return X, y
    
def normalize(df: pd.DataFrame):
    """Min-max scale the continuous features and cast the categorical ones to bool.

    Columns are selected by position (1-based feature numbers):
    continuous 1-11, 18-29, 38-58; categorical 12-17, 30-37.

    Args:
        df (pd.DataFrame): Feature table. Columns are addressed by position,
            so their order must follow the layout above.

    Returns:
        pd.DataFrame: The scaled continuous columns followed by the boolean
        categorical columns, carrying the row index and column labels of `df`.

    Note:
        A new MinMaxScaler is fitted on every call, so datasets passed in
        separate calls are each scaled by their own per-column min/max.
    """
    # Continuous features   1-11, 18-29, 38-58
    continuous = pd.concat(
        [df.iloc[:, 0:11], df.iloc[:, 17:29], df.iloc[:, 37:58]], axis=1)

    # Categorical features  12-17, 30-37
    categorical = pd.concat([df.iloc[:, 11:17], df.iloc[:, 29:37]], axis=1)

    # Normalize the numeric features. The scaler returns a bare ndarray, so
    # the original index and column labels are re-attached here. Without them
    # the final axis=1 concat would align a fresh 0..n-1 index against the
    # original one (NaN rows for any non-default index) and produce duplicate
    # column labels when the input columns are integers.
    scaler = MinMaxScaler()
    normalized_continuous = pd.DataFrame(
        scaler.fit_transform(continuous),
        index=continuous.index,
        columns=continuous.columns,
    )

    # change Categorical features data type from float to boolean
    categorical = categorical.astype(bool)

    print(continuous.shape)
    print(categorical.shape)

    # combine to full dataset
    return pd.concat([normalized_continuous, categorical], axis=1)

# ---- Training data (public set, includes the 'Label' column) ----
X_public, y_public = read_data_from_csv('assignment_5_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (30000, 58)
print('Shape of y_public:', y_public.shape)  # n_sample (30000,)

df_train_x, df_train_y = pd.DataFrame(X_public), pd.DataFrame(y_public)

# ---- Model ----
GBoost_clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
)

# Experiment kept for reference: training on the min-max normalized features
# did worse than training on the raw features, so scaling is left disabled.
# normalized_x = normalize(df_train_x)
# scores = cross_val_score(GBoost_clf, normalized_x, y_public, cv = 8)
# print("Cross Validation Scores: ", scores)
# print("\nAverage CV Score: ", scores.mean())      #   Average CV Score:  0.671

# ---- Test data (private set, no labels) ----
X_private = read_data_from_csv('assignment_5_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (5000, 58)
df_test_x = pd.DataFrame(X_private)
# normalized_test_x = normalize(df_test_x)

# ---- Fit on the public set, predict the private set ----
# fit() returns the fitted estimator, so predict() can be chained onto it.
preds = GBoost_clf.fit(df_train_x, y_public).predict(df_test_x)

# ---- Write the submission file: row index as 'Id', predictions as 'Label' ----
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_5.csv', index_label='Id', index=True)


Shape of X_public: (30000, 58)
Shape of y_public: (30000,)
Shape of X_private: (5000, 58)
