In [39]:
import os
import numpy as np
import pandas as pd
# from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import OneHotEncoder


In [74]:
def read_data_from_csv(path):
    """Read a dataset CSV and split it into features and (optional) labels.

    Args:
        path (str): Location of the CSV file; must exist and end in '.csv'.

    Returns:
        When the file has a 'Label' column: a tuple ``(X, y)`` where ``X`` is
        the array of all remaining columns and ``y`` is the label column cast
        to int.
        Otherwise: only ``X``, the array of every column in the file.

    Raises:
        AssertionError: If the path does not exist or is not a '.csv' file.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    extension = os.path.splitext(path)[-1]
    assert extension == '.csv', f'Unsupported file type {extension}!'

    frame = pd.read_csv(path)

    # Files without a label column yield features only.
    if 'Label' not in frame.columns:
        return frame.values

    # Labelled files: everything except 'Label' is a feature.
    feature_names = [name for name in frame.columns if name != 'Label']
    features = frame[feature_names].values
    labels = frame['Label'].astype('int').values
    return features, labels

def renameCols(df: pd.DataFrame):
    """Give the positional columns 0..22 descriptive names.

    Args:
        df: Frame whose columns are labelled with the integers 0 through 22.

    Returns:
        A new frame with those columns renamed; columns with any other label
        are left as they are, and the input frame is not modified.
    """
    # Position in this list is the integer column label being replaced.
    ordered_names = [
        'Credit', 'Gender', 'Edu', 'Marital', 'Age',
        'RePay_Sep', 'RePay_Aug', 'RePay_Jul',
        'RePay_Jun', 'RePay_May', 'RePay_Apr',
        'Bill_Sep', 'Bill_Aug', 'Bill_Jul',
        'Bill_Jun', 'Bill_May', 'Bill_Apr',
        'Paid_Sep', 'Paid_Aug', 'Paid_Jul',
        'Paid_Jun', 'Paid_May', 'Paid_Apr',
    ]
    position_to_name = dict(enumerate(ordered_names))
    return df.rename(columns=position_to_name)

def handleCateg(df: pd.DataFrame):
    """Decode the coded categorical columns into readable values.

    Works on a copy, so the caller's frame is left untouched and calling the
    function again on the same input gives the same result.

    Args:
        df: Frame containing the columns 'Gender', 'Edu' and 'Marital'
            holding integer codes.

    Returns:
        A new frame where
        - 'Gender' is a boolean: True for code 1, False for anything else;
        - 'Edu' is 'graduateSc' / 'university' / 'highSc' for codes 1 / 2 / 3
          and 'others' for every other value;
        - 'Marital' is 'married' / 'single' for codes 1 / 2 and 'others' for
          every other value.
        All remaining columns are carried over unchanged.
    """
    # Copy first: assigning columns on the argument itself would rewrite the
    # caller's data, and a second pass would then map every label to 'others'.
    df_new = df.copy()

    # Direct comparison keeps a plain bool dtype (unknown codes become False)
    # without relying on fillna downcasting an object column.
    df_new['Gender'] = df_new['Gender'].eq(1)
    df_new['Edu'] = df_new['Edu'].map(
        {1: 'graduateSc', 2: 'university', 3: 'highSc'}
    ).fillna('others')
    df_new['Marital'] = df_new['Marital'].map(
        {1: 'married', 2: 'single'}
    ).fillna('others')

    return df_new

def oneHotEncoding(df: pd.DataFrame):
    """One-hot encode 'Edu' and 'Marital' and return a fixed column layout.

    The category levels are declared up front so that every indicator column
    is produced even when a level does not occur in ``df``. Without this, a
    frame lacking one level fails at the column selection with a KeyError.

    Args:
        df: Frame with the named feature columns, where 'Edu' and 'Marital'
            hold the decoded string labels.

    Returns:
        A new frame with 28 columns in a fixed order: 'Credit', 'Gender', the
        four 'Edu_*' indicators, the three 'Marital_*' indicators, 'Age', and
        the 'RePay_*', 'Bill_*' and 'Paid_*' columns.

    Raises:
        KeyError: If a required non-indicator column is missing from ``df``.
    """
    edu_levels = ['graduateSc', 'highSc', 'university', 'others']
    marital_levels = ['married', 'single', 'others']

    # Categorical columns with explicit levels make get_dummies emit one
    # indicator per declared level, including levels absent from the data.
    df_levelled = df.assign(
        Edu=pd.Categorical(df['Edu'], categories=edu_levels),
        Marital=pd.Categorical(df['Marital'], categories=marital_levels),
    )
    df_ohe = pd.get_dummies(df_levelled)

    df_ohe = df_ohe[[
        'Credit', 'Gender',
        'Edu_graduateSc', 'Edu_highSc', 'Edu_university', 'Edu_others',
        'Marital_married', 'Marital_single', 'Marital_others',
        'Age',
        'RePay_Sep', 'RePay_Aug', 'RePay_Jul', 'RePay_Jun', 'RePay_May', 'RePay_Apr',
        'Bill_Sep', 'Bill_Aug', 'Bill_Jul', 'Bill_Jun', 'Bill_May', 'Bill_Apr',
        'Paid_Sep', 'Paid_Aug', 'Paid_Jul', 'Paid_Jun', 'Paid_May', 'Paid_Apr'
    ]]

    return df_ohe

In [75]:
# Public split: features plus labels.
X_public, y_public = read_data_from_csv('assignment_6_public.csv')
print('Shape of X_public:', X_public.shape)  # (n_sample, m_feature)
print('Shape of y_public:', y_public.shape)  # (n_sample,)

# Private split: features only, labels are withheld.
X_private = read_data_from_csv('assignment_6_private.csv')
print('Shape of X_private:', X_private.shape)  # (k_sample, m_feature)

df_public = pd.DataFrame(X_public)

# Name the columns, decode the categorical codes, then one-hot encode.
preprocess_x = (
    df_public
    .pipe(renameCols)
    .pipe(handleCateg)
    .pipe(oneHotEncoding)
)


Shape of X_public: (20000, 23)
Shape of y_public: (20000,)
Shape of X_private: (5000, 23)


In [76]:
# Display the one-hot encoded public feature frame (pandas truncates the view).
preprocess_x

Unnamed: 0,Credit,Gender,Edu_graduateSc,Edu_highSc,Edu_university,Edu_others,Marital_married,Marital_single,Marital_others,Age,...,Bill_Jul,Bill_Jun,Bill_May,Bill_Apr,Paid_Sep,Paid_Aug,Paid_Jul,Paid_Jun,Paid_May,Paid_Apr
0,20000,True,False,False,True,False,True,False,False,24,...,17473,16900,17690,18255,1000,0,0,1376,1000,649
1,130000,False,False,False,True,False,False,True,False,24,...,41,1062,-201,-201,0,41,1021,0,0,0
2,120000,True,True,False,False,False,False,True,False,27,...,38083,36728,36952,35475,2415,1816,1381,1264,1228,1217
3,300000,True,False,True,False,False,False,True,False,51,...,6246,3872,11875,5290,19854,6279,3883,11883,5305,9998
4,130000,False,False,True,False,False,True,False,False,43,...,129385,96775,98071,97743,6739,5540,4403,3814,3562,4021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,170000,False,False,False,True,False,False,True,False,35,...,118111,92592,96114,99628,5000,5000,10000,5000,5000,5000
19996,70000,False,True,False,False,False,False,True,False,31,...,2400,2400,2400,2400,0,0,0,0,0,0
19997,280000,False,True,False,False,False,False,True,False,52,...,930,0,0,0,0,930,0,0,0,0
19998,20000,True,True,False,False,False,False,True,False,23,...,19753,19160,19661,19816,2300,0,0,788,558,198


In [77]:
# Show the column labels of the encoded frame to check the feature order.
preprocess_x.columns

Index(['Credit', 'Gender', 'Edu_graduateSc', 'Edu_highSc', 'Edu_university',
       'Edu_others', 'Marital_married', 'Marital_single', 'Marital_others',
       'Age', 'RePay_Sep', 'RePay_Aug', 'RePay_Jul', 'RePay_Jun', 'RePay_May',
       'RePay_Apr', 'Bill_Sep', 'Bill_Aug', 'Bill_Jul', 'Bill_Jun', 'Bill_May',
       'Bill_Apr', 'Paid_Sep', 'Paid_Aug', 'Paid_Jul', 'Paid_Jun', 'Paid_May',
       'Paid_Apr'],
      dtype='object')

In [85]:
# Decoded (not one-hot encoded) public features with the label attached as
# 'Target', used for the exploratory counts below.
analysis_train = pd.concat(
    [
        df_public.pipe(renameCols).pipe(handleCateg),
        pd.Series(y_public, name='Target'),
    ],
    axis=1,
)

# Number of samples per education level, ordered by level name.
(
    analysis_train
    .value_counts(['Edu'])
    .sort_index()
    .to_frame()
)

Unnamed: 0_level_0,count
Edu,Unnamed: 1_level_1
graduateSc,7108
highSc,3289
others,311
university,9292


In [86]:
# Number of samples per (education level, target) pair, ordered by index.
(
    analysis_train
    .value_counts(['Edu', 'Target'])
    .sort_index()
    .to_frame()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Edu,Target,Unnamed: 2_level_1
graduateSc,0,5727
graduateSc,1,1381
highSc,0,2456
highSc,1,833
others,0,286
others,1,25
university,0,7104
university,1,2188


In [None]:
# remove and make your own predictions.
preds = np.full(len(X_private), -1,
                dtype=int)
'''
CODE HERE!
e.g.,
preds = np.full(len(X_private), -1, dtype=int)
'''

submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_6.csv', index=True, index_label='Id')