In [38]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

def read_data_from_csv(path):
    """Load datasets from CSV files.
    Args:
        path (str): Path to the CSV file.
    Returns:
        X (np.ndarray): Features of samples.
        y (np.ndarray): Labels of samples, only provided in the public datasets.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    assert os.path.splitext(path)[
        -1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'

    data = pd.read_csv(path)
    column_list = data.columns.values.tolist()

    if 'Label' in column_list:
        # for the public dataset, label column is provided.
        column_list.remove('Label')
        X = data[column_list].values
        y = data['Label'].astype('int').values
        return X, y
    else:
        # for the private dataset, label column is not provided.
        X = data[column_list].values
        return X
    

X_public, y_public = read_data_from_csv('assignment_5_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (30000, 58)
print('Shape of y_public:', y_public.shape)  # n_sample (30000,)

'''
CODE HERE!
'''






Shape of X_public: (30000, 58)
Shape of y_public: (30000,)


'\nCODE HERE!\n'

|   Features                    |    Types	        |       data types      | 
|-------------------------------|-------------------|-----------------------|
|       1-11, 18-29, 38-58      |    Continuous     |       numberic number |
|       12-17, 30-37            |   Categorical     |       boolean(1/0)    |

In [15]:
df_x = pd.DataFrame(X_public).head()

In [4]:
pd.DataFrame(y_public).head()

Unnamed: 0,0
0,0
1,0
2,1
3,1
4,0


In [12]:
# try construct the model without any preprocess

k_folds = KFold(n_splits = 8)

logisticReg_model = LogisticRegression(solver = 'liblinear', class_weight = 'balanced', max_iter =300, penalty = 'l1')

scores = cross_val_score(logisticReg_model, X_public, y_public, cv = k_folds)

print("Cross Validation Scores: ", scores)
print("\nAverage CV Score: ", scores.mean())

Cross Validation Scores:  [0.65786667 0.65066667 0.66426667 0.64906667 0.64453333 0.64506667
 0.66506667 0.6496    ]

Average CV Score:  0.6532666666666667


In [35]:
normalized_x = df_x
# Continuous features   1-11, 18-29, 38-58
df_1T11 = df_x.iloc[:, 0:11]  
df_18T19 = df_x.iloc[:, 17:29]
df_38T58 = df_x.iloc[:, 37:58]

# Categorical features  12-17, 30-37 
df_12T17 = df_x.iloc[:, 11:17]  
df_30T37 = df_x.iloc[:, 29:37]  

In [36]:
# concatenating along columns
horizontal_Continuous_concat = pd.concat([df_1T11, df_18T19, df_38T58], axis=1)
horizontal_Categorical_concat = pd.concat([df_12T17, df_30T37], axis=1)

print(horizontal_Continuous_concat.shape)
print(horizontal_Categorical_concat.shape)

(5, 44)
(5, 14)


In [39]:
# Normalize the numeric features
scaler = MinMaxScaler()
normalized_Continuous = scaler.fit_transform(pd.DataFrame(horizontal_Continuous_concat))


In [None]:
X_private = read_data_from_csv('assignment_5_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (5000, 58)



# remove and make your own predictions.
preds = np.full(len(X_private), -1,
                dtype=int)
'''
CODE HERE!
e.g.,
preds = np.full(len(X_private), -1, dtype=int)
'''

In [None]:
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_5.csv', index=True, index_label='Id')