# 1. Data preprocessing

In [1]:
import pandas as pd
import json

In [2]:
# Load the per-column mapping that is later applied to the categorical columns.
# The context manager closes the file handle as soon as parsing finishes,
# instead of leaving it open until the garbage collector reclaims it.
with open('/media/Z/NDT/Transformer_Tabular/GBDT/Ensemble learning/encoding.json') as encoding_file:
    ENC = json.load(encoding_file)

In [3]:
# Read the train / test / validation splits in one pass; the order of the paths
# matches the order of the names on the left-hand side.
df_train, df_test, df_val = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/media/Z/NDT/Transformer_Tabular/dataset/train.csv',
        '/media/Z/NDT/Transformer_Tabular/dataset/test.csv',
        '/media/Z/NDT/Transformer_Tabular/dataset/val.csv',
    )
)

In [4]:
# Float-typed columns are treated as numeric; every other column is categorical.
float_columns = df_train.select_dtypes('float').columns
CATEGORY_FEATURES = df_train.columns.difference(float_columns).tolist()
# Take the float columns directly rather than via a set difference: iterating a
# set of strings yields a different order on each kernel start, which made the
# resulting list non-reproducible.
NUMBER_FEATURES = float_columns.tolist()
# The target column is not a feature, so it must not be encoded with the inputs.
CATEGORY_FEATURES.remove('income_bracket')

In [5]:
def ordinal_encoding(df: pd.DataFrame, category_features: list, encoding: dict) -> pd.DataFrame:
    """Return a copy of ``df`` with the given categorical columns mapped to codes.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is copied, never modified in place.
    category_features : list
        Names of the columns to encode. Each name must be a column of ``df``
        and a key of ``encoding``.
    encoding : dict
        Maps a column name to the mapping passed to ``Series.map`` for that
        column.

    Returns
    -------
    pd.DataFrame
        A new frame in which every column listed in ``category_features`` has
        been replaced by its mapped values. With a dict mapping, values that
        have no entry come out as NaN (``Series.map`` behavior).

    Raises
    ------
    KeyError
        If a listed column is missing from ``df`` or from ``encoding``.
    """
    df_ = df.copy()
    # Iterate over the argument, not the module-level CATEGORY_FEATURES, so the
    # function honors whatever column list the caller actually passes in.
    for c in category_features:
        df_[c] = df_[c].map(encoding[c])
    return df_
        

In [6]:
# Separate the inputs from the 'income_bracket' target for each split.
X_train = df_train.drop(columns='income_bracket')
X_val = df_val.drop(columns='income_bracket')
X_test = df_test.drop(columns='income_bracket')

y_train = df_train['income_bracket']
y_val = df_val['income_bracket']
y_test = df_test['income_bracket']

In [7]:
# Input column names, in the order they appear in the training frame.
FEATURES = list(X_train.columns)

In [8]:
# Apply the same encoding to all three splits so they share one code space.
X_train_encoding, X_val_encoding, X_test_encoding = (
    ordinal_encoding(split_frame, CATEGORY_FEATURES, ENC)
    for split_frame in (X_train, X_val, X_test)
)

# 2. Modeling

In [9]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [10]:
from sklearn.discriminant_analysis import (
    QuadraticDiscriminantAnalysis,
)

# Classifier instantiated with its default settings.
Model = QuadraticDiscriminantAnalysis()

In [11]:
# Fit on the training split, then predict and report on the test split.
y_pred = Model.fit(X_train_encoding, y_train).predict(X_test_encoding)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89     12435
           1       0.70      0.39      0.50      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.67      0.69     16281
weighted avg       0.80      0.82      0.80     16281



# 3. Save and load model

In [12]:
import pickle
filename = 'QuadraticDiscriminantAnalysis.sav'
# Write through a context manager so the file is flushed and closed before any
# later cell reads it back; a bare open() leaves the handle open until it is
# garbage collected.
with open(filename, 'wb') as model_file:
    pickle.dump(Model, model_file)

In [13]:
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook (or another trusted source) produced.
# The context manager closes the file handle once the model is deserialized.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [14]:
# Re-run the evaluation with the model restored from disk; the report printed
# here can be compared against the one from the freshly fitted model.
y_pred = loaded_model.predict(X_test_encoding)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.95      0.89     12435
           1       0.70      0.39      0.50      3846

    accuracy                           0.82     16281
   macro avg       0.77      0.67      0.69     16281
weighted avg       0.80      0.82      0.80     16281

