In [None]:
import numpy as np
import pandas as pd
import pickle

In [None]:

# Load the pickled dataset from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load files from
# a trusted source.
with open('data/ass2.pickle', 'rb') as handle:
    data = pd.read_pickle(handle)

# Each split is stored under its own key as a (features, labels) pair.
(X_train, y_train), (X_dev, y_dev) = data['train'], data['dev']

In [None]:

# Summarise the training labels: total row count, the distinct class values,
# and how many rows carry label 0 and label 1.
n_rows = len(y_train)
class_values = np.unique(y_train)
n_class_0 = np.count_nonzero(y_train == 0)
n_class_1 = np.count_nonzero(y_train == 1)

print(f'Number of Rows: {n_rows}')
print(f'The Classes are: {class_values}')
print(f'Class 0: {n_class_0}')
print(f'Class 1: {n_class_1}')

From the output above we can see that there are only two classes, so this is a binary classification problem and we will focus on algorithms that are well suited to binary classification.
We can also see that the dataset is unbalanced, so we will balance it before training.

In [None]:
# from lazypredict.Supervised import LazyClassifier

# clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
# models,predictions = clf.fit(X_train, X_dev, y_train, y_dev)

# print(models)

After running LazyPredict we can focus on the few best algorithms we got from it.

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Build two rebalanced copies of the training set so that they can be compared
# against the original data later on.

# Random under-sampling (configured to sample with replacement).
rus = RandomUnderSampler(random_state=42, replacement=True)
x_rus, y_rus = rus.fit_resample(X_train, y_train)

# Random over-sampling.
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, y_train)

# Report the size of each resampled training set.
print(f'Under sampled: {len(y_rus)}')
print(f'Over sampled: {len(y_ros)}')

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif, f_regression

# Unsupervised reduction: project the training features onto the first five
# principal components.
pca = PCA(n_components=5).fit(X_train)
X_train_pca = pca.transform(X_train)

# Supervised reduction: keep the five features with the highest ANOVA F-score.
# The target is a class label, so the classification scorer `f_classif` is used
# rather than `f_regression`, which treats the labels as a continuous response.
anova_filter = SelectKBest(score_func=f_classif, k=5).fit(X_train, y_train)
X_train_skb = anova_filter.transform(X_train)

In [None]:
# # import libraries
# import seaborn as sns

# import warnings
# warnings.filterwarnings('ignore')

# # create pandas df
# X = pd.DataFrame(X_train_pca)
# y = np.array(y_train)
# X['target'] = pd.DataFrame(y.reshape(-1, 1), columns=["target"])
# X.head(5)

# # check the null values
# X.isnull().sum()

# # pairplot for distribution
# sns.pairplot(X ,hue="target", palette='Set1')

As established earlier from the class counts, we have a binary classification problem, so we will focus on algorithms best suited for binary classification.

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


def cross_validation(clf, X, y, k, random_state=42):
    """Estimate the accuracy of ``clf`` with shuffled k-fold cross-validation.

    For every fold the features are standardized with a scaler fitted on the
    training part of that fold only, ``clf`` is refitted on the scaled training
    part, and accuracy is measured on the scaled held-out part.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted in
        place on every fold.
    X, y : array-like
        Features and labels. Both are indexed with integer index arrays
        (``X[train_index]``), so pass NumPy arrays rather than DataFrames.
    k : int
        Number of folds.
    random_state : int or None, default 42
        Seed for the fold shuffling, so repeated runs produce the same splits
        and therefore comparable scores. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    float
        Mean accuracy over the ``k`` folds.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    total_accuracy = 0
    for train_index, test_index in kf.split(X):
        # A fresh scaler per fold keeps held-out statistics out of the fit.
        sc = StandardScaler()
        X_fold_train = sc.fit_transform(X[train_index])
        X_fold_test = sc.transform(X[test_index])

        clf.fit(X_fold_train, y[train_index])
        total_accuracy += accuracy_score(y[test_index], clf.predict(X_fold_test))
    return total_accuracy / k

def fit_predict(classifier, X_train, y_train):
    """Fit ``classifier`` on standardized training data and print its scores.

    The scaler is fitted on ``X_train`` and then applied to the notebook-level
    development set (``X_dev`` / ``y_dev``, read from the enclosing scope).
    The printed line shows the train score, the dev score (labelled
    "test score") and their difference, each rounded to three decimals.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``.
    X_train, y_train : array-like
        Training features and labels.

    Returns
    -------
    estimator
        The same ``classifier`` object, now fitted.
    """
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(X_train)
    # Reuse the training statistics for the dev set.
    dev_scaled = scaler.transform(X_dev)

    classifier.fit(train_scaled, y_train)

    train_score = classifier.score(train_scaled, y_train)
    dev_score = classifier.score(dev_scaled, y_dev)
    # A positive gap means the model scores better on the data it was fitted on.
    over_fitting = train_score - dev_score

    print(f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}')
    return classifier

def fit_predict_poly(classifier):
    """Fit ``classifier`` on degree-2 polynomial features and summarise its scores.

    Uses the notebook-level ``X_train`` / ``y_train`` and ``X_dev`` / ``y_dev``
    (read from the enclosing scope). The polynomial expansion is fitted on the
    training features and the same fitted transformer is applied to the dev
    features, so both sets are guaranteed to share one feature layout.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.

    Returns
    -------
    str
        Summary with the train score, the dev score (labelled "test score")
        and their difference, each rounded to three decimals.
    """
    poly = PolynomialFeatures(2)
    transformed_train = poly.fit_transform(X_train)
    transformed_dev = poly.transform(X_dev)
    classifier.fit(transformed_train, y_train)

    train_score = classifier.score(transformed_train, y_train)
    dev_score = classifier.score(transformed_dev, y_dev)
    over_fitting = train_score - dev_score

    # Round the gap to three decimals like the other two values; rounding it to
    # an integer would hide any gap smaller than 0.5.
    return f'train score: {round(train_score, 3)} test score {round(dev_score, 3)} overfit {round(over_fitting, 3)}'

In [51]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

def check_models(X_train, y_train, k):
    """Cross-validate a fixed set of classifiers and print each mean accuracy.

    Parameters
    ----------
    X_train, y_train : pandas objects
        Training features and labels. Both must provide ``to_numpy()``; they
        are converted once and the arrays are shared by every model.
    k : int
        Number of cross-validation folds passed to ``cross_validation``.

    Returns
    -------
    None
        One ``"<model name>: <mean accuracy>"`` line is printed per model.
    """
    X = X_train.to_numpy()
    y = y_train.to_numpy()

    evc = VotingClassifier(
        estimators=[
            ('svm', SVC(random_state=0)),
            ('rf', RandomForestClassifier(random_state=42)),
            ('hgb', HistGradientBoostingClassifier()),
        ],
        voting='hard',
    )

    models = [
        ('Naive bayes', GaussianNB()),
        ('Random Forest', RandomForestClassifier(random_state=42)),
        ('SVM', SVC(random_state=42)),
        ('AdaBoost', AdaBoostClassifier(
            DecisionTreeClassifier(min_samples_split=10, max_depth=4),
            n_estimators=10, learning_rate=0.6, random_state=42)),
        ('Histogram Gradient Boosting', HistGradientBoostingClassifier()),
        ('Voting Classifier', evc),
        # The base model is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name
        # was removed in 1.4, while the positional form works on all versions.
        ('Bagging Classifier', BaggingClassifier(SVC(), random_state=42)),
        ('XGBoost', XGBClassifier(random_state=42)),
        ('Light GBM', LGBMClassifier(random_state=42)),
    ]

    for model_name, model in models:
        print(f'{model_name}: {cross_validation(model, X, y, k)}')


# Run the same 3-fold model comparison on the original training data and on
# both resampled variants.
# NOTE(review): the resampled sets are split into folds after resampling, so
# rows duplicated by over-sampling may land in both the training and held-out
# parts of a fold; the oversampled scores are presumably optimistic. Confirm
# before comparing them with the other two.
datasets = (
    ('Original Data:', X_train, y_train),
    ('Undersampled Data:', x_rus, y_rus),
    ('Oversampled Data:', x_ros, y_ros),
)
for heading, features, labels in datasets:
    print(heading)
    check_models(features, labels, 3)


Original:
XGBoost: 0.858968039037406
Light GBM: 0.865172008020855
Undersampled:
XGBoost: 0.8442609310382152
Light GBM: 0.8440068212875942
Oversampled:
XGBoost: 0.8879449838187702
Light GBM: 0.865898058252427


In [52]:
# 5-fold cross-validated accuracy of a random forest on the oversampled
# training set; the value is shown as the cell output.
cross_validation(
    clf=RandomForestClassifier(random_state=42),
    X=x_ros.to_numpy(),
    y=y_ros.to_numpy(),
    k=5,
)