In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt

from itertools import permutations

from sklearn.linear_model import RidgeClassifier, LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA

from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, roc_auc_score

import warnings
# Silences every warning category for the rest of the session, which includes
# any deprecation or convergence warnings the libraries below may emit.
# NOTE(review): consider narrowing this while debugging model fits.
warnings.filterwarnings('ignore')

In [38]:
# Read the labelled training data and display its (rows, columns) shape.
train_csv_path = '/kaggle/input/students-drop-out-prediction/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.shape

In [39]:
# df_train.isnull().sum()

In [40]:
# Separate predictors from the target: the 'id' and 'label' columns are removed
# from the feature matrix, and 'label' is the value to predict.
y = df_train.label
X = df_train.drop(columns=['id', 'label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train.shape

In [41]:
# Shared seed for every stochastic component, so the weight combinations
# compared by the grid search differ only in their weights and not in the
# random initialisation of the underlying models.
SEED = 0

# Base learners of the voting ensemble. Their hyper-parameters are fixed here;
# only the voting weights are searched further down.
estimators = [
    ('logReg', LogisticRegression(
        C=1, class_weight=None, fit_intercept=True,
        penalty='l1', solver='saga', random_state=SEED,
    )),
    # SVC is left unseeded: its random_state only matters when probability=True.
    ('svc', SVC(
        C=1, break_ties=False, decision_function_shape='ovr', degree=2,
        gamma='scale', kernel='poly', shrinking=True,
    )),
    # `normalize=False` was dropped: the parameter was removed from
    # RidgeClassifier in scikit-learn 1.2 and False was its default value.
    ('ridge', RidgeClassifier(
        alpha=0.0001, class_weight=None, copy_X=True, fit_intercept=True,
        positive=False, solver='sag', random_state=SEED,
    )),
    ('ranFor', RandomForestClassifier(
        ccp_alpha=0.00043917435221783044, bootstrap=True,
        class_weight='balanced', criterion='entropy', max_features='log2',
        max_samples=0.3, min_samples_leaf=1, min_samples_split=4,
        n_estimators=100, oob_score=False, warm_start=False,
        random_state=SEED,
    )),
    ('nnMLP', MLPClassifier(
        activation='relu', alpha=0.01, hidden_layer_sizes=(30, 20, 10),
        learning_rate='adaptive', solver='adam', warm_start=True,
        random_state=SEED,
    )),
    # 'log_loss' is the current name of the logistic loss; the older spelling
    # 'log' was removed in scikit-learn 1.3 (requires scikit-learn >= 1.1).
    ('sgdC', SGDClassifier(
        alpha=0.001, class_weight=None, eta0=1e-06, fit_intercept=True,
        l1_ratio=0, learning_rate='optimal', loss='log_loss', penalty=None,
        shuffle=True, warm_start=True, random_state=SEED,
    )),
]

# Candidate voting weights: every ordering of 1..len(estimators), plus None
# (no explicit weights). With six estimators that is 6! + 1 = 721 candidates,
# so the search below is expensive.
weights = list(permutations(range(1, len(estimators) + 1)))
weights.append(None)

# NOTE(review): PCA runs on the raw features and MinMaxScaler then rescales the
# components. PCA is scale-sensitive, so scaling before PCA is the more common
# order; the original order is kept here so results stay comparable.
pipe = Pipeline([
    ('pca', PCA(random_state=SEED)),
    ('scale', MinMaxScaler()),
    ('classifier', VotingClassifier(estimators=estimators)),
])

# Search grid: only the voting weights vary; the other entries are single values.
params = {
    # presumably tuned to this dataset's feature count — TODO confirm
    'pca__n_components': [41],
    'classifier__voting': ['hard'],
    'classifier__weights': weights,
}

# Recorded (best params, best score) pair. The weights tuple has four entries
# while six estimators are defined above, so this presumably comes from an
# earlier run with a four-model ensemble — TODO confirm or refresh.
({'classifier__voting': 'hard',
  'classifier__weights': (3, 4, 1, 2),
  'pca__n_components': 41},
 0.7378116058267579)

In [None]:
%%time
clf = GridSearchCV(pipe, params, cv=5, scoring='f1_micro', verbose=10, refit=True)
search = clf.fit(X_train, y_train)
search.best_params_, search.best_score_

In [None]:
# clf.cv_results_

In [None]:
# pipe.fit(X_train, y_train)
# pipe.score(X_train, y_train)

# Predict on the training split itself and plot the resulting confusion matrix.
y_train_pred = clf.predict(X_train)
ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Text report of per-class metrics for the training-split predictions.
train_report = classification_report(y_train, y_train_pred)
print(train_report)

In [None]:
# Number of training rows per label value, to read alongside the metrics above.
y_train.value_counts()

In [None]:
# Micro-averaged F1 of the training-split predictions.
f1_score(y_true=y_train, y_pred=y_train_pred, average='micro')

# Test F1-score

In [None]:
# Predict the held-out 20% split and report its micro-averaged F1.
y_test_pred = clf.predict(X_test)
f1_score(y_true=y_test, y_pred=y_test_pred, average='micro')

# Output

In [None]:
# Score the unlabelled competition test set and write the submission file.
df_test_final = pd.read_csv('/kaggle/input/students-drop-out-prediction/test.csv')
ids_test_final = df_test_final.id  # row id of final test cases
X_test_final = df_test_final.drop(columns=['id'])

# Stored under its own name so the holdout predictions kept in `y_test_pred`
# are not silently overwritten by the competition-set predictions.
y_final_pred = clf.predict(X_test_final)

# One row per test case: its id and the predicted label, in file order.
submission_df = pd.DataFrame({
    'id': ids_test_final.to_numpy(),
    'label': y_final_pred,
})
submission_df.to_csv('output.csv', index=False)