In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import SGD, Adagrad, RMSprop, Adam, Adamax
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from tensorflow.keras.metrics import Precision, Recall, F1Score

from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition's training split into a DataFrame.
train_csv_path = '/kaggle/input/playground-series-s4e6/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the training frame.
df.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Remove the row identifier so it is not used as a feature.
# Rebinding (instead of `inplace=True`) together with `errors='ignore'` keeps this
# cell safe to re-run: a second execution no longer raises KeyError for the
# already-dropped column. `axis=1` was redundant next to `columns=`.
df = df.drop(columns=['id'], errors='ignore')

In [None]:
# Separate the single-column target frame from the feature columns.
target_column = 'Target'
y = df[[target_column]]
X = df.drop(columns=[target_column])

In [None]:
# How many rows fall under each value of `Course`.
course_counts = df['Course'].value_counts()
course_counts

In [None]:
# Class balance of the target, expressed as percentages.
target_share = df['Target'].value_counts(normalize=True)
target_share.mul(100)

In [None]:
# Distribution of `Course` values within each target class.
course_by_target = df.groupby(['Target'])['Course']
course_by_target.value_counts()

In [None]:
# Standardize every feature column; the fitted scaler is kept for later reuse.
# NOTE(review): the scaler is fitted on all rows before the train/validation split,
# so validation statistics influence the scaling - confirm this is acceptable.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
# One-hot encode the target (one column per class), the label format a softmax
# network trained with `categorical_crossentropy` expects.
# The result is kept under its own names so that `y` and `encoder` are not
# clobbered: LabelEncoder.fit_transform rejects a 2-D one-hot array, so overwriting
# `y` here broke the label-encoding step on a fresh top-to-bottom run.
onehot_encoder = OneHotEncoder(sparse_output=False)
y_onehot = onehot_encoder.fit_transform(df[['Target']].to_numpy())

In [None]:
# Map the class names to integer ids for the tree-based classifiers and keep the
# fitted encoder so that predictions can be decoded back to class names.
# The encoder is fitted on the raw 1-D `Target` column rather than on `y`: by this
# point `y` may already hold a 2-D one-hot array, which LabelEncoder refuses with
# "y should be a 1d array". Reading from `df` also makes the cell re-runnable.
encoder = LabelEncoder()
y = encoder.fit_transform(df['Target'])

In [None]:
# Hold out 20% of the rows for validation, keeping the class proportions of `y`
# in both parts; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_valid, y_train, y_valid = split_parts

In [None]:
# Gradient-boosted trees with 50 boosting rounds.
# The keyword was misspelled as `n_extimators`; XGBoost does not recognise that
# name, so the intended tree count was never applied.
xgb = XGBClassifier(n_estimators=50)
xgb.fit(X_train, y_train)

In [None]:
# Mean accuracy of the XGBoost model on the training split.
xgb_train_accuracy = xgb.score(X_train, y_train)
print(xgb_train_accuracy)

In [None]:
# Mean accuracy of the XGBoost model on the held-out validation split.
xgb_valid_accuracy = xgb.score(X_valid, y_valid)
print(xgb_valid_accuracy)

In [None]:
# LightGBM classifier with library-default hyperparameters, fitted on the training split.
lgbm = LGBMClassifier()
lgbm.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the LightGBM model on the training split.
lgbm_train_accuracy = lgbm.score(X_train, y_train)
print(lgbm_train_accuracy)

In [None]:
# Mean accuracy of the LightGBM model on the held-out validation split.
lgbm_valid_accuracy = lgbm.score(X_valid, y_valid)
print(lgbm_valid_accuracy)

In [None]:
# Baseline feed-forward classifier: three ReLU hidden layers with light dropout
# and a 3-way softmax head.
model = Sequential()
# An explicit Input layer replaces the legacy `input_dim` keyword on the first Dense.
model.add(Input(shape=(X_train.shape[1],)))
model.add(Dense(256, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.15))
model.add(Dense(128, activation='relu'))
model.add(Dense(3, activation='softmax'))

# `metrics` needs metric *instances*: the bare `Precision` class was passed before,
# which Keras would treat as a plain callable instead of a stateful metric.
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy', Precision()])

model.summary()

In [None]:
# Stop training once validation loss has not improved for 5 epochs and roll the
# model back to the best weights seen.
earlystopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

In [None]:
# Learning-rate schedule: multiply the learning rate by 0.1 after 10 epochs
# without a validation-loss improvement.
# NOTE(review): this callback is not passed to any `fit` call in the visible
# cells - confirm whether it is still needed.
platue = ReduceLROnPlateau(
    monitor='val_loss',
    patience=10,
    factor=0.1,
)

In [None]:
# The network is compiled with `categorical_crossentropy`, which needs one-hot label
# rows matching the softmax width, while the split labels are integer class ids
# (needed by the tree models and by `stratify`). Expand them here; labels that are
# already 2-D one-hot pass through unchanged.
n_outputs = model.output_shape[-1]
y_train_1h = y_train if np.ndim(y_train) == 2 else np.eye(n_outputs)[y_train]
y_valid_1h = y_valid if np.ndim(y_valid) == 2 else np.eye(n_outputs)[y_valid]

# Train for at most 20 epochs, stopping early on stalled validation loss.
model.fit(X_train, y_train_1h, validation_data=(X_valid, y_valid_1h), epochs=20, callbacks=[earlystopping])

In [None]:
# Same kind of classifier expressed with the functional API: a 512-unit ReLU layer,
# dropout, then a funnel of ReLU layers down to a 3-way softmax output.
inp = Input((X_train.shape[1],))

hidden = Dense(512, activation='relu')(inp)
hidden = Dropout(0.1)(hidden)
for layer_width in (256, 128, 64):
    hidden = Dense(layer_width, activation='relu')(hidden)

out = Dense(3, activation='softmax')(hidden)

model = Model([inp], [out])

model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

In [None]:
# Deeper feed-forward classifier: a 512-unit input layer with dropout, five
# progressively narrower ReLU layers, and a 3-way softmax head.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.1))
for layer_width in (256, 256, 128, 64, 32):
    model.add(Dense(layer_width, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(
    optimizer=Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=['accuracy', Precision()],
)

model.summary()

In [None]:
# Callbacks for the main training run:
#   checkpoint - writes the model to 'model.keras'
#   reducelr   - multiplies the learning rate by 0.1 after 10 stalled val_loss epochs
#   logger     - records per-epoch metrics in 'model.csv'
checkpoint = ModelCheckpoint(filepath='model.keras')
reducelr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10)
logger = CSVLogger(filename='model.csv')

In [None]:
# The network is compiled with `categorical_crossentropy` (and a Precision metric),
# both of which need one-hot label rows matching the softmax width, while the split
# labels are integer class ids. Expand them here; labels that are already 2-D
# one-hot pass through unchanged.
n_outputs = model.output_shape[-1]
y_train_1h = y_train if np.ndim(y_train) == 2 else np.eye(n_outputs)[y_train]
y_valid_1h = y_valid if np.ndim(y_valid) == 2 else np.eye(n_outputs)[y_valid]

# 50 epochs in mini-batches of 128; the returned history feeds the learning curves.
hist = model.fit(X_train, y_train_1h, validation_data=(X_valid, y_valid_1h), epochs=50, batch_size=128, callbacks=[checkpoint, reducelr, logger])

In [None]:
# Loss and metrics of the network on the training split.
# The compiled loss needs one-hot targets, so integer class ids are expanded to the
# softmax width first; labels that are already 2-D one-hot pass through unchanged.
y_train_eval = y_train if np.ndim(y_train) == 2 else np.eye(model.output_shape[-1])[y_train]
model.evaluate(X_train, y_train_eval)

In [None]:
# Loss and metrics of the network on the held-out validation split.
# The compiled loss needs one-hot targets, so integer class ids are expanded to the
# softmax width first; labels that are already 2-D one-hot pass through unchanged.
y_valid_eval = y_valid if np.ndim(y_valid) == 2 else np.eye(model.output_shape[-1])[y_valid]
model.evaluate(X_valid, y_valid_eval)

In [None]:
# Pull the per-epoch training / validation curves out of the Keras history.
history = hist.history
tr_loss, val_loss = history['loss'], history['val_loss']
tr_acc, val_acc = history['accuracy'], history['val_accuracy']

# 1-based epoch numbers for the x-axis.
epoches = list(range(1, len(tr_acc) + 1))

In [None]:
# Side-by-side learning curves: loss on the left, accuracy on the right,
# training in green and validation in red.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(14, 6))

ax_loss.plot(epoches, tr_loss, color='green', label='Train Loss')
ax_loss.plot(epoches, val_loss, color='red', label='Validation Loss')
ax_loss.set_title("Train Loss vs Validation Loss")
ax_loss.set_xlabel('Epoches')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

ax_acc.plot(epoches, tr_acc, color='green', label='Train Accuracy')
ax_acc.plot(epoches, val_acc, color='red', label='Validation Accuracy')
ax_acc.set_title("Train Accuracy vs Validation Accuracy")
ax_acc.set_xlabel('Epoches')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

fig.tight_layout()
plt.show()

In [None]:
# Load the competition's test split and display it.
test_csv_path = '/kaggle/input/playground-series-s4e6/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df

In [None]:
# LightGBM was trained on standardized features, so the test features must pass
# through the same fitted scaler before prediction; feeding the raw values (as
# before) puts them on a different scale from what the model saw in training.
# `iloc[:, 1:]` skips the first column (assumed to be `id`, as in the training file).
test_features = scaler.transform(test_df.iloc[:, 1:])
y_pred = lgbm.predict(test_features)

In [None]:
# Translate the predicted class ids back into the original target values using
# the fitted encoder, then display them.
decoded_predictions = encoder.inverse_transform(y_pred)
y_pred = decoded_predictions
y_pred

In [None]:
# Load the sample submission, used as the template for the output file.
sample_csv_path = '/kaggle/input/playground-series-s4e6/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)
sample

In [None]:
# Put the decoded predictions into the template's `Target` column and display the result.
sample = sample.assign(Target=y_pred)
sample

In [None]:
# Write the submission file without the DataFrame index column.
submission_path = 'submission.csv'
sample.to_csv(submission_path, index=False)