In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load packages
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [3]:
# Read the competition training set (the path is relative to the notebook's working directory)
df = pd.read_csv(
    "../input/tabular-playground-series-nov-2021/train.csv"
)
# Optional sanity check for missing values:
# df.isnull().sum()

# Preview the first rows of the loaded frame
df.head()

In [4]:
# Separate the label from the predictors.
# `output_col` is a list, so `y` is a one-column DataFrame rather than a Series.
output_col = ['target']
y = df[output_col]

# The predictors are every column except the row identifier and the label
X = df.drop(columns=['id', 'target'])

In [5]:
# Scale the X values with StandardScaler (per-column standardization).
# NOTE(review): the scaler is fitted on the full training frame before the
# train/test split performed later in the notebook, so the held-out rows
# influence the scaling statistics — consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)

# fit_transform returns a bare NumPy array. Rebuild the DataFrame with the
# original column labels and index so that plots and fitted models refer to
# the features by name instead of by position (0, 1, 2, ...).
sclX = pd.DataFrame(scaled_values, columns=X.columns, index=X.index)
sclX.head()

In [6]:
# Lasso (L1) regression on the scaled features, used to inspect which
# coefficients survive the L1 penalty.
# Author's note: the resulting plot is flat — presumably alpha=0.4 shrinks
# every coefficient to zero (TODO confirm by trying a smaller alpha).
from sklearn.linear_model import Lasso

df_columns = sclX.columns

# Instantiate the lasso regressor.
# `normalize=True` is deliberately not passed: the argument was deprecated in
# scikit-learn 1.0 and removed in 1.2 (passing it raises TypeError there), and
# `sclX` has already been standardized with StandardScaler.
lasso = Lasso(alpha=0.4)

# Fit the regressor once; the coefficients are read from the fitted estimator
# instead of triggering a second, identical fit.
lasso.fit(sclX, y)

# Compute and print the coefficients
lasso_coef = lasso.coef_
print(lasso_coef)

# Plot one coefficient per feature, with the feature labels on the x axis
plt.plot(range(len(df_columns)), lasso_coef)
plt.xticks(range(len(df_columns)), df_columns.values, rotation=60)
plt.xlabel("Feature")
plt.ylabel("Lasso coefficient")
plt.title("Lasso coefficients (alpha=0.4)")
plt.margins(0.02)
plt.show()

In [7]:
# Hold out 30% of the scaled data for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    sclX,
    y,
    test_size=0.30,
    random_state=44,
)

# Tree hyperparameters: split-quality criterion and maximum depth
CRITERION = 'gini'
MAX_DEPTH = 3

# Build the decision-tree classifier and train it in one step
# (fit() returns the estimator itself, so the chained result is the fitted tree)
dt_clf = DecisionTreeClassifier(
    criterion=CRITERION,
    max_depth=MAX_DEPTH,
    random_state=43,
).fit(X_train, y_train)

# Score the fitted tree on the held-out split
y_pred = dt_clf.predict(X_test)
print(f'Model accuracy score with criterion {CRITERION} index: {accuracy_score(y_test, y_pred):.4f}')

In [8]:
import xgboost as xgb  # "xgb" is the conventional alias for XGBoost

# Baseline gradient-boosted classifier: 12 trees, fixed seed
xgb_clf = xgb.XGBClassifier(
    n_estimators=12,
    random_state=43,
    eval_metric='mlogloss',
)
# To inspect the full parameter set: xgb_clf.get_params()

# Train on the training split
xgb_clf.fit(X_train, y_train)

# Predict the held-out split with the fitted booster
y_pred_xg = xgb_clf.predict(X_test)

# Report accuracy on the held-out split as a percentage
accuracy = accuracy_score(y_test, y_pred_xg)
print("Accuracy: %.2f%%" % (accuracy * 100))

In [9]:
# Feature importance of the baseline XGBoost model
import matplotlib

# Use a tall figure (10 x 16 inches) so every feature bar stays readable;
# the setting goes into the global rcParams and applies to later figures too
matplotlib.rcParams.update({'figure.figsize': (10.0, 16)})

# Draw the importance chart using plot_importance's default importance type
xgb.plot_importance(booster=xgb_clf)

In [10]:
# Same importance chart, but ranked by "gain" instead of the default "weight"
xgb.plot_importance(booster=xgb_clf, importance_type="gain")

In [11]:
# RandomSearch CV

#from sklearn.model_selection import RandomizedSearchCV

# Define a parameter grid
#rs_param_grid = {
     #max_depth: values from 3 to 11 (range(3, 12) excludes 12)
#    'max_depth': list((range(3,12))),
     #alpha: values 0, .001, .01, .1, 1
#    'alpha': [0,0.001, 0.01,0.1,1],
     #subsample: values 0.5, 0.75, 1
#    'subsample': [0.5,0.75,1],
     #learning rate: ten values between 0.01 - 0.5
#    'learning_rate': np.linspace(0.01,0.5, 10),
     #n_estimators: values 10, 25, 40
#    'n_estimators': [10, 25, 40]
#    }


# Instantiate XGBoost Classifier
#xgb_clf_rs = xgb.XGBClassifier(eval_metric='mlogloss', random_state=43)

# Instantiate RandomizedSearchCV()
#xgb_rs = RandomizedSearchCV(estimator=xgb_clf_rs,param_distributions=rs_param_grid, 
#                                cv=3, n_iter=5, verbose=2, random_state=43)

# Train the model on the training set
#xgb_rs.fit(X_train, y_train)

# Print the best parameters and highest accuracy
#print("Best parameters found: ", xgb_rs.best_params_)
#print("Best accuracy found: ", xgb_rs.best_score_)

In [12]:
# Final model with fixed hyperparameters.
# NOTE(review): these values look like the best parameters from the
# commented-out RandomizedSearchCV cell (the learning rate is one of its
# np.linspace(0.01, 0.5, 10) grid points) — confirm against a fresh search run.
final_xgb_clf = xgb.XGBClassifier(
    n_estimators=40,
    max_depth=6,
    learning_rate=0.44555555555555554,
    subsample=1,
    alpha=0.01,
    random_state=43,
    eval_metric='mlogloss',
)

# Train the final model on the training split
final_xgb_clf.fit(X_train, y_train)

# Predict the held-out split with the final model
y_pred_xg_final = final_xgb_clf.predict(X_test)

# Report held-out accuracy as a percentage (reuses the `accuracy` name from the baseline)
accuracy = accuracy_score(y_test, y_pred_xg_final)
print("Accuracy: %.2f%%" % (accuracy * 100))

In [13]:
# Read the competition test set (the path is relative to the notebook's working directory)
test_df = pd.read_csv(
    "../input/tabular-playground-series-nov-2021/test.csv"
)
# Preview the first rows of the loaded frame
test_df.head()

In [14]:
# Build the feature matrix for the test set by removing the row identifier column
final_X = test_df.drop(columns=['id'])
# To preview: final_X.head()

In [15]:
# Predict labels for the competition test set with the final XGBoost model.
# The model was trained on features standardized by `scaler`, so the test
# features must go through the same fitted scaler before prediction. The
# scaled array is wrapped in a DataFrame that reuses the training column
# labels so the feature names match what the model saw during fit.
final_X_scaled = pd.DataFrame(
    scaler.transform(final_X),
    columns=X_train.columns,
    index=final_X.index,
)
final_y_pred = final_xgb_clf.predict(final_X_scaled)

# Assemble the submission: one row per test id with its predicted target
output = pd.DataFrame({'id': test_df.id, 'target': final_y_pred})
output.to_csv('./submission.csv', index=False)
print("Your submission was successfully saved!")

# info() must be called; a bare `output.info` only displays the bound-method repr
output.info()

In [16]:
# review sample submission data to check before submission
# Load data
#df = pd.read_csv("../input/tabular-playground-series-nov-2021/sample_submission.csv") 
#df.head()