In [1]:
INPUT_PREFIX = 'input/tabular-playground-series-may-2021'
RANDOM_STATE = 42
TEST_SPLIT_SIZE = 0.3

TG = 'target'

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import warnings

from scipy.stats import chi2
from collections import Counter
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.ensemble import VotingClassifier

from sklearn.model_selection import train_test_split

%matplotlib inline

warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk(INPUT_PREFIX):
    for filename in filenames:
        print(os.path.join(dirname, filename))

input/tabular-playground-series-may-2021/test.csv
input/tabular-playground-series-may-2021/train.csv


In [3]:
train_data = pd.read_csv(f'{INPUT_PREFIX}/train.csv')
test_data = pd.read_csv(f'{INPUT_PREFIX}/test.csv')

## Training data

In [None]:
train_data.head()

In [None]:
train_data.describe()

In [None]:
classes = train_data[TG].unique()

classes

In [None]:
print(train_data[TG].value_counts())
sns.catplot(x = TG,data=train_data, kind='count',palette='rocket')

In [None]:
plt.figure(figsize=(10, 6))
plt.pie(train_data.groupby('target')['id'].sum(), autopct='%1.1f%%')
plt.show()

## Test data

In [None]:
test_data.head()

## Train action

In [None]:
CLASSES = {
    "Class_1":10,
    "Class_2":20,
    "Class_3":30,
    "Class_4":40
}

data_to_split = train_data.copy()
common_y = data_to_split[TG].map(CLASSES)

data_to_split.drop([TG], axis=1, inplace=True)

X_train, X_test, Y_train, Y_test = train_test_split(data_to_split, common_y, test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE)

X_train_ids = X_train['id'].copy()
X_test_ids = X_test['id'].copy()

X_train.drop(['id'], axis=1, inplace=True)
X_test.drop(['id'], axis=1, inplace=True)

Y_test

In [None]:
NAIVE=GaussianNB()
LR = LogisticRegression(max_iter=1000, multi_class='ovr')
RF = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
Ensemble = VotingClassifier( estimators= [('NB',NAIVE),('LR',LR),('RF',RF)], voting = 'soft')

ovr = OneVsRestClassifier(Ensemble)

ovr.fit(X_train,Y_train)

predicted = ovr.predict_proba(X_test)
print(ovr.classes_)

pd.DataFrame(predicted)

In [None]:
def rev(id):
    if id == 10:
        return 'Class_1'
    elif id == 20:
        return 'Class_2'
    elif id == 30:
        return 'Class_3'
    else:
        return 'Class_4'
    
columns = [rev(v) for v in ovr.classes_]

output = pd.DataFrame(predicted, columns=columns)
output['id'] = X_test_ids.array
output['original'] = [rev(v) for v in Y_test.array]
output['best'] = output[['Class_1', 'Class_2', 'Class_3', 'Class_4']].idxmax(axis=1)

output

### Lets check

In [None]:
predicted_ok = output[output['best'] == output['original']].shape[0]
predicted_wrong = output.shape[0] - predicted_ok
total = output.shape[0]

print(f'Right ({predicted_ok}) / Wrong ({predicted_wrong}) / Size ({total})')
print(f'Score: {predicted_ok/total}')

print(output['best'].value_counts())
sns.catplot(x = 'best',data=output, kind='count',palette='rocket')

## Production run

In [5]:
def get_elements(data, need_y=True):
    ids = data['id']
    data = data.drop(['id'], axis=1)
    
    if need_y:
        y = data['target']
        data = data.drop(['target'], axis=1)
    else:
        y = []
    
    return data, ids, y

X_train, X_train_ids, Y_train = get_elements(train_data)
X_test, X_test_ids, _ = get_elements(test_data, False)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import VotingClassifier

KNN=KNeighborsClassifier(5)
NAIVE=GaussianNB()
SVM=SVC()
DT=DecisionTreeClassifier()
LR = LogisticRegression(max_iter=1000, multi_class='multinomial', n_jobs=2)
RF = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1, n_jobs=2)
Ensemble = VotingClassifier( estimators= [('KNN',KNN),('NB',NAIVE),('SVM',SVM),('DT',DT),('LR',LR),('RF',RF)], voting = 'soft', n_jobs=2)

ovr = OneVsRestClassifier(Ensemble, n_jobs=2)

ovr.fit(X_train, Y_train)

predicted = ovr.predict_proba(X_test)

print('Classes: ', ovr.classes_)

output = pd.DataFrame(predicted, columns=ovr.classes_)
output['id'] = X_test_ids.array

output.to_csv(f'output/my_submission.csv', index=False)

output.head()

AttributeError: predict_proba is not available when  probability=False