In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
def load_dataset():
    # load the dataset as a numpy array
    X = pd.read_csv('clean/X_train.csv').values
    y = pd.read_csv('clean/y_train.csv').values
    y = LabelEncoder().fit_transform(y)
    return X, y

X, y = load_dataset()
pd.DataFrame(y).value_counts()

7     6772
0     6617
4     5276
3     3594
6     2917
10    1582
1     1112
8     1050
2      903
9      861
5      661
11     207
dtype: int64

In [4]:
from collections import Counter
counter = Counter(y)
for k, v in counter.items():
    per = v / len(y) * 100
    print('Class=%d, n=%d (%.3f%%)' % (k, v, per))

Class=0, n=6617 (20.972%)
Class=4, n=5276 (16.722%)
Class=8, n=1050 (3.328%)
Class=7, n=6772 (21.463%)
Class=11, n=207 (0.656%)
Class=10, n=1582 (5.014%)
Class=3, n=3594 (11.391%)
Class=6, n=2917 (9.245%)
Class=1, n=1112 (3.524%)
Class=2, n=903 (2.862%)
Class=5, n=661 (2.095%)
Class=9, n=861 (2.729%)


In [4]:
def evaluate_model(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return np.mean(scores)

In [5]:
model = RandomForestClassifier(n_estimators=100, class_weight='balanced')

In [6]:
scores = evaluate_model(X, y, model)
# summarize performance


In [7]:
print('Mean Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean Accuracy: 0.436 (0.000)


In [8]:
model.fit(X, y)

RandomForestClassifier(class_weight='balanced')

In [20]:
test = pd.read_csv('clean/X_test.csv')
test

Unnamed: 0,Role,Invested in Real Estate,Extra Hours,Invested in Stock Market,Favourite Sport,Gender,Languages Known,Sport Knowledge (in XP),Average no. of leaves/month,Trades Done,Married(1/0),Level of Education,Pay,ID Proof,Most Productive Hour,Tax Category,Income Category,Employer,Tax Paid,Id
0,0,1,0,1,3,1,3,51.0,4,33,0,2,504.73,1,8,136,2,0,47302,5
1,1,1,1,1,5,1,3,93.0,3,26,1,2,720.73,1,8,214,2,0,45732,6
2,0,1,0,1,3,1,1,57.0,4,27,1,0,663.73,1,8,134,2,0,22727,11
3,0,1,0,1,4,0,2,79.0,5,51,0,0,279.73,1,8,514,2,0,40778,12
4,1,1,0,1,2,0,4,41.0,0,56,0,0,344.73,1,8,68,2,0,35447,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13654,1,1,0,0,6,1,4,36.0,5,73,-1,2,4083.73,0,9,259,3,0,16241,45191
13655,1,1,0,0,0,1,3,1.0,0,27,1,2,1038.73,0,9,235,2,0,25361,45192
13656,0,0,1,0,3,1,1,48.0,1,26,1,2,3577.73,0,9,446,6,0,15414,45193
13657,0,0,0,0,2,0,1,15.0,0,51,0,2,856.73,0,9,223,6,0,31697,45201


In [10]:
preds = pd.DataFrame(model.predict(test))
print(preds.value_counts())
pd.DataFrame({
    'Id': test['Id'],
    'Occupation': preds[0]
}).to_csv('output/last_preds.csv', index = False)

7     5167
0     4074
4     1824
3     1066
10     870
6      396
5      205
9       41
2       11
1        4
8        1
dtype: int64


In [11]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth = 20, class_weight='balanced')
]
X, y = load_dataset()
for model in models:
    score = evaluate_model(X, y, model)
    print(score)

  return f(*args, **kwargs)


0.4338657035908508


In [5]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from catboost import CatBoostClassifier

In [6]:
X_train = pd.read_csv('clean/X_train.csv')
y_train = pd.read_csv('clean/y_train.csv')
X_test = pd.read_csv('clean/X_test.csv')
y_train.value_counts()

Occupation
7             6772
0             6617
4             5276
3             3594
6             2917
10            1582
1             1112
8             1050
2              903
9              861
5              661
11             207
dtype: int64

In [7]:
import datetime
now = datetime.datetime.now()
print (now.strftime("%H:%M:%S"))

10:49:38


In [18]:
def make_sub(name, preds):
    print('Making submission ...')
    now = datetime.datetime.now()
    name = 'LAST_'+ name + '_' + now.strftime("%H:%M:%S")
    print(f'File name = {name}')
    file_name = f'output/{name}.csv'
    pd.DataFrame({
        'Id': test['Id'],
        'Occupation': preds
    }).to_csv(file_name, index = False)

In [9]:
class_weights = compute_class_weight('balanced', np.sort(y_train['Occupation'].unique()), y_train['Occupation'].values)
print(class_weights)

[ 0.39736033  2.36450839  2.91177556  0.73158969  0.49835734  3.9778114
  0.9013827   0.38826541  2.50412698  3.0538134   1.66203118 12.7020934 ]




In [10]:
# Initializing CatBoost classifier, fitting and then predicting
cat_model = CatBoostClassifier(iterations=100, learning_rate=0.3, depth=5, loss_function='MultiClass', classes_count=12, logging_level='Silent', l2_leaf_reg=2, thread_count=4, class_weights=class_weights)
cat_model.fit(X_train, y_train)
cat_preds = cat_model.predict(X_test)
cat_preds = cat_preds.reshape((-1,)).astype(int)
print(set(cat_preds))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [11]:
# Initializing Random Forest classifier, fitting and then predicting
rfc_clf = RandomForestClassifier(n_estimators=70, max_depth=18, max_features=0.8, n_jobs=4, class_weight='balanced')
rfc_clf.fit(X_train, y_train)
rfc_preds = rfc_clf.predict(X_test).astype(int)
print(set(rfc_preds))

  rfc_clf.fit(X_train, y_train)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [12]:
# Initializing Adam Boost classifier, fitting and then predicting
ada_clf = AdaBoostClassifier(n_estimators=70, learning_rate=0.3)
ada_clf.fit(X_train, y_train, sample_weight=[class_weights[int(y_train.values[i])] for i in range(y_train.shape[0])])
ada_preds = ada_clf.predict(X_test).astype(int)
print(set(ada_preds))

  return f(*args, **kwargs)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [13]:
[class_weights[int(y_train.values[i])] for i in range(y_train.shape[0])][:5]

[0.3973603344919651,
 0.4983573414202679,
 2.504126984126984,
 0.38826540657609765,
 12.702093397745571]

In [14]:
# Initializing Bernoulli naive-bayes classifier, fitting and then predicting
bernoulli_clf = BernoulliNB()
bernoulli_clf.fit(X_train, y_train, sample_weight=[class_weights[int(y_train.values[i])] for i in range(y_train.shape[0])])
bernoulli_preds = bernoulli_clf.predict(X_test).astype(int)
print(set(bernoulli_preds))

{0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11}


  return f(*args, **kwargs)


In [15]:
# Initializing Gaussian naive-bayes classifier, fitting and then predicting
gaussian_clf = GaussianNB()
gaussian_clf.fit(X_train, y_train, sample_weight=[class_weights[int(y_train.values[i])] for i in range(y_train.shape[0])])
gaussian_preds = gaussian_clf.predict(X_test).astype(int)
print(set(gaussian_preds))

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


  return f(*args, **kwargs)


In [16]:
# Initializing KNN classifier, fitting and then predicting
knn_clf = KNeighborsClassifier(n_neighbors=8, weights='uniform', n_jobs=4)
knn_clf.fit(X_train, y_train)
knn_preds = knn_clf.predict(X_test).astype(int)
print(set(knn_preds))

  return self._fit(X, y)


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}


In [17]:
# Initializing Multilayer Perceptron, fitting and then predicting
mlp_clf = MLPClassifier(hidden_layer_sizes=(50), batch_size=50, learning_rate='constant', learning_rate_init=0.0005, early_stopping=True)
mlp_clf.fit(X_train, y_train)
mlp_preds = mlp_clf.predict(X_test).astype(int)
print(set(mlp_preds))

  return f(*args, **kwargs)


{0, 1, 3, 4, 6, 7, 8, 10, 11}


In [21]:
N = 7
models = [
    cat_model,
    rfc_clf,
    ada_clf,
    bernoulli_clf,
    gaussian_clf,
    knn_clf,
    mlp_clf
]

predictions = [
    ('catboost', cat_preds),
    ('random_forest', rfc_preds),
    ('adaboost', ada_preds),
    ('bernoulli', bernoulli_preds),
    ('gaussian', gaussian_preds),
    ('knn', knn_preds),
    ('mlp', mlp_preds)
]

def evaluate_model(X, y, model):
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    return np.mean(scores)

for i in range(N):
    name, preds = predictions[i]
    score = evaluate_model(models[i])
    print(f'Model = {name}, score = {score}')
    make_sub(name + str(score), )

Model = catboost
Making submission ...
File name = LAST_catboost_10:52:59
Model = random_forest
Making submission ...
File name = LAST_random_forest_10:52:59
Model = adaboost
Making submission ...
File name = LAST_adaboost_10:52:59
Model = bernoulli
Making submission ...
File name = LAST_bernoulli_10:52:59
Model = gaussian
Making submission ...
File name = LAST_gaussian_10:52:59
Model = knn
Making submission ...
File name = LAST_knn_10:52:59
Model = mlp
Making submission ...
File name = LAST_mlp_10:52:59


In [None]:
for name, preds in predictions: