In [None]:
%matplotlib inline
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's train and test tables, using the `row_id` column as the index.
train = pd.read_csv(
    "/kaggle/input/tabular-playground-series-feb-2022/train.csv",
    index_col='row_id',
)
test = pd.read_csv(
    "/kaggle/input/tabular-playground-series-feb-2022/test.csv",
    index_col='row_id',
)

In [None]:
# De-duplicate the training rows and record how often each distinct row occurred,
# so the multiplicity can be used as a sample weight later on.
#
# `drop_duplicates()` keeps the first occurrence of every row, in original order;
# `reset_index()` turns the `row_id` index into a column and leaves a 0..n-1 index.
train_dd = train.drop_duplicates().reset_index()

# The weights must be in the SAME order as `train_dd`, because later cells select
# them with boolean masks built from `train_dd`. `value_counts()` orders its result
# by descending count, which does not match the first-appearance order above.
# Grouping with `sort=False` keeps the groups in order of first appearance instead.
train_wt = (
    train.groupby(list(train.columns), sort=False)
    .size()
    .reset_index(drop=True)
)

# One weight per de-duplicated row; a mismatch means the two views disagree
# (for example because of missing values in the grouping columns).
assert len(train_wt) == len(train_dd), "sample weights do not line up with train_dd"

In [None]:
# Show the de-duplicated training frame next to its per-row weights.
(train_dd, train_wt)

In [None]:
from math import factorial
def bias(w, x, y, z):
    """Return 10! / (w! * x! * y! * z! * 4**10) as a float.

    When w + x + y + z == 10 this is the multinomial probability of drawing
    exactly w, x, y and z items of four equally likely kinds in 10 draws.

    Args:
        w, x, y, z: non-negative integer counts.

    Returns:
        The quotient described above.

    Raises:
        ValueError: if any count is negative (raised by ``factorial``).
    """
    # Build the denominator with exact integer arithmetic; the only floating
    # point operation is the final division.
    denominator = 4**10
    for count in (w, x, y, z):
        denominator *= factorial(count)
    return factorial(10) / denominator

import re
# Column names have the form A<a>T<t>G<g>C<c>; the four captured groups are the counts.
# A raw string is required here: "\d" in a plain literal is an invalid escape
# sequence, which Python reports as a DeprecationWarning/SyntaxWarning.
hist_re = re.compile(r"A(\d+)T(\d+)G(\d+)C(\d+)")

# Map every feature column (all columns of `train` except the last one) to the
# bias computed from the four counts encoded in its name.
biases = {
    col: bias(*(int(count) for count in hist_re.match(col).groups()))
    for col in train.columns[:-1]
}

In [None]:
# Turn the spectra into integers: add each column's bias, scale by one million,
# round and cast to int.
# NOTE(review): presumably this undoes a bias subtraction applied when the data
# set was generated — confirm against the competition's data description.
#
# The feature columns are taken from `train` for both frames. A later cell adds a
# `gcd` column to `test`, so iterating over `test.columns` would raise a KeyError
# in `biases` whenever this cell is re-run after that one.
feature_cols = train.columns[:-1]


def _to_int_counts(frame):
    """Return `frame`'s feature columns as round((value + bias) * 1000000), cast to int."""
    return pd.DataFrame(
        {col: ((frame[col] + biases[col]) * 1000000).round().astype(int) for col in feature_cols}
    )


train_i = _to_int_counts(train_dd)
test_i = _to_int_counts(test)

# Preview of the per-row totals of the integer features.
train_i.sum(axis=1).head()

In [None]:
# Greatest common divisor of each row's integer features, stored as a `gcd` column
# on the de-duplicated training frame and on the test frame.
train_row_gcd = np.gcd.reduce(train_i, axis=1)
test_row_gcd = np.gcd.reduce(test_i, axis=1)

train_dd = train_dd.assign(gcd=train_row_gcd)
test = test.assign(gcd=test_row_gcd)

train_dd.head()

In [None]:
from sklearn import preprocessing
# Fit a label encoder on the training targets; it is used below to turn the
# class labels into integers for colouring the scatter plots.
target_labels = train_dd["target"]
le = preprocessing.LabelEncoder()
le.fit(target_labels)

In [None]:
from sklearn import decomposition
from matplotlib import pyplot
# Project all de-duplicated training rows onto their first two whitened principal
# components and colour each point by its encoded target.
# `random_state` is pinned (same value as the per-GCD PCA further down) because,
# depending on the scikit-learn version, the automatic solver choice can be the
# randomized one, which would make the picture change between runs.
xy = decomposition.PCA(n_components=2, whiten=True, random_state=1).fit_transform(train_i)

pyplot.scatter(xy[:,0], xy[:,1], c=le.transform(train_dd["target"]))
pyplot.title("PCA of all de-duplicated training rows")
pyplot.xlabel("PCA component 1")
pyplot.ylabel("PCA component 2")
pyplot.show()

In [None]:
# One PCA scatter plot per distinct gcd value, in ascending gcd order.
for scale in np.sort(train_dd['gcd'].unique()):
    # Training rows whose gcd equals this value (computed once and reused below).
    in_scale = train_dd['gcd'] == scale
    X_scale = train_i[in_scale]

    # Compute the PCA on this subset and transform it so the components can be plotted.
    # The test rows are deliberately not transformed here: the result was never used,
    # and transforming fails when a gcd value has no test rows.
    pca = decomposition.PCA(whiten=True, random_state=1)
    pca.fit(X_scale)
    Xt_tr = pca.transform(X_scale)

    # Plot a scattergram, projected to two PCA components, colored by classification target
    pyplot.figure(figsize=(6,6))
    pyplot.scatter(Xt_tr[:,0], Xt_tr[:,1], c=le.transform(train_dd.loc[in_scale,"target"]), s=1)
    pyplot.title(f"{1000000 // scale} decamers ({in_scale.sum()} samples with gcd = {scale})")
    pyplot.xlabel("PCA component 1")
    pyplot.ylabel("PCA component 2")
    pyplot.show()

So maybe we should just train on the ones with GCD=1?

Or train separate classifiers depending on the GCD?

In [None]:
# Partition features, targets, sample weights and test features by gcd value.
gcds = train_dd["gcd"].unique()

X_split, y_split, wt_split, test_split = {}, {}, {}, {}
for g in gcds:
    train_rows = train_dd["gcd"] == g
    X_split[g] = train_i.loc[train_rows]
    y_split[g] = train_dd.loc[train_rows, "target"]
    wt_split[g] = train_wt.loc[train_rows]
    test_split[g] = test_i.loc[test["gcd"] == g]

# Sanity check: the three training pieces for gcd == 10 should have the same row count.
X_split[10].shape, y_split[10].shape, wt_split[10].shape

In [None]:
from sklearn import pipeline, preprocessing, ensemble, model_selection

#grid_params = {
#    'extratreesclassifier__n_estimators': [50,100,250,500,1000]
#}

#model_split = {g:model_selection.GridSearchCV(
#    pipeline.make_pipeline(preprocessing.StandardScaler(),
#    ensemble.ExtraTreesClassifier()), grid_params, cv=5) for g in gcds}

### parameters extracted by GridSearchCV
# Number of trees per gcd value (see the commented-out grid search above).
n_estimators = {
    1:50,
    10:100,
    1000:1000,
    10000:1000
}

# One scaler + extra-trees pipeline per gcd value.
# `random_state` is pinned so that re-running the notebook reproduces the same
# forests and therefore the same submission.
model_split = {
    g: pipeline.make_pipeline(
        preprocessing.StandardScaler(),
        ensemble.ExtraTreesClassifier(n_estimators=n_estimators[g], random_state=1),
    )
    for g in gcds
}

# Fit each pipeline on its own gcd subset, passing the per-row weights through
# to the classifier step.
for g in gcds:
    model_split[g].fit(X_split[g], y_split[g], extratreesclassifier__sample_weight=wt_split[g])
    #print(f"Model on GCD={g}, accuracy: {model_split[g].best_score_}")


In [None]:
#for g in gcds:
    #model_split[g].fit(X_split[g],y_split[g],extratreesclassifier__sample_weight=wt_split[g])
    #print(f"Model on GCD={g}, parameters: {model_split[g].best_params_}")

In [None]:
# Start from an empty frame that only carries the test index, plus a blank target column.
submission = test[[]].assign(target="")

# Every test row needs a model trained for its gcd value; otherwise its target
# would silently stay "" and the submission file would be invalid.
has_model = test["gcd"].isin(gcds)
assert has_model.all(), f"{(~has_model).sum()} test rows have a gcd with no trained model"

# Fill in the predictions group by group, using the model trained on the same gcd.
for g in gcds:
    rows = test["gcd"] == g
    if not rows.any():
        # No test rows for this gcd: skip, since scikit-learn rejects empty input.
        continue
    submission.loc[rows, "target"] = model_split[g].predict(test_i.loc[rows])

In [None]:
# Display the assembled submission frame as a visual check before it is written out.
submission

In [None]:
#test[["row_id"]].assign(target=ypred).set_index("row_id").to_csv("submission.csv")
# Write the predictions to the working directory; the frame's index is written
# as the first column.
submission.to_csv(path_or_buf="submission.csv")

In [None]:
nan