# Different Naive Approaches to get up to 0.20567.
In this notebook we provide 4 additional approaches to achieve a relatively high score. All the approaches are based on the provided observation metadata and do not need any training.

## Loading Metadata
First we load all the metadata and import some very basic libraries.

In [1]:
import pandas as pd
from tqdm import tqdm

# Register tqdm's progress-bar helpers on pandas objects.
tqdm.pandas()

# Read the three metadata tables; `speciesId` is forced to an integer dtype.
po_train = pd.read_csv(
    "../metadata/PO-metadata-train.csv",
    dtype={"speciesId": int},
)
pa_train = pd.read_csv(
    "../metadata/PA-metadata-train.csv",
    dtype={"speciesId": int},
)
test = pd.read_csv(
    "../metadata/PA-metadata-test.csv",
    dtype={"speciesId": int},
)

# 🏃💨 Presence Absence Based Approaches

## 🏃💨 First Approach: Top-25 species in PA [**0.11725**]

In [2]:
# Every survey receives the same prediction: the 25 species that occur most
# often in the PA training table, joined with single spaces.
test['predictions'] = ' '.join(
    map(str, pa_train.speciesId.value_counts().nlargest(25).index)
)

# Keep only the two columns required for a submission, write them, and display.
submission = test[['surveyId','predictions']]
submission.to_csv('../submissions/submission-top25-pa.csv', index=False)
submission

Unnamed: 0,surveyId,predictions
0,642,540 4397 254 4499 10317 2885 1964 10600 10073 ...
1,1792,540 4397 254 4499 10317 2885 1964 10600 10073 ...
2,3256,540 4397 254 4499 10317 2885 1964 10600 10073 ...
3,3855,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4,4889,540 4397 254 4499 10317 2885 1964 10600 10073 ...
...,...,...
4711,3915838,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4712,3916502,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4713,3917793,540 4397 254 4499 10317 2885 1964 10600 10073 ...
4714,3918865,540 4397 254 4499 10317 2885 1964 10600 10073 ...


## 🏃💨 Second Approach: Top-25 species per Country (PA) [**0.16127**]

In [15]:
# !pip install -U ipywidgets swifter

In [12]:
import swifter
tqdm.pandas()

def get_topk_pa_species(country, k=25, train=None):
    """Return the `k` most frequent species of a country as a prediction string.

    Parameters
    ----------
    country
        Value compared for equality against the `country` column of `train`.
    k : int, default 25
        Maximum number of species ids to return.
    train : pandas.DataFrame, optional
        Table with `country` and `speciesId` columns. When omitted, the
        notebook-level `pa_train` table is used, as before.

    Returns
    -------
    str
        Space-separated species ids in ascending id order (not frequency
        order), or the string "0" when no row of `train` matches `country`.
    """
    if train is None:
        # Looked up at call time so the notebook's global table stays the default.
        train = pa_train
    counts = train.loc[train.country == country, "speciesId"].value_counts()
    top_ids = sorted(counts.nlargest(k).index)
    if not top_ids:
        return "0"
    return " ".join(str(species_id) for species_id in top_ids)

# Build one prediction string per test survey from that survey's country.
test["predictions"] = test.swifter.apply(
    lambda row: get_topk_pa_species(row["country"]),
    axis=1,
)

# Keep the two submission columns, write the CSV, and display the frame.
submission = test[["surveyId","predictions"]]
submission.to_csv("../submissions/submission-pa-country-top25.csv", index=False)
submission

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 540 581 1964 2885 4397 4499 4638 6310 6491...
1,1792,262 838 1015 1018 1092 1254 1497 3451 3639 374...
2,3256,146 963 981 1888 2474 4498 4609 4871 5071 5412...
3,3855,963 1677 1736 2386 2474 2715 3125 3166 4659 54...
4,4889,254 540 581 1964 2885 4397 4499 4638 6310 6491...
...,...,...
4711,3915838,254 540 1539 2025 2885 3722 4397 4499 5386 607...
4712,3916502,254 540 1539 2025 2885 3722 4397 4499 5386 607...
4713,3917793,423 1018 1254 1497 1818 3123 3722 4748 4758 53...
4714,3918865,254 540 581 1964 2885 4397 4499 4638 6310 6491...


## 🏃💨💨 Third Approach: Top-25 species per district (PA) [**0.20258**]

In [14]:
import swifter
tqdm.pandas()

def get_topk_pa_species(district, k=25, train=None):
    """Return the `k` most frequent species of a district as a prediction string.

    Parameters
    ----------
    district
        Value compared for equality against the `district` column of `train`.
    k : int, default 25
        Maximum number of species ids to return.
    train : pandas.DataFrame, optional
        Table with `district` and `speciesId` columns. When omitted, the
        notebook-level `pa_train` table is used, as before.

    Returns
    -------
    str
        Space-separated species ids in ascending id order (not frequency
        order), or the string "0" when no row of `train` matches `district`.
    """
    if train is None:
        # Looked up at call time so the notebook's global table stays the default.
        train = pa_train
    counts = train.loc[train.district == district, "speciesId"].value_counts()
    top_ids = sorted(counts.nlargest(k).index)
    if not top_ids:
        return "0"
    return " ".join(str(species_id) for species_id in top_ids)

# Build one prediction string per test survey from that survey's district.
test["predictions"] = test.swifter.apply(
    lambda row: get_topk_pa_species(row["district"]),
    axis=1,
)

# Keep the two submission columns, write the CSV, and display the frame.
submission = test[["surveyId","predictions"]]
submission.to_csv("../submissions/submission-pa-district-top25.csv", index=False)
submission

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 340 540 581 1964 2885 4397 4499 4638 6310 ...
1,1792,0
2,3256,0
3,3855,53 423 963 1162 2184 2474 2799 4109 4112 4734 ...
4,4889,254 540 581 843 958 963 1910 1964 2025 2142 28...
...,...,...
4711,3915838,254 540 581 1170 2025 2885 3361 4483 4492 4499...
4712,3916502,340 540 1545 1716 2025 2761 2885 2922 3226 385...
4713,3917793,1092 1139 1254 1851 2747 3043 5146 5412 6208 6...
4714,3918865,53 254 791 843 963 1964 2025 2823 3294 5114 51...


## 🏆 Fourth Approach: Top-25 species per district & biogeographical zones (PA) [**0.20515**]

In [16]:
def get_topk_pa_species(district, region, k=25, train=None):
    """Return the `k` most frequent species of a district/region pair.

    Parameters
    ----------
    district
        Value compared for equality against the `district` column of `train`.
    region
        Value compared for equality against the `region` column of `train`.
    k : int, default 25
        Maximum number of species ids to return.
    train : pandas.DataFrame, optional
        Table with `district`, `region` and `speciesId` columns. When
        omitted, the notebook-level `pa_train` table is used, as before.

    Returns
    -------
    str
        Space-separated species ids in ascending id order (not frequency
        order), or the string "0" when no row of `train` matches both
        `district` and `region`.
    """
    if train is None:
        # Looked up at call time so the notebook's global table stays the default.
        train = pa_train
    in_zone = (train.region == region) & (train.district == district)
    counts = train.loc[in_zone, "speciesId"].value_counts()
    top_ids = sorted(counts.nlargest(k).index)
    if not top_ids:
        return "0"
    return " ".join(str(species_id) for species_id in top_ids)

# Build one prediction string per test survey from its district and region.
test["predictions"] = test.swifter.apply(
    lambda row: get_topk_pa_species(row["district"], row["region"]),
    axis=1,
)

# Keep the two submission columns, write the CSV, and display the frame.
submission = test[["surveyId","predictions"]]
submission.to_csv("../submissions/submission-pa-district+region.csv", index=False)
submission

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 340 540 581 1964 2885 4397 4499 4638 6310 ...
1,1792,0
2,3256,0
3,3855,53 423 963 1162 2184 2474 2799 4109 4112 4734 ...
4,4889,254 540 958 963 976 1910 1964 2025 2142 2398 2...
...,...,...
4711,3915838,254 540 581 1170 2025 2885 3361 4483 4492 4499...
4712,3916502,340 540 1545 1716 2025 2761 2885 2922 3226 385...
4713,3917793,963 1092 1139 1254 1851 2747 3043 5412 6208 66...
4714,3918865,53 254 791 843 963 1964 2025 2823 3294 5114 51...


# 🏃💨 Presence Only Based Approaches

## 🏃💨 First Approach: Top-25 species in PO [**0.08217**]

In [3]:
# Every survey receives the same prediction: the 25 species that occur most
# often in the PO training table, joined with single spaces.
test['predictions'] = ' '.join(
    map(str, po_train.speciesId.value_counts().nlargest(25).index)
)

# Keep only the two columns required for a submission, write them, and display.
submission = test[['surveyId','predictions']]
submission.to_csv('../submissions/submission-top25-po.csv', index=False)
submission

Unnamed: 0,surveyId,predictions
0,642,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
1,1792,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
2,3256,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
3,3855,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
4,4889,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
...,...,...
4711,3915838,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
4712,3916502,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
4713,3917793,1757 2885 10247 3958 10047 8208 9816 2474 540 ...
4714,3918865,1757 2885 10247 3958 10047 8208 9816 2474 540 ...


## 🏆 Fourth Approach: Top-25 species per district & biogeographical zones (PO) [**xxxxx**]

In [21]:
def get_topk_pa_species(district, region, k=25, train=None):
    """Return the `k` most frequent species of a district/region pair.

    NOTE: despite the "pa" in its name, this version reads the `po_train`
    table by default.

    Parameters
    ----------
    district
        Value compared for equality against the `district` column of `train`.
    region
        Value compared for equality against the `region` column of `train`.
    k : int, default 25
        Maximum number of species ids to return.
    train : pandas.DataFrame, optional
        Table with `district`, `region` and `speciesId` columns. When
        omitted, the notebook-level `po_train` table is used, as before.

    Returns
    -------
    str
        Space-separated species ids in ascending id order (not frequency
        order), or the string "0" when no row of `train` matches both
        `district` and `region`.
    """
    if train is None:
        # Looked up at call time so the notebook's global table stays the default.
        train = po_train
    in_zone = (train.region == region) & (train.district == district)
    counts = train.loc[in_zone, "speciesId"].value_counts()
    top_ids = sorted(counts.nlargest(k).index)
    if not top_ids:
        return "0"
    return " ".join(str(species_id) for species_id in top_ids)

# Build one prediction string per test survey from its district and region.
test["predictions"] = test.swifter.apply(
    lambda row: get_topk_pa_species(row["district"], row["region"]),
    axis=1,
)

# Keep the two submission columns, write the CSV, and display the frame.
submission = test[["surveyId","predictions"]]
submission.to_csv("../submissions/submission-po-district+region.csv", index=False)
submission

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,254 340 540 623 1288 2101 2676 2955 3123 4397 ...
1,1792,320 535 846 1254 1869 1920 2348 2499 2723 4233...
2,3256,0
3,3855,460 617 944 2644 2782 3510 4105 4216 4371 4774...
4,4889,300 581 976 1264 1951 1964 2262 3979 4447 4483...
...,...,...
4711,3915838,296 442 1254 1472 2027 2788 3449 3583 3864 478...
4712,3916502,121 296 1215 1472 1531 1716 2027 2173 3583 385...
4713,3917793,282 569 1139 1597 2747 2776 3043 3470 3594 417...
4714,3918865,53 300 1611 1893 1951 2823 3227 4735 4996 5114...


In [26]:
import swifter
tqdm.pandas()

def get_topk_pa_species(region, k=25, train=None):
    """Return the `k` most frequent species of a region as a prediction string.

    NOTE: despite the "pa" in its name, this version reads the `po_train`
    table by default.

    Parameters
    ----------
    region
        Value compared for equality against the `region` column of `train`.
    k : int, default 25
        Maximum number of species ids to return.
    train : pandas.DataFrame, optional
        Table with `region` and `speciesId` columns. When omitted, the
        notebook-level `po_train` table is used, as before.

    Returns
    -------
    str
        Space-separated species ids in ascending id order (not frequency
        order), or the string "0" when no row of `train` matches `region`.
    """
    if train is None:
        # Looked up at call time so the notebook's global table stays the default.
        train = po_train
    counts = train.loc[train.region == region, "speciesId"].value_counts()
    top_ids = sorted(counts.nlargest(k).index)
    if not top_ids:
        return "0"
    return " ".join(str(species_id) for species_id in top_ids)

# Build one prediction string per test survey from that survey's region.
test["predictions"] = test.swifter.apply(
    lambda row: get_topk_pa_species(row["region"]),
    axis=1,
)

# Keep the two submission columns, write the CSV, and display the frame.
submission = test[["surveyId","predictions"]]
submission.to_csv("../submissions/submission-po-region-top25.csv", index=False)
submission

Pandas Apply:   0%|          | 0/4716 [00:00<?, ?it/s]

Unnamed: 0,surveyId,predictions
0,642,340 581 1288 2664 2847 3861 4447 5398 5928 627...
1,1792,460 569 944 2322 2644 2782 2837 3156 4105 4216...
2,3256,460 569 944 2322 2644 2782 2837 3156 4105 4216...
3,3855,460 569 944 2322 2644 2782 2837 3156 4105 4216...
4,4889,340 581 1288 2664 2847 3861 4447 5398 5928 627...
...,...,...
4711,3915838,1757 1939 2027 2126 2474 2543 2707 2752 2885 3...
4712,3916502,1757 1939 2027 2126 2474 2543 2707 2752 2885 3...
4713,3917793,460 569 944 2322 2644 2782 2837 3156 4105 4216...
4714,3918865,340 581 1288 2664 2847 3861 4447 5398 5928 627...
