# Different Naive Approaches to get up to 0.20567.
In this notebook we provide 4 naive approaches that achieve a relatively high score. All the approaches are based on the provided observation metadata and do not need any training.

## Loading Metadata
First we load all the metadata and import some very basic libraries.

In [2]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# Read the three metadata tables (presence-only train, presence-absence train,
# presence-absence test). The same explicit dtype request for `speciesId` is
# passed to every read, so it is spelled out once and shared.
species_dtype = {"speciesId": int}

po_train = pd.read_csv("../metadata/PO_metadata_train.csv", dtype=species_dtype)
pa_train = pd.read_csv("../metadata/PA_metadata_train.csv", dtype=species_dtype)
test = pd.read_csv("../metadata/PA_metadata_test.csv", dtype=species_dtype)

# 🏃💨 Presence Absence Based Approaches

# Top-k experiment within all Presence-Absence data

In [11]:
# Species frequencies over the whole presence-absence training set.
# Computed once and reused for every k instead of being recounted per cell.
pa_species_counts = pa_train.speciesId.value_counts()

# One submission per k in 5, 10, ..., 40. Every test survey receives the same
# prediction string: the k most frequent species ids, separated by spaces.
for k in range(5, 45, 5):
    test['predictions'] = ' '.join(str(species) for species in pa_species_counts.nlargest(k).index)

    submission = test[['surveyId', 'predictions']]
    submission.to_csv(f'../submissions/submission-top{k}-pa.csv', index=False)

# Top-k experiment with district

In [30]:
def get_topk_pa_species(district, k=25):
    """Return the k most frequent species of one district as a string.

    The lookup uses the notebook-level `pa_train` frame, restricted to rows
    whose `district` column equals `district`.

    Args:
        district: Value compared against `pa_train.district`.
        k: Maximum number of species ids to return.

    Returns:
        The selected species ids joined by single spaces, in ascending id
        order (not in frequency order), or the placeholder "0" when no row
        matches.
    """
    in_district = pa_train.district == district
    species_counts = pa_train.loc[in_district, "speciesId"].value_counts()
    top_species = sorted(species_counts.nlargest(k).index)
    if not top_species:
        return "0"
    return " ".join(str(species_id) for species_id in top_species)

In [6]:
def predict_topk_by_district(k):
    """Build the per-survey prediction strings for the district-based baseline.

    Args:
        k: Number of species requested from `get_topk_pa_species` per district.

    Returns:
        A list with one prediction string per row of `test`, in row order.
    """
    # The prediction depends only on the district, so each distinct district
    # is queried once and the result is reused for every survey located in it.
    predictions_by_district = {}
    predictions = []
    for district in test["district"]:
        if district not in predictions_by_district:
            predictions_by_district[district] = get_topk_pa_species(district, k=k)
        predictions.append(predictions_by_district[district])
    return predictions


# One submission per k in 5, 10, ..., 40.
for k in range(5, 45, 5):
    test["predictions"] = predict_topk_by_district(k)
    submission = test[["surveyId", "predictions"]]

    submission.to_csv(f"../submissions/submission-pa-district-top{k}.csv", index=False)

# Top-k experiment with district & bio-region

In [35]:
def get_topk_pa_species(district, region=None, k=25, train=None):
    """Return the k most frequent species of a district (and region) as a string.

    This definition replaces the earlier district-only function of the same
    name. `region` is optional so that district-only calls such as
    `get_topk_pa_species(district, k=5)` keep working after this cell has run.

    Args:
        district: Value compared against the `district` column.
        region: Value compared against the `region` column. When None, rows
            are filtered by district only.
        k: Maximum number of species ids to return.
        train: Frame with `district`, `region` and `speciesId` columns to
            search. When None, the notebook-level `pa_train` frame is used.

    Returns:
        The selected species ids joined by single spaces, in ascending id
        order (not in frequency order), or the placeholder "0" when no row
        matches.
    """
    if train is None:
        train = pa_train

    mask = train.district == district
    if region is not None:
        mask = (train.region == region) & mask

    top_species = sorted(train.loc[mask, "speciesId"].value_counts().nlargest(k).index)
    if not top_species:
        return "0"
    return " ".join(str(species_id) for species_id in top_species)

In [39]:
def predict_topk_by_district_region(k):
    """Build the per-survey prediction strings for the district + region baseline.

    Args:
        k: Number of species requested from `get_topk_pa_species` per
            (district, region) pair.

    Returns:
        A list with one prediction string per row of `test`, in row order.
    """
    # The prediction depends only on the (district, region) pair, so each
    # distinct pair is queried once and reused for every survey sharing it.
    predictions_by_pair = {}
    predictions = []
    for district, region in zip(test["district"], test["region"]):
        pair = (district, region)
        if pair not in predictions_by_pair:
            predictions_by_pair[pair] = get_topk_pa_species(district, region, k=k)
        predictions.append(predictions_by_pair[pair])
    return predictions


# One submission per k in 5, 10, ..., 40.
for k in range(5, 45, 5):
    test["predictions"] = predict_topk_by_district_region(k)
    submission = test[["surveyId", "predictions"]]

    submission.to_csv(f"../submissions/submission-pa-district+region-top{k}.csv", index=False)