**The objective of this challenge is to forecast the number of turtles caught per week per capture site. This will help Local Ocean plan their staff schedules and budget.**

In [45]:
import pandas as pd

# Raise the display limits so wide and long frames are not truncated in cell output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

In [34]:
# Load the three input files from the current working directory.
turtle = pd.read_csv(filepath_or_buffer="train.csv")
# NOTE(review): the printed descriptions contain replacement characters where
# quotes are expected, so this file is presumably not UTF-8 encoded -- confirm
# the real encoding and pass it explicitly instead of relying on the parser.
varDesc = pd.read_csv(filepath_or_buffer="variable_definitions.csv", engine="python")
submissionSample = pd.read_csv(filepath_or_buffer="Sample_sub.csv")

In [20]:
# Preview the first five capture records.
turtle.head(n=5)

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,Tag_2,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,,,,64.7,62.6,,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00
1,2001_RE_0187,2001-10-28,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_520,LandingSite_CaptureSiteCategory_2,Species_6,W442,,,,35.85,31.35,,Unknown,multiple b's on front flippers& a lot of alga...,Released,ReleaseSite_62,28/10/01
2,2001_RE_0197,2001-11-01,Researcher_6,CaptureSite_0,Ocean,Net,Fisher_1669,LandingSite_CaptureSiteCategory_2,Species_5,KE0376,,,,51.8,49.2,,Unknown,clean,Released,ReleaseSite_50,01/11/01
3,2002_RE_0031,2002-03-11,Researcher_32,CaptureSite_0,Ocean,Net,Fisher_1798,LandingSite_CaptureSiteCategory_2,Species_6,CC00302,,,,60.5,59.0,,Unknown,1 b 3 CS+ calcerous algae at rear end of shell...,Released,ReleaseSite_50,11/03/02
4,2002_RE_0118,2002-08-08,Researcher_25,CaptureSite_0,Ocean,Beached,Fisher_1918,LandingSite_CaptureSiteCategory_2,Species_5,NotTagged_0113,,,,34.7,33.0,,Unknown,very lively+ right eye is hanging out + swolle...,Released,ReleaseSite_62,08/08/02


In [91]:
# Replace missing weights with 0; only the Weight_Kg column is touched.
# NOTE(review): 0 acts as a placeholder here, so any later mean of Weight_Kg
# is pulled towards 0 -- confirm this is the intended imputation.
turtle["Weight_Kg"] = turtle["Weight_Kg"].fillna(0)

In [33]:
# Print every variable name followed by its description, with blank lines between entries.
for variable, description in zip(varDesc["Variables"], varDesc["Description"]):
    print(variable, description, "\n", sep="\n")

Rescue_ID
It�s an individual bycatch incidence identity number. The numbers are consecutive, for each year e.g. �2018_RE_0732 means rescue number 732 in year 2018


Date_TimeCaught
Date the turtle is captured e.g. 06/01/2018 


Researcher
Name of bycatch officer(s) involved with specific rescue


Capture Site
Area where turtle was captured, as reported by the fisher.


Foraging Ground
General area of ocean area where turtle was captured. The assumption is that the turtle was foraging where it was captured. The foraging area is broadly classified either as the open ocean or creek section


Capture Method
Fishing gear or method used by fishers to capture the turtle


Fisher
Name of the fisher who captured the turtle


Landing_Site
Section of beach where turtle is landed.


Species
Species of turtle (e.g. green turtle, hawksbill, loggerhead etc.)


Tag_1
Individual / unique number used to identify a turtle. Each turtle that is captured is tagged. Formats of tag numbers have been changed o

In [51]:
# Label-based slice (both ends inclusive) showing the expected ID / Capture_Number layout.
first_label, last_label = 659, 704
submissionSample.loc[first_label:last_label]

Unnamed: 0,ID,Capture_Number
659,CaptureSite_21_201944,4
660,CaptureSite_22_201901,6
661,CaptureSite_22_201902,4
662,CaptureSite_22_201903,4
663,CaptureSite_22_201904,3
664,CaptureSite_22_201905,8
665,CaptureSite_22_201906,3
666,CaptureSite_22_201907,0
667,CaptureSite_22_201908,3
668,CaptureSite_22_201909,2


In [37]:
# List the distinct capture sites present in the training data.
turtle.loc[:, "CaptureSite"].unique()

array(['CaptureSite_0', 'CaptureSite_1', 'CaptureSite_10',
       'CaptureSite_11', 'CaptureSite_12', 'CaptureSite_13',
       'CaptureSite_14', 'CaptureSite_15', 'CaptureSite_16',
       'CaptureSite_17', 'CaptureSite_18', 'CaptureSite_19',
       'CaptureSite_2', 'CaptureSite_20', 'CaptureSite_21',
       'CaptureSite_22', 'CaptureSite_23', 'CaptureSite_24',
       'CaptureSite_25', 'CaptureSite_26', 'CaptureSite_27',
       'CaptureSite_28', 'CaptureSite_3', 'CaptureSite_4',
       'CaptureSite_5', 'CaptureSite_6', 'CaptureSite_7', 'CaptureSite_8',
       'CaptureSite_9'], dtype=object)

In [56]:
# Parse the capture dates into datetimes, then display the resulting column dtype.
caught_on = pd.to_datetime(turtle["Date_TimeCaught"])
turtle["Date_TimeCaught"] = caught_on
turtle.dtypes["Date_TimeCaught"]

dtype('<M8[ns]')

In [59]:
# Derive string year and week-of-year keys from the parsed capture date.
# The vectorised .dt.strftime accessor replaces the row-wise apply/lambda: it
# produces the same strings and yields a missing value (instead of raising)
# for any NaT entry.
# NOTE(review): "%W" counts weeks from the first Monday of the year and gives
# "00" for the days before it; confirm this matches the week numbering used in
# the submission IDs (e.g. "CaptureSite_22_201901").
caught = turtle["Date_TimeCaught"]
turtle["Date_TimeCaughtYear"] = caught.dt.strftime("%Y")
turtle["Date_TimeCaughtWeek"] = caught.dt.strftime("%W")

turtle[["Date_TimeCaughtYear", "Date_TimeCaughtWeek"]].head()

Unnamed: 0,Date_TimeCaughtYear,Date_TimeCaughtWeek
0,2000,51
1,2001,43
2,2001,44
3,2002,10
4,2002,31


In [81]:
# Frequency of each primary tag value, most common first.
tag_counts = turtle["Tag_1"].value_counts()
tag_counts

None              125
KES1306           116
4858               90
KE6133             81
KE8098             81
                 ... 
KE0838              1
NotTagged_0183      1
KE6047              1
NotTagged_0349      1
KES0236             1
Name: Tag_1, Length: 8236, dtype: int64

In [93]:
# Restrict to captures from the year 2000; the year column holds strings.
caught_in_2000 = turtle["Date_TimeCaughtYear"].eq("2000")
millenial = turtle.loc[caught_in_2000]
millenial.head()

Unnamed: 0,Rescue_ID,Date_TimeCaught,Researcher,CaptureSite,ForagingGround,CaptureMethod,Fisher,LandingSite,Species,Tag_1,Tag_2,Lost_Tags,T_Number,CCL_cm,CCW_cm,Weight_Kg,Sex,TurtleCharacteristics,Status,ReleaseSite,Date_TimeRelease,Date_TimeCaughtWeek,Date_TimeCaughtYear
0,2000_RE_0060,2000-12-22,Researcher_25,CaptureSite_0,Ocean,Net,Fisher_1072,LandingSite_CaptureSiteCategory_2,Species_6,CC00147,,,,64.7,62.6,0.0,Unknown,algae at rear of shell,Released,ReleaseSite_50,22/12/00,51,2000
888,2000_RE_0057,2000-12-16,Researcher_25,CaptureSite_10,Ocean,Net,Fisher_1593,LandingSite_CaptureSiteCategory_2,Species_6,CC00144,,,,43.25,37.7,0.0,Unknown,algae on shell (esp'ly rear)& overlaps worn an...,Released,ReleaseSite_32,16/12/00,50,2000
889,2000_RE_0058,2000-12-18,Researcher_25,CaptureSite_10,Ocean,Net,Fisher_1593,LandingSite_CaptureSiteCategory_2,Species_6,CC00145,,,,46.2,42.6,0.0,Unknown,calcium deposit () 1&2 LLS& algae at rear shell,Released,ReleaseSite_45,18/12/00,51,2000
1367,2000_RE_0001,2000-02-03,Researcher_9,CaptureSite_11,Creek,Not_Recorded,Fisher_1058,LandingSite_CaptureSiteCategory_0,Species_5,NotTagged_0029,,,,45.72,43.18,0.0,Unknown,Slight break in 3rd CS and 2RMS,,ReleaseSite_19,,5,2000
1368,2000_RE_0002,2000-02-18,Researcher_37,CaptureSite_11,Creek,Net,Fisher_1716,LandingSite_CaptureSiteCategory_0,Species_5,Y423,,,,51.0,48.0,0.0,Unknown,"1 barnacle on right rear scute, 1 barnacle on ...",Released,ReleaseSite_66,18/02/00,7,2000


In [94]:
# Summarise the year-2000 captures per (year, week, capture site).
group_keys = ["Date_TimeCaughtYear", "Date_TimeCaughtWeek", "CaptureSite"]

# Counts for the records themselves, distinct counts for the people and species
# involved, and means for the size and weight measurements.
summary_spec = {
    "Rescue_ID": "count",
    "Researcher": "nunique",
    "ForagingGround": "count",
    "Fisher": "nunique",
    "Species": "nunique",
    "CCL_cm": "mean",
    "CCW_cm": "mean",
    "Weight_Kg": "mean",
}

weekly_groups = millenial.groupby(group_keys)
weekly_groups.agg(summary_spec).reset_index()

Unnamed: 0,Date_TimeCaughtYear,Date_TimeCaughtWeek,CaptureSite,Rescue_ID,Researcher,ForagingGround,Fisher,Species,CCL_cm,CCW_cm,Weight_Kg
0,2000,5,CaptureSite_11,1,1,1,1,1,45.72,43.18,0.0
1,2000,7,CaptureSite_11,1,1,1,1,1,51.0,48.0,0.0
2,2000,8,CaptureSite_23,1,1,1,1,1,43.18,38.1,0.0
3,2000,9,CaptureSite_11,2,1,2,2,1,44.0,41.5,0.0
4,2000,13,CaptureSite_16,1,1,1,1,1,5.08,5.08,0.0
5,2000,16,CaptureSite_19,1,1,1,1,1,5.08,5.08,0.0
6,2000,17,CaptureSite_16,1,1,1,1,1,43.18,40.64,0.0
7,2000,20,CaptureSite_11,1,1,1,1,1,8.0,8.0,0.0
8,2000,21,CaptureSite_23,1,1,1,1,1,62.0,58.0,0.0
9,2000,23,CaptureSite_11,1,1,1,1,1,9.0,8.0,0.0


In [95]:
# Distinct values recorded for Sex, including any missing-value markers.
turtle.loc[:, "Sex"].unique()

array(['Unknown', nan, 'Female', 'Male', 'Not_Recorded'], dtype=object)