In [71]:
import pandas as pd
from skbio.diversity import beta_diversity
from skbio.stats.distance import permanova
from skbio.stats.distance import permdisp
import numpy as np

In [72]:
# PERIODS FOR JUST CLOVER: inclusive (first day, last day) bounds, numbered 1, 2, 3 in order
periods = [
    (pd.to_datetime(first_day).date(), pd.to_datetime(last_day).date())
    for first_day, last_day in (
        ('2024-07-11', '2024-07-17'),
        ('2024-07-18', '2024-07-23'),
        ('2024-07-24', '2024-07-29'),
    )
]


def which_period(date):
    """Return the 1-based period number containing ``date``, as a string.

    Parameters
    ----------
    date : datetime.date
        Day to classify. It is compared against the ``datetime.date`` bounds
        stored in the module-level ``periods`` list; both bounds are inclusive.

    Returns
    -------
    str or float
        ``'1'``, ``'2'``, ... for the first period whose bounds contain
        ``date``; ``np.nan`` when no period contains it.
    """
    # Lazily walk the periods in order; the first hit wins.
    hits = (
        str(number)
        for number, (first_day, last_day) in enumerate(periods, start=1)
        if first_day <= date <= last_day
    )
    return next(hits, np.nan)

In [73]:
# ASSIGN PERIOD NUMBER TO IMAGE COUNTS DURING CLOVER SAMPLING

pi_date_color_counts = pd.read_csv('csvs/pi_date_color_counts.csv', index_col=False)
# Reduce timestamps to plain datetime.date objects so they compare directly with the period bounds.
pi_date_color_counts['date'] = pd.to_datetime(pi_date_color_counts['date']).dt.date
# Image count expressed in units of 10,000 images.
pi_date_color_counts['count_per_10k'] = pi_date_color_counts['count']/10000

# Keep only rows dated strictly before this cutoff.
threshold = pd.to_datetime('2024-07-30').date()
# .copy() makes the subset an independent frame, so adding 'period' below no longer
# triggers SettingWithCopyWarning (the original assigned into a slice of the full table).
pi_date_color_counts_clover = pi_date_color_counts.loc[pi_date_color_counts['date'] < threshold].copy()
# which_period yields '1'/'2'/'3', or NaN for dates outside every period.
pi_date_color_counts_clover['period'] = pi_date_color_counts_clover['date'].apply(which_period)
pi_date_color_counts_clover

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_date_color_counts_clover['period'] = pi_date_color_counts_clover['date'].apply(which_period)


Unnamed: 0,color,date,pi,count,count_per_10k,period
0,blu,2024-07-11,8,18400,1.8400,1
1,blu,2024-07-12,8,18400,1.8400,1
2,blu,2024-07-13,3,8255,0.8255,1
3,blu,2024-07-13,8,18269,1.8269,1
4,blu,2024-07-14,3,35091,3.5091,1
...,...,...,...,...,...,...
454,yel,2024-07-27,8,46365,4.6365,3
455,yel,2024-07-28,4,46381,4.6381,3
456,yel,2024-07-28,8,46429,4.6429,3
457,yel,2024-07-29,4,46398,4.6398,3


In [74]:
# ASSIGN PERIOD TO VISIT TIMES
# Load the visit table, drop unidentified bees, then derive each visit's calendar
# date from 'visit_start' and map that date to its period number.

visit_durations_clover = (
    pd.read_csv('csvs/visit_durations_clover.csv', index_col=False)
    .loc[lambda visits: visits['species'] != "unk"]  # IGNORE UNK BEES
    .assign(date=lambda visits: pd.to_datetime(visits['visit_start']).dt.date)
    .assign(period=lambda visits: visits['date'].apply(which_period))
)

visit_durations_clover

Unnamed: 0,species,visit_id,pi,color,visit_start,visit_end,duration,pi_color,date,period
0,fervidus,67.0,9,grn,2024-07-11 07:28:42,2024-07-11 07:28:42,0.0,9_grn,2024-07-11,1
1,vosnesenskii,32.0,2,blv,2024-07-11 07:51:19,2024-07-11 07:51:19,0.0,2_blv,2024-07-11,1
2,vosnesenskii,79.0,6,cnt,2024-07-11 08:09:09,2024-07-11 08:09:09,0.0,6_cnt,2024-07-11,1
3,fervidus,28.0,4,grn,2024-07-11 08:40:37,2024-07-11 08:40:37,0.0,4_grn,2024-07-11,1
4,vosnesenskii,1.0,1,cnt,2024-07-11 08:43:51,2024-07-11 08:43:51,0.0,1_cnt,2024-07-11,1
...,...,...,...,...,...,...,...,...,...,...
274,vosnesenskii,119.0,8,yel,2024-07-29 12:43:35,2024-07-29 12:45:46,131.0,8_yel,2024-07-29,3
275,vosnesenskii,30.0,1,blv,2024-07-29 13:29:24,2024-07-29 13:29:24,0.0,1_blv,2024-07-29,3
276,vosnesenskii,141.0,9,grn,2024-07-23 09:41:43,2024-07-23 10:02:38,1255.0,9_grn,2024-07-23,2
277,vosnesenskii,142.0,9,grn,2024-07-23 10:02:19,2024-07-23 10:07:09,290.0,9_grn,2024-07-23,2


In [75]:
# TOTAL IMAGES PER PI EACH PERIOD
# Sum the 'count' column within each (pi, period) pair; the total is reported as 'n_images'.

im_tot = (pi_date_color_counts_clover
          .groupby(['pi', 'period'])['count']
          .sum()
          .rename('n_images')
          .reset_index())
im_tot

Unnamed: 0,pi,period,n_images
0,1,1,88746
1,1,2,216546
2,1,3,276913
3,2,1,84102
4,2,2,184806
5,2,3,278031
6,3,1,83608
7,3,2,173314
8,3,3,278238
9,4,1,102004


In [76]:
# FOR EACH PI AND PERIOD, HOW MANY TIMES DID EACH SPECIES VISIT
# One output row per (pi, period, species) combination that occurs in the visit table;
# 'n_visits' is the number of visit rows in that group.

n_visits = (visit_durations_clover
            .groupby(['pi', 'period', 'species'], as_index=False)
            .size()
            .rename(columns={'size': 'n_visits'}))
n_visits

Unnamed: 0,pi,period,species,n_visits
0,1,1,fervidus,18
1,1,1,griseocollis,1
2,1,1,vosnesenskii,17
3,1,2,vosnesenskii,7
4,1,3,appositus,1
...,...,...,...,...
59,10,1,vosnesenskii,5
60,10,2,nevadensis,1
61,10,2,vosnesenskii,7
62,10,3,griseocollis,1


In [77]:
# PIVOT WIDE, COLUMNS ARE SPECIES
# One row per (pi, period) pair present in n_visits; species with no visits for a
# pair are filled with 0. Image totals are attached with a left join on (pi, period).

wide = (
    n_visits
    .pivot_table(values='n_visits',
                 index=['pi', 'period'],
                 columns='species',
                 fill_value=0)
    .reset_index()
    .merge(im_tot, how='left', on=['pi', 'period'])
)

# Every column that is not an identifier or the image total is a species column.
species_cols = [col for col in wide.columns if col not in {'pi', 'period', 'n_images'}]

# Convert raw visit counts to visits per 10,000 images (divide row-wise, then scale).
wide[species_cols] = wide[species_cols].div(wide['n_images'], axis='index') * 10000
wide

Unnamed: 0,pi,period,appositus,fervidus,griseocollis,nevadensis,vosnesenskii,n_images
0,1,1,0.0,2.02826,0.112681,0.0,1.915579,88746
1,1,2,0.0,0.0,0.0,0.0,0.323257,216546
2,1,3,0.036112,0.072225,0.0,0.0,0.216675,276913
3,2,1,0.0,0.237806,0.118903,0.0,0.594516,84102
4,2,2,0.0,0.0,0.054111,0.0,0.216443,184806
5,2,3,0.0,0.071934,0.0,0.0,0.107902,278031
6,3,1,0.0,0.0,0.0,0.0,0.239212,83608
7,3,2,0.0,0.057699,0.0,0.0,0.115397,173314
8,3,3,0.0,0.0,0.0,0.0,0.107821,278238
9,4,1,0.0,1.078389,0.0,0.0,0.490177,102004


In [80]:
# UNIQUE ID FOR PI + PERIOD COMBINATION
wide['pi_period'] = wide['pi'].astype(str) + '_' + wide['period']

# COLOR FOR EACH (PI, PERIOD): taken from the first image-count row for that pair.
# NOTE(review): this presumes a pi shows a single color for a whole period; if the
# color can change within a period, only the first row's color is used — confirm.
col_lookup = (pi_date_color_counts_clover
              .drop_duplicates(subset=['pi', 'period'], keep='first')
              .set_index(['pi', 'period'])
              .loc[:, 'color'])
# Row-by-row lookup; a (pi, period) pair missing from col_lookup raises KeyError.
wide['color'] = [col_lookup.loc[key] for key in zip(wide['pi'], wide['period'])]
wide

Unnamed: 0,pi,period,appositus,fervidus,griseocollis,nevadensis,vosnesenskii,n_images,pi_period,color
0,1,1,0.0,2.02826,0.112681,0.0,1.915579,88746,1_1,cnt
1,1,2,0.0,0.0,0.0,0.0,0.323257,216546,1_2,grn
2,1,3,0.036112,0.072225,0.0,0.0,0.216675,276913,1_3,blv
3,2,1,0.0,0.237806,0.118903,0.0,0.594516,84102,2_1,blv
4,2,2,0.0,0.0,0.054111,0.0,0.216443,184806,2_2,blv
5,2,3,0.0,0.071934,0.0,0.0,0.107902,278031,2_3,cnt
6,3,1,0.0,0.0,0.0,0.0,0.239212,83608,3_1,blu
7,3,2,0.0,0.057699,0.0,0.0,0.115397,173314,3_2,blu
8,3,3,0.0,0.0,0.0,0.0,0.107821,278238,3_3,grn
9,4,1,0.0,1.078389,0.0,0.0,0.490177,102004,4_1,grn


In [None]:
# ARRAYS DERIVED FROM THE WIDE TABLE (ONE ENTRY / ROW PER PI + PERIOD SAMPLE)
# .to_numpy() is the accessor pandas recommends over .values: it always returns a
# NumPy ndarray, whereas .values can return an extension array for some dtypes.
grouping = wide['color'].to_numpy()       # color label of each sample
strata = wide['period'].to_numpy()        # period label of each sample
data_mat = wide[species_cols].to_numpy()  # samples x species matrix of visit rates
ids = wide['pi_period'].to_numpy()        # sample ids

In [None]:
# EXPORT FOR USE IN R
# Sample id and the two label columns first, followed by the per-species rate columns.

wide_export = wide.loc[:, ['pi_period', 'color', 'period', *species_cols]]
wide_export.to_csv('csvs/permanova/species_matrix.csv', index=False)