In [1]:
import numpy
import pandas

In [12]:
# One entry per survey: (date stamp used in both file names, days after planting).
# Keeping the stamp and its dap value side by side means the three lists derived
# below always have the same length and the same ordering.
surveys = [
    ('2019_07_02', 36),
    ('2019_07_15', 49),
    ('2019_07_28', 62),
    ('2019_08_12', 77),
    ('2019_09_02', 98),
    ('2019_09_11', 107),
    ('2019_09_24', 120),
    ('2019_10_07', 133),
]

# interpolated file list
lasFile_intrpl = ['MSU_%s_interpolated.csv.gz' % stamp for stamp, _dap in surveys]

# raw file list
lasFile_raw = ['MSU_%s_NatCol_obs_plots.csv.gz' % stamp for stamp, _dap in surveys]

# days after planting
dap_list = [dap for _stamp, dap in surveys]

# Columns of the interpolated data that are clamped to the range observed in
# the raw data of the same plot (shpID).
CLIP_COLS = ['z_position', 'r_record', 'g_record', 'b_record']

# zip() stops at the shortest input without complaint, so a missing entry in
# any of the three lists would silently drop a survey. Fail loudly instead.
if not (len(lasFile_intrpl) == len(lasFile_raw) == len(dap_list)):
    raise ValueError(
        "lasFile_intrpl, lasFile_raw and dap_list must have the same length"
    )

# list collecting one constrained DataFrame per (survey, shpID) group
df_list = []

# for each file pair
for intrplFile, rawFile, dap in zip(lasFile_intrpl, lasFile_raw, dap_list):
    # load raw file; rows containing any missing value are discarded
    raw_df = pandas.read_csv(rawFile).dropna()

    print("Loaded %s" % rawFile)

    # per-shpID observed maximum / minimum of the columns to constrain
    raw_groups = raw_df.groupby("shpID")[CLIP_COLS]
    max_df = raw_groups.max()
    min_df = raw_groups.min()

    # delete raw data for memory efficiency (the groupby object also holds a
    # reference to the frame, so it has to go as well)
    del raw_groups, raw_df

    # load interpolated file
    intrpl_df = pandas.read_csv(intrplFile)

    print("Loaded %s" % intrplFile)

    # add column for days after planting (dap)
    intrpl_df['dap'] = float(dap)

    # every interpolated shpID needs limits from the raw file; report all
    # offenders at once instead of failing on the first lookup
    missing = set(intrpl_df['shpID'].dropna().unique()) - set(min_df.index)
    if missing:
        raise KeyError(
            "shpID(s) %s of %s have no observations in %s"
            % (sorted(missing, key=str), intrplFile, rawFile)
        )

    # constrain values based on max, min of original data, one shpID at a time
    for name, group in intrpl_df.groupby("shpID"):
        # assign() builds a new frame, so the group handed out by groupby is
        # never written to in place
        constrained = group.assign(**{
            col: group[col].clip(
                lower=min_df.at[name, col],
                upper=max_df.at[name, col],
            )
            for col in CLIP_COLS
        })
        # append constrained group
        df_list.append(constrained)

print("Merging datasets")
all_df = pandas.concat(df_list)

Loaded MSU_2019_07_02_NatCol_obs_plots.csv.gz
Loaded MSU_2019_07_02_interpolated.csv.gz
Loaded MSU_2019_07_15_NatCol_obs_plots.csv.gz
Loaded MSU_2019_07_15_interpolated.csv.gz
Loaded MSU_2019_07_28_NatCol_obs_plots.csv.gz
Loaded MSU_2019_07_28_interpolated.csv.gz
Loaded MSU_2019_08_12_NatCol_obs_plots.csv.gz
Loaded MSU_2019_08_12_interpolated.csv.gz
Loaded MSU_2019_09_02_NatCol_obs_plots.csv.gz
Loaded MSU_2019_09_02_interpolated.csv.gz
Loaded MSU_2019_09_11_NatCol_obs_plots.csv.gz
Loaded MSU_2019_09_11_interpolated.csv.gz
Loaded MSU_2019_09_24_NatCol_obs_plots.csv.gz
Loaded MSU_2019_09_24_interpolated.csv.gz
Loaded MSU_2019_10_07_NatCol_obs_plots.csv.gz
Loaded MSU_2019_10_07_interpolated.csv.gz
Merging datasets


In [14]:
# Column-wise maximum of the merged data. The string column shpID takes part
# in the reduction, which is why the displayed result has dtype object.
all_df.max()

x_position                3
y_position             2.44
z_position          769.156
r_record              65280
g_record              65280
b_record              65280
shpID         EL19_fill-9-2
shpIX                  1680
dap                     133
dtype: object

In [15]:
# Column-wise minimum of the merged data. The string column shpID takes part
# in the reduction, which is why the displayed result has dtype object.
all_df.min()

x_position                -3
y_position             -2.44
z_position           716.538
r_record                   0
g_record                   0
b_record                   0
shpID         EL19_5098-1-19
shpIX                      0
dap                       36
dtype: object

In [16]:
# Write the merged data to a gzip-compressed CSV (compression is inferred from
# the ".gz" suffix). index=False states explicitly that the row index is not
# written; the previous index=None only had that effect because None is falsy.
all_df.to_csv("all_interpolated.csv.gz", index=False)

In [17]:
# Preview the first five rows of the merged data.
all_df.head()

Unnamed: 0,x_position,y_position,z_position,r_record,g_record,b_record,shpID,shpIX,dap
17784,-3.0,-2.44,754.300528,35025.162381,33504.094577,33282.496769,EL19_5098-1-19,18,36.0
17785,-2.837838,-2.44,754.333563,32327.390835,31561.644047,30923.573239,EL19_5098-1-19,18,36.0
17786,-2.675676,-2.44,754.368385,43458.396474,42268.66971,42079.945688,EL19_5098-1-19,18,36.0
17787,-2.513514,-2.44,754.403207,43922.821752,43229.006309,42873.437901,EL19_5098-1-19,18,36.0
17788,-2.351351,-2.44,754.438029,45340.60029,44765.812362,45396.340861,EL19_5098-1-19,18,36.0


In [26]:
# Order rows by plot, then survey, then grid position (y before x).
sort_keys = ['shpID', 'dap', 'y_position', 'x_position']
all_df = all_df.sort_values(by=sort_keys)

In [35]:
# Show rows 7900-7909 by position. In the recorded output this window spans
# the change from one shpID (dap 133) to the next shpID (dap 36).
all_df.iloc[7900:7910,:]

Unnamed: 0,x_position,y_position,z_position,r_record,g_record,b_record,shpID,shpIX,dap
18768,2.513514,2.44,754.044,18628.977637,18100.135619,16041.128188,EL19_5098-1-19,18,133.0
18769,2.675676,2.44,760.488,20497.58166,18739.67313,18231.894911,EL19_5098-1-19,18,133.0
18770,2.837838,2.44,754.044,19812.64451,17834.040049,17159.678496,EL19_5098-1-19,18,133.0
18771,3.0,2.44,754.044,19160.463549,16971.717642,16138.739375,EL19_5098-1-19,18,133.0
18772,-3.0,-2.44,754.215858,30887.587953,29189.442996,28712.482903,EL19_5099-1-20,19,36.0
18773,-2.837838,-2.44,754.241521,38516.37192,37303.917638,36817.251933,EL19_5099-1-20,19,36.0
18774,-2.675676,-2.44,754.268116,44392.418124,43248.595611,42907.850841,EL19_5099-1-20,19,36.0
18775,-2.513514,-2.44,754.352053,43337.359841,42653.434343,41660.764282,EL19_5099-1-20,19,36.0
18776,-2.351351,-2.44,754.097663,43159.679983,42557.883023,41090.625305,EL19_5099-1-20,19,36.0
18777,-2.189189,-2.44,754.284473,26154.140162,42993.164552,9728.0,EL19_5099-1-20,19,36.0


In [19]:
# Feature columns of the PCA input, in output column order:
#   0: x_position, 1: y_position, 2: z_position,
#   3: g_record - r_record, 4: b_record, 5: dap
feature_columns = (
    all_df['x_position'].values,
    all_df['y_position'].values,
    all_df['z_position'].values,
    all_df['g_record'].values - all_df['r_record'].values,
    all_df['b_record'].values,
    all_df['dap'].values,
)

# one row per record of all_df, one float64 column per feature
all_mat = numpy.empty((len(all_df), 6), dtype = 'float64')
for col_ix, values in enumerate(feature_columns):
    all_mat[:,col_ix] = values

In [20]:
from sklearn.decomposition import PCA

In [21]:
# Fit a 3-component PCA on the 6-column feature matrix all_mat.
# NOTE(review): all_mat is passed in as built, without centring/scaling by
# column spread. In the recorded output the first two components load almost
# entirely on columns 3 and 4 (g_record - r_record and b_record) and carry
# essentially all explained variance -- confirm whether the features should be
# standardized before fitting.
pca = PCA(n_components=3)
pca.fit(all_mat)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [24]:
# Print the fraction of variance explained by each component, then the
# component loadings (one row per component, one column per all_mat column).
print(pca.explained_variance_ratio_, pca.components_, sep="\n")

[5.70245668e-01 4.29750659e-01 3.62952193e-06]
[[ 4.72825970e-07  5.24685826e-06 -2.98098286e-05 -2.28866390e-01
   9.73457701e-01 -5.28276845e-04]
 [-1.10137155e-07 -3.95171172e-07 -1.41857149e-05 -9.73457787e-01
  -2.28866156e-01  4.69000688e-04]
 [ 4.54010957e-05  4.56224457e-04  1.76899476e-02  3.35230483e-04
   6.21949882e-04  9.99843166e-01]]


In [25]:
# NOTE(review): this cell previously held only the incomplete expression
# `pca.` (a SyntaxError that stops Restart & Run All). It now shows the number
# of input features seen by the fitted PCA -- confirm this is the attribute
# that was intended.
pca.components_.shape[1]


6