# Merge the Energy Label info with the Solar Potential 

In [1]:
from utils import *

Define the location for:
- the directory in which we have the solar potential info
- the filename in which we have the energy label classification info
- the subdir in which we will save the combined information
- the base we will use for creating the filenames of the files in which we will store the combined info

In [2]:
# Root folder (relative to the current working directory) for all data files.
data_dir = os.path.join(os.getcwd(), 'data')

# Inputs: sub-directory with the solar potential files and the file holding
# the energy label classification.
solar_data_subdir, energy_cls_filename = 'solar_ll', 'energy_classes_clean.csv'

# Outputs: sub-directory for the combined data and the base used to build
# the names of the combined files.
combined_data_subdir, out_base_filename = 'out', 'combined_solar_enclass'

Create the paths

In [3]:
# Resolve every configured name against the data root in one pass:
# solar input directory, energy-class input file, combined output directory.
solar_dir, energy_cls_f, combined_out_dir = (
    os.path.join(data_dir, leaf)
    for leaf in (solar_data_subdir, energy_cls_filename, combined_data_subdir)
)

In [4]:
# Collect the full path of every regular file directly inside solar_dir
# (sub-directories are skipped), in sorted order.
solar_f = sorted(
    filter(
        os.path.isfile,
        (os.path.join(solar_dir, entry) for entry in os.listdir(solar_dir)),
    )
)

print('We have {} files'.format(len(solar_f)))

We have 33 files


#### Find the closest location match between the solar potential info and energy label classification info

__Approach__: based on [SciPy's cKDTree](https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.cKDTree.html), which I came across while reviewing the wonderful work presented in the Medium post [Teaching a neural network to see roads](https://towardsdatascience.com/teaching-a-neural-network-to-see-roads-74bff240c3e5) by Laura Lewis.

In a nutshell, cKDTree finds, for each point in dataset B, its closest neighbour(s) in dataset A; the parameter `k` sets how many nearest neighbours to return per point (k=1 returns only the closest one).

To apply this approach to our data we need to do some corrections such that we have 2 datasets (numpy arrays) with point coordinates. The point coordinates should be the ones describing the Latitude and Longitude from the Energy Label Classification dataset and a similar point (centroid - we need to calculate it) for the polygons we have in the Solar Potential dataset.

The corrections we need to make are:
- Energy Label Classification: reverse the order of the original coordinates from (Long, Lat) to (Lat, Long)
- Solar Potential:
    - correct the polygon information such that it conforms to the Linear Ring that Shapely expects when attempting to compute the centroid of a polygon
    - calculate the centroid

In [5]:
# Merge each solar potential file with the energy label data in parallel,
# one worker process per CPU core.
#
# Each job is a single tuple (solar file, energy-class file, output dir),
# which is the one positional argument pool.map hands to
# combine_solar_w_energyclass. The helper presumably writes its result into
# combined_out_dir -- NOTE(review): confirm against utils.
#
# The pool is used as a context manager so the worker processes are shut
# down once the (blocking) map call has returned, instead of lingering in
# the kernel for the rest of the session.
with mp.Pool(mp.cpu_count()) as pool:
    print(pool)

    results = pool.map(
        combine_solar_w_energyclass,
        [(solar_path, energy_cls_f, combined_out_dir) for solar_path in solar_f],
    )

2019-08-30 05:00:13.966657
2019-08-30 05:00:13.966816
2019-08-30 05:00:13.967004
2019-08-30 05:00:13.967978
<multiprocessing.pool.Pool object at 0x7f50a8a0c6a0>
[2019-08-30 05:02:03.475906] Done merging solar and energy class.
[2019-08-30 05:02:04.734152] Done merging solar and energy class.
[2019-08-30 05:02:07.635875] Done merging solar and energy class.
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geometry'], dtype='object')
[2019-08-30 05:02:35.387157] Done merging solar and energy class.
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geometry'], dtype='object')
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geometry'], dtype='object')
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geometry'], dtype='object')
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geometry'], dtype='object')
Selecting the pair with the minimum distance.         Current total rows: 399997
Index(['pv_prop', 'geometry_sol', 'distance', 'en_prop', 'geomet

#### Refining the closest match

For each group of rows sharing the same address, keep the solar potential record with the minimum distance.

Back up the results to a file for the actual analysis.

In [None]:
# --- Stage 1: per file, keep only the closest match for every geometry -----
mydir = os.path.join(os.getcwd(), 'data', 'out')
clean_dir = os.path.join(os.getcwd(), 'data', 'out_clean')
consolidated_name = 'consolidated_enclass_solar.csv'

# The cleaned files go to a sibling directory; create it up front so the
# first to_csv call does not fail on a fresh checkout.
os.makedirs(clean_dir, exist_ok=True)

files = sorted(
    os.path.join(mydir, f) for f in os.listdir(mydir)
    if os.path.isfile(os.path.join(mydir, f))
)
print('We have {} files'.format(len(files)))

for src in files:
    df = pd.read_csv(src, sep=';')
    # Minimum distance per geometry ...
    gr_min_dist = df.groupby(['geometry']).agg({'distance': 'min'}).reset_index()
    # ... joined back on (geometry, distance) to recover the full rows.
    # Ties on the minimum distance keep every tied row.
    gr_best_match = pd.merge(gr_min_dist, df, on=gr_min_dist.columns.tolist(), how='left')
    # Build the target path from the base name rather than by string
    # replacement, so it does not depend on the path separator or on the
    # working directory's own path containing '/out/'.
    filename = os.path.join(clean_dir, os.path.basename(src))
    gr_best_match.to_csv(filename, sep=';', index=False)
    print('[{}] {}'.format(datetime.now(), filename))

# --- Stage 2: consolidate all cleaned files into a single CSV --------------
mydir = clean_dir
# Skip the consolidated file itself: it lives in the same directory, so a
# re-run would otherwise read its own previous output back in. Sorting makes
# the row order of the result deterministic.
files = [
    os.path.join(mydir, f) for f in sorted(os.listdir(mydir))
    if f != consolidated_name and os.path.isfile(os.path.join(mydir, f))
]

dfs = [pd.read_csv(f, sep=';') for f in files]

consolidated_df = pd.concat(dfs)
consolidated_df.drop_duplicates(inplace=True)
# The index column is still written, matching the existing output format.
consolidated_df.to_csv(os.path.join(clean_dir, consolidated_name), sep=';')