In [None]:
# Import libraries and modules
import time
from tqdm import tqdm

import pandas as pd

import numpy as np
np.random.seed(123)  # for reproducibility

# Functions
def check_empty_array(array):
    """Substitute one all-zero row for an array that has columns but no rows.

    Parameters
    ----------
    array : numpy.ndarray
        Array with at least two dimensions; both ``shape[0]`` and ``shape[1]``
        are read unconditionally.

    Returns
    -------
    numpy.ndarray
        ``np.zeros((1, array.shape[1]))`` when ``array`` has zero rows and a
        non-zero number of columns; otherwise ``array`` itself, untouched.
    """
    n_rows, n_cols = array.shape[0], array.shape[1]
    if n_rows == 0 and n_cols != 0:
        return np.zeros((1, n_cols))
    return array

def select_array(array):
    """Return the history slice of the first row of ``array``.

    ``array`` is converted with ``np.array`` and passed through
    ``check_empty_array``, so a selection with columns but no rows yields a
    slice of zeros instead of failing on the row lookup.

    Parameters
    ----------
    array : array-like
        Anything ``np.array`` accepts; the converted array must be 2-D.

    Returns
    -------
    numpy.ndarray
        Elements ``[4:-1]`` of the first row.
    """
    rows = check_empty_array(np.array(array))
    first_row = rows[0]
    # Skip the 4 leading info columns; the last column is dropped because of
    # the one day shift.
    return first_row[4:-1]

def create_history_arrays(data, x, y, raster, dist):
    """Collect the history of every grid cell in the square neighbourhood of (x, y).

    For each offset ``(dx, dy)`` with ``-dist <= dx, dy <= dist`` the rows of
    ``data`` whose ``'x'`` / ``'y'`` columns equal ``x + dx*raster`` /
    ``y + dy*raster`` are selected and reduced by ``select_array`` (first
    matching row, columns ``[4:-1]``; zeros when nothing matches).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``'x'`` and ``'y'`` columns. ``data.shape[1] - 5`` is used
        as the history length.
    x, y : number
        Coordinates of the centre cell.
    raster : number
        Step between neighbouring cells, in the units of ``'x'`` / ``'y'``.
    dist : int
        Neighbourhood radius in cells; the window is ``2*dist + 1`` cells wide.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(data.shape[1] - 5, (2*dist + 1)**2)``. The
        column for offset ``(dx, dy)`` is ``(dx + dist)*(2*dist + 1) + (dy + dist)``.
        Values are cast to ``int`` when they are stored.
    """
    # length - 4 info columns + 1 column one day shift
    length = data.shape[1] - 5

    dim = 2 * dist + 1
    result_data = np.zeros((length, dim, dim)).astype(int)
    for dy in range(-dist, dist + 1):
        # The y-mask does not depend on dx, so build it once per dy.
        y_match = data['y'] == y + dy * raster
        for dx in range(-dist, dist + 1):
            # Query the frame that was passed in, not a notebook-level global,
            # so the function works for whichever frame the caller supplies.
            neighbour = data.loc[(data['x'] == x + dx * raster) & y_match]
            result_data[:, dx + dist, dy + dist] = select_array(neighbour)
    # Flatten each dim x dim window into one row of dim*dim features.
    return result_data.reshape(length, dim * dim)

def create_neighbours_images_array_pd(dataframe, raster, dist):
    """Build the long-format neighbourhood table as a pandas DataFrame.

    Every row of ``dataframe`` whose column 3 equals 1 is expanded into one
    output row per label in ``dataframe.columns[5:]``. Each output row holds
    the neighbourhood features from ``create_history_arrays`` divided by
    ``max_y`` (the largest value found from column 4 onwards), followed by the
    columns ``'id'`` (column 0), ``'x'`` (column 1), ``'y'`` (column 2),
    ``'date'`` (the column label) and ``'crimecount'`` (the row's value in
    that column).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; columns 0-3 are read as id, x, y and the selection flag,
        the remaining columns as the per-date values.
    raster : number
        Grid step forwarded to ``create_history_arrays``.
    dist : int
        Neighbourhood radius forwarded to ``create_history_arrays``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-cell tables in input order; an empty
        DataFrame when no row is flagged. The index is not reset, so it
        repeats for every cell.
    """
    rows = dataframe.shape[0]
    max_y = max(dataframe.iloc[:, 4:].max())
    # 4 info columns + 1 column one day shift: targets start at column 5.
    # The labels are the same for every cell, so look them up once.
    dates = dataframe.columns.values[5:]
    counter = 0
    pieces = []
    for i in tqdm(range(rows)):
        praha = dataframe.iloc[i, 3]
        if praha == 1:
            # Scalar positional reads: a tuple is not a supported .iloc column
            # indexer, and integer keys on a label-indexed Series are
            # deprecated.
            cell_x = dataframe.iloc[i, 1]
            cell_y = dataframe.iloc[i, 2]
            history = create_history_arrays(dataframe, cell_x, cell_y, raster, dist)
            neig_arrays = pd.DataFrame(history) / max_y
            neig_arrays['id'] = dataframe.iloc[i, 0]
            neig_arrays['x'] = cell_x
            neig_arrays['y'] = cell_y
            neig_arrays['date'] = dates
            neig_arrays['crimecount'] = dataframe.iloc[i, 5:].values
            # Collect the pieces and concatenate once at the end; growing a
            # frame with concat inside the loop copies all rows every time.
            pieces.append(neig_arrays)
            counter += 1

    print("praha == 1: %d" % counter)
    if not pieces:
        # pd.concat raises on an empty list; keep the empty-frame result.
        return pd.DataFrame()
    return pd.concat(pieces)

def create_neighbours_images_array_np(dataframe, raster, dist):
    """Build the long-format neighbourhood table as a single numpy array.

    Every row of ``dataframe`` whose column 3 equals 1 is expanded into one
    output row per label in ``dataframe.columns[5:]``. The columns of each
    output row are, in order: id (column 0), x (column 1), y (column 2), the
    column label, the row's value in that column, and the
    ``(2*dist + 1)**2`` neighbourhood features from ``create_history_arrays``
    divided by ``max_y`` (the largest value found from column 4 onwards).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; columns 0-3 are read as id, x, y and the selection flag,
        the remaining columns as the per-date values.
    raster : number
        Grid step forwarded to ``create_history_arrays``.
    dist : int
        Neighbourhood radius forwarded to ``create_history_arrays``.

    Returns
    -------
    numpy.ndarray
        Array with ``5 + (2*dist + 1)**2`` columns, per-cell blocks stacked in
        input order. Its dtype is whatever numpy infers when the id, label and
        feature columns are stacked (presumably ``object`` when the labels are
        strings - verify against the data). When no row is flagged an empty
        ``object`` array with zero rows and the same column count is returned.
    """
    rows = dataframe.shape[0]
    max_y = max(dataframe.iloc[:, 4:].max())
    # 4 info columns + 1 column one day shift: targets start at column 5, so
    # the labels line up one-to-one with the history rows.
    dates = dataframe.columns.values[5:]
    n_features = (2 * dist + 1) ** 2
    counter = 0
    blocks = []
    for i in tqdm(range(rows)):
        praha = dataframe.iloc[i, 3]
        if praha == 1:
            cell_x = dataframe.iloc[i, 1]
            cell_y = dataframe.iloc[i, 2]
            history = create_history_arrays(dataframe, cell_x, cell_y, raster, dist)
            # The history is an integer array, so an in-place "/=" cannot hold
            # the quotient; divide into a new float array instead.
            neig_arrays = np.true_divide(history, max_y)
            n_days = neig_arrays.shape[0]
            data_ids = np.array([dataframe.iloc[i, 0], cell_x, cell_y] * n_days).reshape(n_days, 3)
            data_ids = np.column_stack((data_ids, dates))
            data_ids = np.column_stack((data_ids, dataframe.iloc[i, 5:].values))
            # Collect the blocks and concatenate once at the end.
            blocks.append(np.column_stack((data_ids, neig_arrays)))
            counter += 1

    print("praha == 1: %d" % counter)
    if not blocks:
        return np.empty((0, 5 + n_features), dtype=object)
    return np.concatenate(blocks, axis=0)


In [None]:
# Load data
# Parameters that select the input file. They are reused further down when the
# output file names are built, and raster is also passed to the
# neighbour-building functions as the grid step.
raster = 200
crime_type = "E05"
date_from = "2013-06-16"
date_to = "2017-03-08"

# insert main dir
# NOTE: while main_dir is left empty, the formatted paths start with "/data/..."
# and "/output/...", i.e. they are resolved from the filesystem root.
main_dir = ""

# Despite its name, data_dir holds the full path of the CSV file, not a directory.
data_dir = "%s/data/%s/timeseries_%d_%s_%s_%s_select.csv" % (main_dir, crime_type, raster,crime_type, date_from, date_to)
dataframe = pd.read_csv(data_dir)

# Last expression of the cell: displays the first rows as the cell output.
dataframe.head()

In [None]:
# Preprocess and save data new format, sorted
# For each neighbourhood radius: build the table, sort it by all feature
# columns and the target, then write it as CSV and as a .npy array.
for dist in range(5,6):
    print(dist)
    result_data = create_neighbours_images_array_pd(dataframe, raster, dist)

    # One feature column per neighbourhood cell, labelled 0..n-1. Derive the
    # count from dist so the sort keys stay correct if the range above changes.
    columns_i = list(range((2 * dist + 1) ** 2))
    columns_i.append('crimecount')
    result_data = result_data.sort_values(columns_i)

    save_dir = "%s/output/python/neighbour_arrays_%d_%s_%s_%s_x_%d_s_select.csv" % (main_dir, raster,crime_type, date_from, date_to, dist)
    result_data.to_csv(save_dir, sep=",", index = False)

    save_dir = "%s/output/python/neighbour_arrays_%d_%s_%s_%s_x_%d_np_s_select.npy" % (main_dir, raster,crime_type, date_from, date_to, dist)
    # DataFrame.as_matrix() no longer exists in current pandas; .values
    # returns the same ndarray and is available in old and new versions.
    np.save(save_dir, result_data.values)

    print("end")
    # Free the table before the next iteration builds a new one.
    del result_data

In [None]:
# Preprocess and save data old format 3D
# For each neighbourhood radius: build the table as a numpy array and save it
# unsorted as a .npy file.
for dist in range(5,6):
    print(dist)
    result_data = create_neighbours_images_array_np(dataframe, raster, dist)

    # Feature column names as strings plus the target. Not used when saving
    # below; kept because later cells may read columns_i (not visible here).
    # The count follows dist instead of the fixed 121 (= 11 * 11 for dist 5).
    columns_i = [str(i) for i in range((2 * dist + 1) ** 2)]
    columns_i.append('crimecount')

    # Build the output path from main_dir like the other save paths of this
    # notebook, instead of a machine-specific absolute path.
    save_dir = "%s/output/python/neighbour_arrays_%d_%s_%s_%s_x_%d_np.npy" % (main_dir, raster,crime_type, date_from, date_to, dist)
    np.save(save_dir, result_data)

    # Free the array before the next iteration builds a new one.
    del result_data