In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

## Modify Data

### Create Merged .csv File with Data from All Cities

In [None]:
# setup global variables - data file directory and name
# folder that holds the original data files
DATA_FILES_DIR = 'data'
# merged, gzip-compressed file written by create_data_file()
DATA_FILE_NAME = 'data.csv.gz'

# os.listdir returns entries in arbitrary order; sort them so the files are
# always merged in the same order, whatever the file system
all_data_files = sorted(os.listdir(DATA_FILES_DIR))

def create_data_file():
    """Merge the original data files into one gzip-compressed CSV file.

    Reads every file named in ``all_data_files`` from ``DATA_FILES_DIR``,
    concatenates them into a single DataFrame, cleans the ``city`` column,
    sorts the rows by ``date_time`` and ``city`` and writes the result to
    ``DATA_FILE_NAME``.

    Raises:
        ValueError: if ``DATA_FILES_DIR`` contains no readable data file.
    """
    # build explicit paths instead of chdir-ing into the data folder, so the
    # kernel's working directory is never left changed when a read fails
    data_paths = [os.path.join(DATA_FILES_DIR, name) for name in all_data_files]
    # sub-directories cannot be parsed by pd.read_csv - skip them
    data_paths = [path for path in data_paths if os.path.isfile(path)]
    if not data_paths:
        raise ValueError(f'No data files found in {DATA_FILES_DIR!r}')

    # read each original data file and concatenate it to a single df
    df = pd.concat(map(pd.read_csv, data_paths), ignore_index=True)

    # remove some patterns from city column; both patterns are literal text,
    # so regex matching is switched off ('+' is a regex quantifier)
    df['city'] = df['city'].str.replace(',Croatia', '', regex=False)
    df['city'] = df['city'].str.replace('+', ' ', regex=False)

    # sort data by datetime and city and save it to .csv file
    df = df.sort_values(by=['date_time', 'city'])
    df.to_csv(DATA_FILE_NAME, index=False, compression='gzip')
    print('Data processed successfully')

# the merged file is built only when it is missing; otherwise it is reused
if os.path.exists(DATA_FILE_NAME):
    print('Data has already been processed')
else:
    print('Creating data file')
    create_data_file()

## Import Data & Data Info

In [None]:
# import data: read the merged, gzip-compressed CSV file into a DataFrame
df_data = pd.read_csv(DATA_FILE_NAME, compression='gzip')
# print a summary of the loaded frame (columns, dtypes, non-null counts)
df_data.info()

In [None]:
# preview the first five rows of the loaded data
df_data.head(5)

# Correlation

In [None]:
# global variables
# folder that receives every correlation plot saved by this notebook
CORRELATION_DIR = 'correlation_plots'

# create the output directory if it does not exist yet
if not os.path.exists(CORRELATION_DIR):
    print(f'Creating folder {CORRELATION_DIR}')
    os.mkdir(CORRELATION_DIR)

# to always have the newest plot versions, delete file before creating new one
def remove_file_if_exists(file_path):
    """Delete ``file_path`` when it is present; do nothing otherwise.

    Args:
        file_path: path of the file to remove.
    """
    if not os.path.exists(file_path):
        return
    os.remove(file_path)

In [None]:
# function to calculate correlation matrix values
def create_correlation_matrix(data, towns, field):
    """Compute the pairwise Pearson correlation of ``field`` between towns.

    The values of each town are standardised (zero mean, unit population
    standard deviation); the correlation of two towns is then the mean of
    the element-wise product of their standardised series.

    Args:
        data: DataFrame with a ``city`` column and a ``field`` column.
        towns: town names to compare; row/column ``i`` of the result
            belongs to ``towns[i]``.
        field: name of the numeric column to correlate.

    Returns:
        Symmetric ``len(towns) x len(towns)`` NumPy array with 1.0 on the
        diagonal. A town whose values are all equal has a zero standard
        deviation, so its off-diagonal entries are NaN.

    Raises:
        ValueError: if two compared towns have no observations or a
            different number of observations.
    """
    towns = list(towns)
    towns_cnt = len(towns)

    # standardise every town's series exactly once instead of re-filtering
    # and re-normalising the frame for every pair of towns
    standardized = []
    for town in towns:
        values = data.loc[data['city'] == town, field].to_numpy(dtype=float)
        standardized.append((values - values.mean()) / values.std())

    # correlation 1 on the diagonal; every off-diagonal entry is filled below
    ret_matrix = np.eye(towns_cnt)

    # the matrix is symmetric, so only pairs with j > i are computed
    for i in range(towns_cnt):
        first = standardized[i]
        for j in range(i + 1, towns_cnt):
            second = standardized[j]
            # values are paired by position, which is only meaningful when
            # both towns have the same number of observations
            # NOTE(review): this also presumes each town's rows are in the
            # same (e.g. chronological) order - confirm against the data file
            if len(first) == 0 or len(first) != len(second):
                raise ValueError(
                    f'Cannot correlate {towns[i]!r} ({len(first)} values) with '
                    f'{towns[j]!r} ({len(second)} values): both towns need the '
                    f'same, non-zero number of observations'
                )
            ret_matrix[i, j] = np.dot(first, second) / len(first)
            ret_matrix[j, i] = ret_matrix[i, j]

    return ret_matrix

In [None]:
# alphabetically sorted town names; position i is row/column i of corr_matrix
unique_towns = sorted(df_data['city'].unique())

CORRELATION_COLUMN = 'tempC'  # choose which column will be used for analysis
CORRELATION_DATA_FILENAME = f'{CORRELATION_COLUMN}_correlation_data.npy'

# check if we already have correlation matrix saved
corr_matrix = None
if os.path.exists(CORRELATION_DATA_FILENAME):
    print('Correlation file exists!')
    corr_matrix = np.load(CORRELATION_DATA_FILENAME)
    # a cache written for a different set of towns has the wrong shape and
    # would mislabel every plot below - discard it and recompute
    if corr_matrix.shape != (len(unique_towns), len(unique_towns)):
        print('Correlation file does not match the current towns.. Recreating it...')
        corr_matrix = None
else:
    print('Correlation file does not exist.. Creating one...')

if corr_matrix is None:
    corr_matrix = create_correlation_matrix(data=df_data, towns=unique_towns, field=CORRELATION_COLUMN)
    np.save(CORRELATION_DATA_FILENAME, corr_matrix)

In [None]:
# plot correlation matrix as an image and save it as a PNG file
CORRELATION_MATRIX_FILENAME = f'{CORRELATION_DIR}/{CORRELATION_COLUMN}_correlation_matrix.png'
# delete the image of a previous run so the saved file always comes from this run
remove_file_if_exists(CORRELATION_MATRIX_FILENAME)
# both axes are labelled with the town names
fig = px.imshow(corr_matrix, x=unique_towns, y=unique_towns,
               width=1300, height=1300)
# title_x=0.5 centres the title horizontally
fig.update_layout(title_text='Correlation Matrix', title_x=0.5)
fig.write_image(CORRELATION_MATRIX_FILENAME)
fig.show()

In [None]:
# function to plot correlation bar chart
def plot_town_bar_chart(cor, towns, field, show_town='Rijeka'):
    """Save one correlation bar chart per town and display one of them.

    For every town, a bar chart of its correlation with each other town is
    written to ``{CORRELATION_DIR}/{field}_{town}_correlation_chart.png``;
    an existing file with that name is removed first.

    Args:
        cor: square correlation matrix; row ``i`` belongs to ``towns[i]``.
        towns: town names in the same order as the rows/columns of ``cor``.
        field: name of the analysed column, used in the output file names.
        show_town: town whose chart is also displayed inline. Defaults to
            'Rijeka', the town that used to be hard-coded; pass ``None`` to
            display no chart.
    """
    for i, town in enumerate(towns):
        chart_path = f'{CORRELATION_DIR}/{field}_{town}_correlation_chart.png'
        remove_file_if_exists(chart_path)

        # drop the town itself from labels and values; both are removed by
        # position so they stay aligned with each other
        curr_towns = [name for position, name in enumerate(towns) if position != i]
        curr_values = np.delete(cor[i], i)

        curr_df = pd.DataFrame({'CITY': curr_towns, 'VALUES': curr_values})
        fig = px.bar(curr_df, x='CITY', y='VALUES',
                     hover_name='CITY', width=1000, height=500)
        fig.update_layout(title_text=f'Correlation - {town}', title_x=0.5)
        fig.update_xaxes(tickangle=90, tickmode='linear', title='')
        # NOTE(review): the y axis is fixed to [0.5, 1.01], so correlations
        # below 0.5 fall outside the visible range - confirm this is intended
        fig.update_yaxes(title='%', range=[0.5, 1.01])
        fig.write_image(chart_path)

        if town == show_town:
            fig.show()

In [None]:
# call function for creating bar charts for each town:
# one PNG per town is saved, built from the rows of corr_matrix
plot_town_bar_chart(corr_matrix, unique_towns, CORRELATION_COLUMN)

## Correlation Map

In [None]:
def create_graph(corr_matrix, unique_towns, unique_towns_index_sorted, threshold=0.95):
    """Build an adjacency mapping between towns from the correlation matrix.

    Towns are visited in the order given by ``unique_towns_index_sorted``.
    Each town is compared once with every town that comes later in that
    order, and the later town is recorded as a neighbour when their
    correlation is below ``threshold``.

    NOTE(review): an edge is added when the correlation is *below* the
    threshold - confirm that linking the less correlated towns is intended.

    Args:
        corr_matrix: square correlation matrix indexed as ``[i][j]``, where
            ``i`` and ``j`` are positions in ``unique_towns``.
        unique_towns: town names; position ``i`` matches row/column ``i``.
        unique_towns_index_sorted: positions into ``unique_towns`` in the
            order in which the towns should be visited.
        threshold: correlation limit below which two towns are linked.
            Defaults to 0.95, the value that used to be hard-coded.

    Returns:
        dict mapping every visited town name to the list of later towns
        linked to it (possibly empty).
    """
    # use the argument, not a global of a similar name, so the function works
    # no matter what has been defined elsewhere in the notebook
    order = list(unique_towns_index_sorted)

    ret_graph = {}
    for position, i in enumerate(order):
        town1 = unique_towns[i]
        ret_graph[town1] = []
        # the matrix is symmetric, so each pair is checked once: only towns
        # after the current *position* in the visiting order are considered
        # (slicing by the town index i would skip or repeat pairs)
        for j in order[position + 1:]:
            town2 = unique_towns[j]
            if corr_matrix[i][j] < threshold:
                ret_graph[town1].append(town2)

    return ret_graph

# plot map with values from SVD_V (towns to concept)
def plot_svd_map(unique_towns, vector, k, data_geo):
    """Plot one value per town on a Mapbox scatter map and save it as PNG.

    The image is written to
    ``{CORRELATION_DIR}/{CORRELATION_COLUMN}_correlation_map.png``; an
    existing file with that name is removed first. The Mapbox access token
    is read from the file ``.mapbox_token`` in the working directory.

    Args:
        unique_towns: not used by this function; kept so that existing
            calls keep working.
        vector: values used to colour the points - presumably one value per
            row of ``data_geo``, in the same order (verify against caller).
        k: not used by this function; kept so that existing calls keep
            working.
        data_geo: DataFrame with ``LAT``, ``LNG`` and ``CITY`` columns. It is
            left unmodified.
    """
    CORR_MAP_FILENAME = f'{CORRELATION_DIR}/{CORRELATION_COLUMN}_correlation_map.png'
    remove_file_if_exists(CORR_MAP_FILENAME)

    # plot from a copy so the caller's frame does not silently gain a
    # VALUES column
    map_data = data_geo.assign(VALUES=vector)

    # read the token inside a context manager so the file handle is closed,
    # and strip surrounding whitespace such as a trailing newline
    with open(".mapbox_token") as token_file:
        px.set_mapbox_access_token(token_file.read().strip())

    fig = px.scatter_mapbox(map_data, lat="LAT", lon="LNG",
                            color="VALUES", hover_name="CITY",
                            color_continuous_scale=px.colors.cyclical.Phase)
    fig.write_image(CORR_MAP_FILENAME)
    fig.show()

In [None]:
# load the town coordinates and order the rows by ascending LNG, then LAT
GEO_POSITION_FILENAME = 'geo_position.csv'
df_geo_position = pd.read_csv(GEO_POSITION_FILENAME).sort_values(by=['LNG', 'LAT'])

# the row labels, in their sorted order, give the order in which towns are visited
# NOTE(review): the labels are used as positions in unique_towns, which presumes
# geo_position.csv lists the towns in the same order as unique_towns - confirm
unique_towns_index_sort = list(df_geo_position.index)
corr_graph = create_graph(
    corr_matrix,
    unique_towns,
    unique_towns_index_sort,
)


# Export to HTML

In [None]:
# save notebook before nbconvert
import IPython

In [None]:
%%javascript
// ask the notebook front end to save the notebook, so the export that
// follows works on the latest state
// NOTE(review): IPython.notebook is presumably only defined in the classic
// Notebook front end - confirm it exists in the front end you use
IPython.notebook.save_notebook()

In [None]:
# export notebook results to HTML
# the notebook file name is hard-coded: this only works while the notebook is
# saved as correlation.ipynb in the current working directory
!jupyter nbconvert --to=HTML correlation.ipynb