<a href="https://colab.research.google.com/github/miriammazzeo95/BigData_and_Timeseries_in_Pyspark/blob/main/DataAnonymization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Set Up Pyspark, Imports and Functions**

In [None]:
# MOUNT GOOGLE DRIVE INSIDE THE COLAB VM UNDER /content/drive
# NOTE: THE NAME `drive` IS REBOUND TO A PyDrive CLIENT IN THE CONFIGURATION-IMPORT CELL BELOW
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# TO CHANGE CURRENT DIR
# %cd /content/drive/MyDrive/Colab\ Notebooks/Big\ Data\ with\ Pyspark

/content/drive/MyDrive/Colab Notebooks/Big Data with Pyspark


Import the Colab configuration script (PySpark set-up, imports and helper functions)

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# https://colab.research.google.com/drive/1YGZCoCGw632dFe_yxAViQYIsXnFKqEdA?usp=sharing
colb_script_id = '1YGZCoCGw632dFe_yxAViQYIsXnFKqEdA'
link_to_file_in_drive = drive.CreateFile({'id':colb_script_id})

link_to_file_in_drive.GetContentFile('Pyspark_ConfigurationImportsFunctions.ipynb') # creates a local copy in the VM
!jupyter nbconvert --to python 'Pyspark_ConfigurationImportsFunctions.ipynb' # converts the local copy from notebook to .py
!rm Pyspark_ConfigurationImportsFunctions.ipynb # deletes the local copy
import Pyspark_ConfigurationImportsFunctions as pyspark_config # imports everything from the script
dir(pyspark_config)

# **Anonymize data**

**Dictionary** 
> We want to aggregate data from multiple addresses at commune level

> We thus need to map each address to its corresponding power station, so that the data can be aggregated per power station

> Going through all the monthly meter data, we create a dictionary [Address --> StationID] of unique address/ID pairs


In [None]:
# WE CREATE A DICTIONARY [Address --> StationID], READING THE RAW DATA FROM APRIL 2021 ONWARDS
# THE STATION-ID COLUMN WAS FIRST INTRODUCED ON MARCH THE 3rd 2021

# WHERE THE DICTIONARY-DF IS SAVED
ADDRESS_STATION_PATH = '/feddl/work/miriam/ENERGYDATA/address_powerStation'

# POSITION OF APRIL 2021 IN month_paths
# NOTE(review): ASSUMES get_filePaths RETURNS THE MONTH FOLDERS IN CALENDAR ORDER — TODO CONFIRM
FIRST_MONTH_INDEX = 3


def read_address_station_pairs(path):
    """Read one month of raw data and return its unique address/station rows.

    Selects 'InstallationAdresse', 'PostNr' and 'PowerStationID', passes the
    frame through ``merge_cols`` (presumably joining address and post number,
    separated by a space, into 'InstallationAdresse' — confirm against the
    helper) and keeps one row per 'InstallationAdresse'.

    Parameters
    ----------
    path : str
        Parquet location of one month folder.

    Returns
    -------
    DataFrame
        The de-duplicated frame. When one address occurs with several station
        IDs, which row survives ``dropDuplicates`` is not controlled here.
    """
    month_df = spark.read.parquet(path).select('InstallationAdresse', 'PostNr', 'PowerStationID')
    month_df = merge_cols(month_df, 'InstallationAdresse', 'PostNr', 'InstallationAdresse', sep=' ')
    return month_df.dropDuplicates(['InstallationAdresse'])


# GET PATHS TO RAW DATA MONTH-FOLDERS IN THE YEAR 2021 IN HDFS DATA LAKE
month_paths = get_filePaths('hdfs dfs -ls /feddl/landing/ENERGYDATA/quarter_hourly/daily/year=2021/')

# INITIALISE THE DICTIONARY-DF WITH THE FIRST MONTH (APRIL 2021)
df_union = read_address_station_pairs(month_paths[FIRST_MONTH_INDEX])

# THE REMAINING MONTHS ARE MERGED IN ONE AT A TIME
remaining_paths = month_paths[FIRST_MONTH_INDEX + 1:]
print(len(remaining_paths))

for i, path in enumerate(remaining_paths, start=FIRST_MONTH_INDEX + 1):

    month_df = read_address_station_pairs(path)
    # FIX: THE MONTH WAS PREVIOUSLY UNIONED WITH ITSELF (df.unionAll(df)), WHICH DISCARDED
    # EVERYTHING ACCUMULATED SO FAR; THE NEW PAIRS MUST BE APPENDED TO df_union
    df_union = df_union.unionAll(month_df).dropDuplicates(['InstallationAdresse'])

    # PROGRESS
    print([i, path])


# SAVE THE DICTIONARY-DF (THE DEFAULT WRITE MODE IS USED, SO AN EXISTING TARGET IS NOT OVERWRITTEN)
df_union.write.parquet(ADDRESS_STATION_PATH)

# IF THERE ARE NULL VALUES IN THE ADDRESSES COLUMN
# df_union = df_union.where(col('InstallationAdresse').isNotNull() & (col('InstallationAdresse')!='\?'))

# IF THERE ARE NULL VALUES IN THE STATION-ID COLUMN -> FORWARD FILL

# TO COUNT THE TOTAL ADDRESS-ID PAIRS FOUND
# (THE FIGURES BELOW WERE RECORDED BEFORE THE UNION FIX ABOVE AND MAY NO LONGER MATCH)
# df_union.count() # 139936
# spark.read.parquet('/feddl/work/miriam/ENERGYDATA/address_powerStation').count() # 139936

# TAKEN AROUND 10 MIN



**Anonymization**
> In order to display the data we need to apply anonymization techniques

> Using the dictionary [Address --> StationID] created above, we map, in all the historical data, a column containing the power-station IDs

> The address column and the dictionary are then deleted to be GDPR compliant

> A future step would be to remap those IDs to randomly generated numbers



In [None]:
# SOME USEFUL FUNCTIONS

# CONVERT SPARK DF OF 2 COLUMNS TO DICTIONARY
def df_to_dict(path):
    """Load the parquet dictionary at `path` and return it as a Python dict.

    The 'PowerStationID' column is cast to int and the rows are sorted by
    'InstallationAdresse' (ascending) before being collected, so the returned
    dict maps each 'InstallationAdresse' value to its 'PowerStationID' in
    that order. All rows are collected, i.e. the whole table is brought back
    to the calling process.
    """
    station_id = col('PowerStationID').cast('int')
    by_address = col("InstallationAdresse").asc()

    # READ, CAST THE STRING COLUMN TO INTEGER, SORT ALPHABETICALLY, FETCH ALL ROWS
    rows = (
        spark.read.parquet(path)
        .withColumn("PowerStationID", station_id)
        .sort(by_address)
        .collect()
    )

    # FILL THE DICTIONARY ROW BY ROW
    dictionary = {}
    for row in rows:
        dictionary[row['InstallationAdresse']] = row['PowerStationID']
    return dictionary

# ADD A COLUMN WITH THE SAME VALUE
def add_commodity(df, t: str):
    """Return `df` with a 'Commodity' column set to the literal `t` on every row."""
    commodity_value = lit(t)
    return df.withColumn('Commodity', commodity_value)

# CONVERT DATE AND TIME COLUMNS TO 1 TIMESTAMP COLUMN
def get_timestamp(df):
    """Return `df` with a parsed 'TimeStamp' column.

    'Dato' and 'Hour' are first combined into 'TimeStamp' through
    ``merge_cols`` with a single-space separator; that column is then
    replaced by its ``to_timestamp`` parse using 'yyyy-MM-dd HH:mm:ss'.
    """
    merged = merge_cols(df, 'Dato', 'Hour', 'TimeStamp', sep=' ')
    parsed = to_timestamp(col("TimeStamp"), 'yyyy-MM-dd HH:mm:ss')
    return merged.withColumn('TimeStamp', parsed)

In [None]:
# LOAD THE DICTIONARY [Address --> StationID]
# FIX: THE PATH PREVIOUSLY POINTED TO '.../address_kabelskab', WHICH NOTHING IN THIS NOTEBOOK WRITES;
# THE DICTIONARY CELL SAVES TO '.../address_powerStation'
address_dictionary = df_to_dict('/feddl/work/miriam/ENERGYDATA/address_powerStation')

# NAME OF THE INPUT STAGE FOLDER; EVERYTHING AFTER IT IN A MONTH PATH IS REUSED FOR THE OUTPUT PATH
CLEANING_DIRNAME = '1_cleaning'

cleaning_monthsPaths = get_filePaths( 'hdfs dfs -ls /feddl/work/miriam/ENERGYDATA/1_cleaning/*/*/')
saving_folder = '/feddl/work/miriam/ENERGYDATA/2_anonymization'
test_folder = '/feddl/work/miriam/test/ENERGYDATA/2_anonymization'

num_paths = len(cleaning_monthsPaths)
#print(cleaning_monthsPaths)

# MEASUREMENT CODES PER COMMODITY (CODES COPIED VERBATIM, INCLUDING THE 'KWh' SPELLING OF 'P_E_KWh')
ELECTRICITY_MEASUREMENTS = ['EL_A_kWh', 'EL_E_kWh', 'P_A_kWh', 'P_E_KWh']
WATER_MEASUREMENTS = ['EL_Va_m3', 'W_M2v2_m3', 'W_M2v1_m3', 'W_cM2v1_m3', 'W_cM2v2_m3']
HEAT_MEASUREMENTS = ['EL_Fv_m3', 'H_M1v4_C', 'H_M3v4_m3']

for p, path in enumerate(cleaning_monthsPaths):

    # MONTH_PATH BEING PROCESSED
    print(path)

    # READ FILE
    df = spark.read.parquet(path)

    # MAP 1 COLUMN PowerStationID FROM THE ADDRESS DICTIONARY (THE COLUMN IS MISSING IN THE OLDER DATA)
    # NOTE(review): PRESUMABLY df_mapCol LOOKS UP COLUMN 'AdDress' IN THE DICTIONARY — CONFIRM AGAINST THE HELPER
    df = df_mapCol(df, address_dictionary, 'PowerStationID', 'AdDress')

    # DROP ROWS WHERE PowerStationID IS EMPTY
    # DONE BEFORE THE AGGREGATION: PowerStationID IS A GROUPING KEY, SO THE RESULT IS THE SAME
    # AND FEWER ROWS ARE SHUFFLED
    df = df.where((col('PowerStationID') != '?') & (col('PowerStationID').isNotNull()))

    # GET TIMESTAMP COLUMN
    df = get_timestamp(df)
    df = df.withColumn('day', col('day').cast(StringType()))

    # AGGREGATION OF Value ON PowerStationID - ANONYMIZATION
    # ONLY 'Value' IS SUMMED: A BARE .sum() WOULD ALSO SUM EVERY OTHER NUMERIC COLUMN
    df = df.groupBy('TimeStamp', 'day', 'Measurement', 'PowerStationID').sum('Value')
    df = df.withColumnRenamed("sum(Value)","Value")

    # ADD DAY COLUMN
    #df = df.withColumn('day', dayofmonth('Dato'))

    # GET COMMODITY TYPE: electricity, heat, water (NULL FOR ANY OTHER MEASUREMENT CODE)
    measurement = col('Measurement')
    df = df.withColumn('Commodity', when(measurement.isin(ELECTRICITY_MEASUREMENTS), 'electricity')
        .when(measurement.isin(WATER_MEASUREMENTS), 'water')
        .when(measurement.isin(HEAT_MEASUREMENTS), 'heat'))

    # CREATE SAVING PATH: REUSE THE PART OF THE INPUT PATH THAT FOLLOWS THE '1_cleaning' FOLDER
    # (PREVIOUSLY path.split('g')[-1], WHICH BREAKS AS SOON AS A LATER PATH COMPONENT CONTAINS A 'g')
    _, marker, chrono_folder = path.partition(CLEANING_DIRNAME)
    if not marker:
        raise ValueError('Unexpected input path (no %r component): %s' % (CLEANING_DIRNAME, path))
    saving_path = saving_folder + chrono_folder

    # SAVE MONTH BY DATE
    df.write.parquet(saving_path, partitionBy='day')

    # DEBUGGING
    print([p,'SAVED', saving_path])

  