In [4]:
import pandas as pd

import os, sys

def test_dir(dir):
    if not(os.path.isdir(dir)):
        os.mkdir(dir)
    return os.path.isdir(dir)

In [2]:
os.listdir('..//data//raw')

['.gitkeep', 'noaa_global', 'tech_challenge', 'temp_change', 'wbpy']

## Basic data from tech challenge

- [Dados da Vitivinicultura](http://vitibrasil.cnpuv.embrapa.br/index.php?opcao=opt_01) was indicated as main source of data;

### raw > interim

In [3]:
# Ajustando dataframe das exportações

#0. Create folder
test_dir('..//data//interim//tech_challenge//')

df_exp = pd.read_csv('..//data//raw//tech_challenge//ExpVinho.csv', sep=';')
df_exp = df_exp.melt(id_vars=['Id','País'])

vars = df_exp.variable.unique()
valor = [x for x in vars if x.endswith('.1')]
quantidade = list(set(vars) - set(valor))

df_quant = df_exp.loc[df_exp['variable'].isin(quantidade)]\
    .rename(columns={
          'value':'quantidade_exportada_pais'
        , 'variable':'ano'
        , 'País':'pais'
        , 'Id':'id'
        })

df_quant['ano'] = df_quant['ano'].astype(int)

df_value = df_exp.loc[df_exp['variable'].isin(valor)]\
    .rename(columns={
          'value':'valor_exportado_pais'
        , 'variable':'ano'
        , 'País':'pais'
        , 'Id':'id'
        })
df_value['ano'] = [int(x.replace('.1', '')) for x in df_value['ano']]

df_exp = pd.merge(df_quant, df_value, on=['id','pais','ano'])[['id','pais','ano','quantidade_exportada_pais','valor_exportado_pais']]\
    .drop(columns=['id'])


df_exp.to_csv('..//data//interim//tech_challenge//exportacao_vinhos.csv',index=False, sep=';', decimal=',') # export data to share with the project group members


In [4]:
# Ajustando df das comercializações no RS

#0. Create folder
test_dir('..//data//interim//tech_challenge//')

df_com = pd.read_csv('..//data//raw//tech_challenge//Comercio.csv', sep=';', header=None)
lista_anos = list(df_exp['ano'].unique())
df_com.columns = ['id','id_produto','produto'] + lista_anos

df_com = df_com\
    .melt(id_vars=['id','id_produto','produto'])\
    .rename(columns={
          'variable':'ano'
        , 'value':'quantidade_com_rs'
    })\
    .drop(columns=['id'])

df_com['id_produto'] = [str(x).strip().lower() for x in df_com['id_produto']]
df_com['produto'] = [str(x).strip().lower() for x in df_com['produto']]
df_com['ano'] = df_com['ano'].astype(int)

df_com.to_csv('..//data//interim//tech_challenge//comercio_vinhos_rs.csv',index=False, sep=';', decimal=',') # export data to share with the project group members

In [5]:
# Ajustando df das produções no RS

#0. Create folder
test_dir('..//data//interim//tech_challenge//')

df_prod = pd.read_csv('..//data//raw//tech_challenge//Producao.csv', sep=';', header=None)
lista_anos = list(df_exp['ano'].unique())
df_prod.columns = ['id','id_produto','produto'] + lista_anos

df_prod = df_prod\
    .melt(id_vars=['id','id_produto','produto'])\
    .rename(columns={
          'variable':'ano'
        , 'value':'quantidade_prod_rs'
    })\
    .drop(columns=['id'])

df_prod['id_produto'] = [str(x).strip().lower() for x in df_prod['id_produto']]
df_prod['produto'] = [str(x).strip().lower() for x in df_prod['produto']]
df_prod['ano'] = df_prod['ano'].astype(int)

df_prod.to_csv('..//data//interim//tech_challenge//producao_vinhos_rs.csv',index=False, sep=';', decimal=',') # export data to share with the project group members

### interim > processed

In [6]:
# interim to processed

#0. Create folder
test_dir('..//data//processed//tech_challenge//')

df_exp = pd.read_csv('..//data//interim//tech_challenge//exportacao_vinhos.csv', sep=';', decimal=',')
df_prod = pd.read_csv('..//data//interim//tech_challenge//producao_vinhos_rs.csv', sep=';', decimal=',')
df_com = pd.read_csv('..//data//interim//tech_challenge//comercio_vinhos_rs.csv', sep=';', decimal=',')

df_prod_com = df_prod.merge(df_com, on=['id_produto','produto','ano'], how='outer')

df_final = df_exp.merge(df_prod_com, on='ano', how='outer')

df_final.to_csv('..//data//processed//tech_challenge//df_vinhos.csv',index=False, sep=';', decimal=',') # export data to share with the project group members

## [Temperature Change over years](https://www.kaggle.com/datasets/sevgisarac/temperature-change)

### raw > processed


In [7]:
# Ajustar output da mudança de temperatura

#0. Create folder
test_dir('..//data//processed//temp_change//')

df = pd.read_csv("..//data//raw//temp_change//Environment_Temperature_change_E_All_Data_NOFLAG.csv", encoding='latin-1') # csv file is encoding as latin-1 type
df_countrycode=pd.read_csv('..//data//raw//temp_change//FAOSTAT_data_11-24-2020.csv') #this csv file includes ISO-3 Country Code, this mentioned in Data Wrangling 

#1. Renaming
df.rename(columns = {'Area':'Country Name'},inplace = True)
df.set_index('Months', inplace=True)
df.rename({'Dec\x96Jan\x96Feb': 'Winter', 'Mar\x96Apr\x96May': 'Spring', 'Jun\x96Jul\x96Aug':'Summer','Sep\x96Oct\x96Nov':'Fall'}, axis='index',inplace = True)
df.reset_index(inplace = True)

#2. Filtering 
df = df[df['Element'] == 'Temperature change']

#2. Drop unwanted columns from df_countrycode
df_countrycode.drop(['Country Code','M49 Code','ISO2 Code','Start Year','End Year'],axis=1,inplace=True)
df_countrycode.rename(columns = {'Country':'Country Name','ISO3 Code':'Country Code'},inplace=True)

#3. Merging with df to df_country
df = pd.merge(df, df_countrycode, how='outer', on='Country Name')

#2. Drop unwanted columns
df.drop(['Area Code','Months Code','Element Code','Element','Unit'],axis=1,inplace=True)

#3.Channing dataframe organization
df = df.melt(id_vars=["Country Code", "Country Name","Months",], var_name="year", value_name="tem_change")
df["year"] = [i.split("Y")[-1] for i in df.year]

df = df[df['Months']=='Meteorological year']# chose just year data
df.drop(['Months'],axis=1,inplace=True) # dropped Months column
df.to_csv('..//data//processed//temp_change//temperature_change_Data.csv',index=False, sep=';', decimal=',') # export data to share with the project group members

# [NOAA global data](https://www.kaggle.com/datasets/noaa/noaa-global-historical-climatology-network-daily)


### raw > interim

In [1]:
import numpy as np
import time
import os
import datetime
import pandas as pd
import multiprocessing as mp
import time

base_path = '..\\data\\raw\\noaa_global'
dest_path = '..\\data\\interim\\noaa_global'

In [9]:
test_dir(dest_path)

True

In [10]:
def process_year(file_year='2019'):
    base_path = '..\\data\\raw\\noaa_global'
    dest_path = '..\\data\\interim\\noaa_global'
    if not os.path.exists(f"{dest_path}\\{file_year}.csv"):

        start = datetime.datetime.now()

        raw_df = pd.read_csv(f"{base_path}\\ghcnd_all_years\\{file_year}.csv.gz",
                        usecols=[0,1,2,3], 
                        names=['station_id','date', 'stat', 'value'], 
                        dtype= {
                              'station_id' : str
                            , 'date': str
                            , 'stat': str
                            , 'value': np.int16
                            },
                        engine='c'
                        )
        raw_df['year'] = np.int16(file_year)
        grouped = raw_df.groupby(["station_id", 'year',"stat"]).mean().reset_index()
        grouped = grouped[['year', 'station_id', 'stat', 'value']]
        #return grouped
        grouped.to_csv(f"{dest_path}\\{file_year}.csv", index=False)
        duration = (datetime.datetime.now() - start).seconds
        print(f"{file_year} took {round(duration/60, 2)} minutes.")

In [11]:
years = [year[:4] for year in os.listdir(f"{base_path}\\ghcnd_all_years\\")]
years.sort()
level_of_parallelism = mp.cpu_count()
pool = mp.Pool(level_of_parallelism)
pool.map(process_year, years)

In [None]:
df_stations = pd.read_csv(base_path+'\\ghcnd-stations.txt', sep=';', decimal='.')
for col in ['id','name']:
    df_stations[col] = [x.strip() for x in df_stations[col]]
df_stations.to_csv(dest_path+'\\df_stations.csv',index=False, sep=';', decimal=',')

### interim > processed

In [2]:
import pandas as pd
import numpy as np
import os

In [6]:
base_path = '..\\data\\interim\\noaa_global\\'
dest_path = '..\\data\\processed\\noaa_global\\'

In [4]:
files = [base_path + x for x in os.listdir(base_path)]

In [5]:
df_noaa = pd.read_csv(base_path+"*",
                      dtype= {
                              'station_id' : str
                            , 'date': str
                            , 'stat': str
                            , 'value_min' : np.float64
                            , 'value_mean' : np.float64
                            , 'value_median' : np.float64
                            , 'value_max' : np.float64
                            },
                        engine='c'
                      )

OSError: [Errno 22] Invalid argument: '..\\data\\interim\\noaa_global\\*'

In [11]:
df_noaa

Unnamed: 0,year,station_id,stat,value
0,1763,ITE00100554,TMAX,147.873973
1,1763,ITE00100554,TMIN,100.657534


### interim > processed

In [7]:
import os, glob

base_path = '..\\data\\interim\\noaa_global\\'

def read_csv(args):
    return pd.read_csv(args, sep=';', decimal='.')

df_noaa = pd.concat(map(read_csv, glob.glob(base_path+'years\\*.csv')))

df_stations = pd.read_csv(base_path+'df_stations.csv', sep=';', decimal=',').rename(columns={'id':'station_id'})

df_noaa = df_noaa.merge(df_stations)

In [8]:
dest_path = '..\\data\\processed\\noaa_global\\'
df_noaa.to_csv(dest_path+'noaa_global.csv', index=False, sep=';', decimal=',')

# [WBPY Data](https://pypi.org/project/wbpy/)

- Was made from https://documents.worldbank.org/en/publication/documents-reports/api
- Indicators in http://api.worldbank.org/v2/indicator

### raw > processed

In [5]:
import os, glob
import pandas as pd

base_path = '..\\data\\raw\\wbpy\\'
dest_path = '..\\data\\processed\\wbpy\\'
test_dir(dest_path)
def read_csv(args):
    df = pd.read_csv(args, sep=',', decimal='.')
    df['metric'] = args.split('\\')[-1].split('.')[0]
    return df

df_wbpy = pd.concat(map(read_csv, glob.glob(base_path+'*.csv'))).rename(columns={'Unnamed: 0':'year'})
df_wbpy = df_wbpy.melt(id_vars=['year','metric']).rename(columns={'variable':'country'})
df_wbpy.to_csv(dest_path+'wbpy.csv', index=False, sep=';', decimal=',')

### interim > processed

In [3]:


import os
for root, dirs, files in os.walk('..\\data\\raw', topdown=False):
    for name in files:
        if not ('ghcnd-stations.txt' in name):
            os.remove(os.path.join(root, name))
    for name in dirs:
        try:
            os.rmdir(os.path.join(root, name))
        except:
            pass

# Inmet = RS

In [41]:
import os
import pandas as pd
import numpy as np
import joblib
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

base_path = r'..\\data\\processed'

df_clima_rs =  pd.read_csv(r'..\\data\\processed\\inmet\\rs.csv', sep=';', decimal=',')
df_clima_rs['data'] = pd.to_datetime(df_clima_rs['data'], format='mixed')

In [42]:
# -9999.0 was used as a replace for nan values, so we put nan in it again:
df_clima_rs['temp_min'] = df_clima_rs['temp_min'].replace(-9999.0, np.nan)
df_clima_rs['temp_max'] = df_clima_rs['temp_max'].replace(-9999.0, np.nan)
df_clima_rs['prec'] = df_clima_rs['prec'].replace(-9999.0, np.nan)

# first, calculate daily precipitation rating, and max-min of temp:
df_clima_rs = df_clima_rs.groupby('data').agg({
    'prec':'sum',
    'temp_max':('mean','max'),
    'temp_min':'min'
}).reset_index()

df_clima_rs.columns = ['data','PRCP','TAVG','TMAX','TMIN']

# now, we calculate the same metrics per year, recovering the mean for each variable in the year (same as noaa data):

df_clima_rs['year'] = df_clima_rs.data.dt.year

df_clima_rs = df_clima_rs.groupby('year').agg({
    'PRCP':'median',
    'TAVG':'mean',
    'TMAX':'mean',
    'TMIN':'mean'
}).reset_index()

df_clima_rs.to_csv(r'..\\data\\processed\\inmet\\rs_final.csv', sep=';', decimal=',')