# Temperature time-series data preprocessing

Transform the temperature CSV data into a JSON file of yearly average temperatures, per country and for the world.

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import folium

%matplotlib inline

## Per country

In [15]:
# Folder holding the temperature CSV files. Paths are built by plain string
# concatenation, so the trailing slash is required.
BASE_FOLDER_TEMP = '../data/climate-change-earth-surface-temperature-data/'
# Per-country temperature records.
FILENAME = 'GlobalLandTemperaturesByCountry.csv'
df = pd.read_csv('{}{}'.format(BASE_FOLDER_TEMP, FILENAME))

In [16]:
# Preview the first five rows of the per-country frame.
df.head(n=5)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,Country
0,1743-11-01,4.384,2.294,Åland
1,1743-12-01,,,Åland
2,1744-01-01,,,Åland
3,1744-02-01,,,Åland
4,1744-03-01,,,Åland


Let's check how many countries have data on a given date.

In [17]:
# Non-null count of every column for the rows dated 1800-01-01.
date = '1800-01-01'
df.loc[df['dt'] == date].count()

dt                               76
AverageTemperature               65
AverageTemperatureUncertainty    65
Country                          76
dtype: int64

In [18]:
# Same check for 1850-01-01, to compare coverage with the 1800 snapshot.
date = '1850-01-01'
df.loc[df['dt'] == date].count()

dt                               192
AverageTemperature               185
AverageTemperatureUncertainty    185
Country                          192
dtype: int64

Let's keep only the data from 1850 onwards: before that, too many countries are missing.

In [19]:
# Keep the records from `date` onwards and average the monthly temperatures
# into one value per (Country, year).
date = '1850-01-01'

remain = (
    df[df.dt >= date]
    # The uncertainty column is not needed for the export.
    .drop(['AverageTemperatureUncertainty'], axis=1)
    # `dt` is a 'YYYY-MM-DD' string; the year is its first '-'-separated part
    # and stays a string.
    .assign(year=lambda frame: frame['dt'].str.split('-').str[0])
    .drop(['dt'], axis=1)
    .groupby(['Country', 'year'])
    .mean()
)

In [20]:
# Preview the first five (Country, year) yearly means.
remain.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,AverageTemperature
Country,year,Unnamed: 2_level_1
Afghanistan,1850,13.326083
Afghanistan,1851,13.605667
Afghanistan,1852,13.541167
Afghanistan,1853,13.455833
Afghanistan,1854,13.60575


In [21]:
# Build, for every country, a list of yearly mean temperatures covering the
# years 1850..2013 (one entry per year, in order). Years with no value are
# stored as None so that they serialise to JSON `null`.

# Mapping (country, year-as-string) -> yearly mean. Missing means stay NaN
# here and are turned into None below, so no string sentinel is pushed into
# the numeric column.
dicto = remain['AverageTemperature'].to_dict()

# Countries in index order. Unlike going through a `set`, this order is the
# same on every run, which keeps the key order of the exported JSON stable.
all_country = remain.index.get_level_values(0).unique().tolist()

new_dico = {}
for country in all_country:
    temps = []
    for i in range(1850, 2014):
        # Index years are strings, hence the str() on the lookup key.
        value = dicto.get((country, str(i)))
        if value is None or pd.isnull(value):
            temps.append(None)
        else:
            temps.append(value)
    new_dico[country] = temps

## Global

In [22]:
# Switch to the global temperature records. This rebinds both `FILENAME` and
# `df`, replacing the per-country data loaded earlier.
FILENAME = 'GlobalTemperatures.csv'
df = pd.read_csv('{}{}'.format(BASE_FOLDER_TEMP, FILENAME))

In [23]:
# Preview the first five rows of the global frame.
df.head(n=5)

Unnamed: 0,dt,LandAverageTemperature,LandAverageTemperatureUncertainty,LandMaxTemperature,LandMaxTemperatureUncertainty,LandMinTemperature,LandMinTemperatureUncertainty,LandAndOceanAverageTemperature,LandAndOceanAverageTemperatureUncertainty
0,1750-01-01,3.034,3.574,,,,,,
1,1750-02-01,3.083,3.702,,,,,,
2,1750-03-01,5.626,3.076,,,,,,
3,1750-04-01,8.49,2.451,,,,,,
4,1750-05-01,11.573,2.072,,,,,,


In [24]:
# Average the global monthly land temperature into one value per year, from
# `date` onwards, and store the series under the 'World' key.
date = '1850-01-01'

# Select rows and columns in a single .loc call and add `year` with .assign():
# this works on a fresh frame rather than assigning a column onto a
# column-subset of another frame (the pattern that raises pandas'
# SettingWithCopyWarning).
world_yearly = (
    df.loc[df.dt >= date, ['dt', 'LandAverageTemperature']]
    # `dt` is a 'YYYY-MM-DD' string; the year is its first '-'-separated part.
    .assign(year=lambda frame: frame['dt'].str.split('-').str[0])
    .groupby('year')['LandAverageTemperature']
    .mean()
)

# Keep `remain` bound to the grouped frame (index: year, one column).
remain = world_yearly.to_frame()

# A NaN mean would be written by json.dump as a bare `NaN` token, which is not
# valid JSON; use None (JSON `null`) instead.
# NOTE(review): this list has one entry per year present in the CSV. If the
# per-country lists are built for a fixed year window, the lengths may
# differ - confirm the consumer of the JSON copes with that.
new_dico['World'] = [
    None if pd.isnull(temp) else temp
    for temp in world_yearly.tolist()
]

In [25]:
import json
import os

# Destination of the exported time series.
OUTNAME = 'temp_time_series.json'
OUTPATH = '../app/data'

# Join folder and file name with a path separator: plain concatenation would
# produce '../app/datatemp_time_series.json', a file next to the data folder
# instead of inside it.
# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# is pinned to UTF-8 instead of depending on the platform default.
with open(os.path.join(OUTPATH, OUTNAME), 'w', encoding='utf-8') as f:
    json.dump(new_dico, f, ensure_ascii=False)