## Global Historical Climatology Network Dataset
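Variables are stored in both rows and columns: the `element` column holds variable names (`tmax`, `tmin`) as row values, while the day of the month is spread across the columns `d1`–`d31`.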
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [2]:
import pandas as pd
import numpy as np
import datetime

In [3]:
# Load the raw weather CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../weather-raw.csv')

In [7]:
# Keep only the rows whose month number is at most 5.
first_five_months = df['month'] <= 5
df = df[first_five_months]

In [10]:
# Preview the first five rows of the wide-format table.
df.head(n=5)

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [11]:
# Extract the numeric part of the day column names (d1 -> '1', ..., d31 -> '31').
# The pattern is a raw string so that `\d` reaches the regex engine as the digit
# class instead of being an invalid Python string escape.
day_number = df.columns.str.extract(r"d(\d+)", expand=False)
# Column names that do not match the pattern come back as NaN; drop them and
# keep the remaining labels as a plain list.
day_number = list(day_number.dropna())

In [12]:
# Show the extracted day labels.
print(day_number)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31']


In [13]:
# Add the names of the first 4 columns to the front of the list.
# The guard makes the cell safe to re-run: without it, every extra run would
# prepend the four names again and leave the list longer than the number of
# columns in the frame.
id_columns = ['id','year','month','element']
if day_number[:len(id_columns)] != id_columns:
    day_number[0:0] = id_columns

In [14]:
# Replace the column names of the df: strip the leading "d" from the day
# columns (d1 -> '1', ..., d31 -> '31') and leave every other name untouched.
# Renaming by name rather than by position keeps the cell safe to re-run and
# does not depend on the identifier columns appearing in a particular order.
df = df.rename(
    columns=lambda name: (
        name[1:] if name.startswith('d') and name[1:].isdigit() else name
    )
)

In [15]:
# Inspect the column labels after the rename.
df.columns

Index(['id', 'year', 'month', 'element', '1', '2', '3', '4', '5', '6', '7',
       '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31'],
      dtype='object')

In [16]:
# Reshape from wide to long: each day column becomes a row, with the column
# label stored in "day" and the cell value stored in "Temp".
identifier_columns = ["id","year", "month","element"]
df_2 = df.melt(
    id_vars=identifier_columns,
    var_name="day",
    value_name="Temp",
)

In [17]:
# Preview the first five rows of the melted table.
df_2.head(n=5)

Unnamed: 0,id,year,month,element,day,Temp
0,MX17004,2010,1,tmax,1,
1,MX17004,2010,1,tmin,1,
2,MX17004,2010,2,tmax,1,
3,MX17004,2010,2,tmin,1,
4,MX17004,2010,3,tmax,1,


In [18]:
# Build a single date column from the year / month / day parts.
# errors='coerce' turns combinations that cannot be parsed as a date into NaT
# instead of raising.
date_parts = df_2[['year', 'month', 'day']]
df_2['date'] = pd.to_datetime(date_parts, errors='coerce')

In [19]:
# The separate date parts are redundant once "date" exists; remove them.
date_part_columns = ['year',"month","day"]
df_2 = df_2.drop(date_part_columns, axis="columns")

In [20]:
# Preview the first five rows after dropping the date-part columns.
df_2.head(n=5)

Unnamed: 0,id,element,Temp,date
0,MX17004,tmax,,2010-01-01
1,MX17004,tmin,,2010-01-01
2,MX17004,tmax,,2010-02-01
3,MX17004,tmin,,2010-02-01
4,MX17004,tmax,,2010-03-01


In [21]:
# Spread the "element" column back out so that each distinct element value
# (tmax, tmin) becomes its own column of "Temp" readings, indexed by station
# id and date.
df_2 = pd.pivot_table(
    df_2,
    index=["id","date"],
    columns="element",
    values="Temp",
)


In [22]:
# Preview the first five rows of the pivoted table.
df_2.head(n=5)

Unnamed: 0_level_0,element,tmax,tmin
id,date,Unnamed: 2_level_1,Unnamed: 3_level_1
MX17004,2010-01-01,,
MX17004,2010-01-02,,
MX17004,2010-01-03,,
MX17004,2010-01-04,,
MX17004,2010-01-05,,


In [23]:
# Move the (id, date) index levels back into ordinary columns.
df_2 = df_2.reset_index()

In [24]:
# Preview the first five rows after resetting the index.
df_2.head(n=5)

element,id,date,tmax,tmin
0,MX17004,2010-01-01,,
1,MX17004,2010-01-02,,
2,MX17004,2010-01-03,,
3,MX17004,2010-01-04,,
4,MX17004,2010-01-05,,
