## Prepare joined datasets for meteorology challenge

In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from datetime import datetime

import pickle
import os

In [5]:
# Folder that holds the challenge CSV files. Set the WEATHER_DATA_DIR
# environment variable to point at your own copy; when it is unset the
# original hard-coded location is used, so existing setups keep working.
working_folder = os.environ.get(
    'WEATHER_DATA_DIR',
    r'C:\Users\User\Documents\DMBI_hackathon_2018\the_weather_channel-master\the_weather_channel-master\kaggle_data',
)
# All later reads and writes use bare file names relative to this folder.
os.chdir(working_folder)

In [6]:
# Load every CSV from the current working directory. The ten frames keep
# their module-level names because the cells below refer to them.

# Train sets, one file per weather variable.
Tmax_train, Tmin_train, Wind_train, Humidity_train = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_maxTemp.csv',
        'train_minTemp.csv',
        'train_Wind.csv',
        'train_Humidity.csv',
    )
)

# Test sets, same layout of files as the train sets.
Tmax_test, Tmin_test, Wind_test, Humidity_test = (
    pd.read_csv(csv_name)
    for csv_name in (
        'test_maxTemp.csv',
        'test_minTemp.csv',
        'test_Wind.csv',
        'test_Humidity.csv',
    )
)

# Observed values (labels) for the train and test periods.
train_Tmax_obs, test_Tmax_obs = (
    pd.read_csv(csv_name)
    for csv_name in ('train_label.csv', 'test_label.csv')
)

In [7]:
# Join the four per-variable tables on the shared keys:
# Time, Validity date, City.
_join_keys = ['Time', 'Validity date', 'City']

# Final column names, assigned by position after the three merges.
# NOTE(review): the positional rename assumes the merged frame has exactly
# these 25 columns in this order - confirm against the CSV headers if the
# input files ever change.
_joined_columns = [
    'Time', 'Basis date_tmax', 'Validity date', 'City',
    'Persist. value_tmax', 'EC_tmax', 'CO_tmax', 'C3_tmax', 'OH_tmax',
    'Basis date_tmin', 'Persist. value_tmin', 'EC_tmin', 'CO_tmin',
    'C3_tmin', 'OH_tmin',
    'Basis date_wind', 'Persist. value_wind', 'EC_wind', 'CO_wind',
    'C3_wind', 'OH_wind',
    'Basis date_humidity', 'Persist. value_humidity', 'EC_humidity',
    'C3_humidity',
]

# Train: max temp + min temp, then wind, then humidity.
joined_train_1 = Tmax_train.merge(
    Tmin_train, on=_join_keys, suffixes=('_tmax', '_tmin')
)
joined_train_2 = joined_train_1.merge(
    Wind_train, on=_join_keys, suffixes=('', '_wind')
)
joined_train = joined_train_2.merge(
    Humidity_train, on=_join_keys, suffixes=('', '_humidity')
)
joined_train.columns = list(_joined_columns)

# Test: the same chain of merges, reusing one name for the intermediates.
joined_test = Tmax_test.merge(
    Tmin_test, on=_join_keys, suffixes=('_tmax', '_tmin')
)
joined_test = joined_test.merge(
    Wind_test, on=_join_keys, suffixes=('', '_wind')
)
joined_test = joined_test.merge(
    Humidity_test, on=_join_keys, suffixes=('', '_humidity')
)
joined_test.columns = list(_joined_columns)

In [8]:
# Casting string values to float (the model can fit only float values)

# casting 'Validity date' to datetime.timestamp
def castDates(df):
    """Replace the 'Validity date' strings of df with float timestamps.

    The column is overwritten in place on the frame that was passed in;
    nothing is returned. Each value is parsed with the '%d-%m-%y' format.
    The parsed datetime is naive, so ``timestamp()`` interprets it in the
    local timezone of the machine running the code.
    """
    def _to_timestamp(date_text):
        parsed = datetime.strptime(date_text, '%d-%m-%y')
        return parsed.timestamp()

    df['Validity date'] = df['Validity date'].apply(_to_timestamp)


def merge12(df):
    """Put the Time == 1 and Time == 2 rows of df side by side.

    Rows are paired when they share the same 'Validity date' and 'City'
    (inner join). Every other column appears twice, suffixed '_1' for the
    Time == 1 row and '_2' for the Time == 2 row. The two resulting Time
    columns are removed. The input frame is not modified.
    """
    time_column = df['Time']
    first_rows = df.loc[time_column == 1]
    second_rows = df.loc[time_column == 2]
    paired = first_rows.merge(
        second_rows, on=['Validity date', 'City'], suffixes=('_1', '_2')
    )
    return paired.drop(columns=['Time_1', 'Time_2'])


def drop_basis(df):
    """Return a copy of df without the columns whose name starts with 'Basis date'.

    The input frame is left untouched.
    """
    basis_columns = [
        name for name in df.columns if name.startswith('Basis date')
    ]
    return df.drop(columns=basis_columns)


def prepare_and_process_set(df):
    """Encode, convert and reshape one joined data set.

    Steps, in order:
      1. 'City' is replaced by the codes of the module-level ``leCity``
         encoder, which must already be fitted.
      2. 'Validity date' is converted to timestamps by ``castDates``.
      3. Time == 1 and Time == 2 rows are paired by ``merge12``.
      4. The 'Basis date*' columns are removed by ``drop_basis``.

    Steps 1 and 2 overwrite columns of the frame that was passed in, so
    calling this twice on the same frame is not safe. The returned frame
    is a new object.
    """
    df['City'] = leCity.transform(df['City'])
    castDates(df)
    return drop_basis(merge12(df))
    
    

# Fit the city encoder on the train cities only, then push both sets through
# the same preparation steps. The encoder must be fitted before the first
# call because prepare_and_process_set overwrites the 'City' column.
leCity = preprocessing.LabelEncoder().fit(joined_train['City'])
joined_train_12 = prepare_and_process_set(joined_train)
joined_test_12 = prepare_and_process_set(joined_test)
# joined_train['City'] = leCity.transform(joined_train['City'])
# castDates(joined_train)

# joined_test['City'] = leCity.transform(joined_test['City'])
# castDates(joined_test)



In [9]:
# Persist both prepared frames as pickle files in the current working folder.
for _frame, _pickle_name in (
    (joined_train_12, 'joined_train_12_df.pickle'),
    (joined_test_12, 'joined_test_12_df.pickle'),
):
    _frame.to_pickle(_pickle_name)

### Create timeseries

In [11]:
def merge_n_days(df, n):
    """Append n shifted copies of the per-city rows to df and return the result.

    For every step k in 1..n the non-key columns of df are shifted by -k
    rows within each 'City' group, renamed with the suffix '_-k' and joined
    onto the frame by index. The shifted copy of 'Validity date' is dropped
    after each join. Rows with no neighbour k positions further down in
    their city group get NaN in the '_-k' columns.

    NOTE(review): a shift of -k pulls values from rows that come later in
    the frame, so this only yields "previous days" if each city's rows are
    ordered from newest to oldest - confirm the row order of the input.

    The input frame is not modified; with n == 0 it is returned as is.
    """
    result = df
    for step in range(1, n + 1):
        suffix = f"_{-step}"
        # Always shift the original frame, not the progressively widened one.
        shifted = df.groupby(['City']).shift(-step)
        shifted = shifted.rename(columns=lambda name: name + suffix)
        result = result.join(shifted).drop(columns=f'Validity date{suffix}')
    return result

# Build one shifted data set per window size n = 1..9, for train and test.
_window_sizes = range(1, 10)
train_df_ts_list = [
    merge_n_days(joined_train_12, n_days) for n_days in _window_sizes
]
test_df_ts_list = [
    merge_n_days(joined_test_12, n_days) for n_days in _window_sizes
]

### Train Model

In [12]:
# Fit a linear regression on the paired (Time 1 + Time 2) train features.
# NOTE(review): features and labels are matched purely by row position -
# confirm that train_label.csv has the same number and order of rows as
# joined_train_12.
reg = LinearRegression()
reg.fit(X=joined_train_12, y=train_Tmax_obs['observedMaxTemp'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [13]:
# Predict the test set with the fitted model.
predicted_values = reg.predict(X=joined_test_12)

In [14]:
# Error of the model on the test set

# RMSE: square root of the mean squared difference between the observed
# values and the values predicted by the model.
rmse = np.sqrt(
    metrics.mean_squared_error(
        y_true=test_Tmax_obs['observedMaxTemp'],
        y_pred=predicted_values,
    )
)

print(rmse)

# old: 1.75672391465

1.65650385966


### Save for submission

In [10]:
# Create a new data frame containing 2 columns:
# 1. key - concatenation of the validity date and the city (station) name, joined by '_'
# 2. predicted values

# dates = joined_test['Validity date'].apply(lambda x: datetime.fromtimestamp(x).strftime('%d-%m-%y') + '_')
# cities = leCity.inverse_transform(joined_test['City'])
# prediction_data = pd.DataFrame()
# prediction_data['validityDate_city'] = dates + cities
# prediction_data['predictedMaxTemp'] = predicted_values

# prediction_data.to_csv(r'sampleSubmission.csv', index=False)