In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from datetime import datetime

In [2]:
# Load every input table from CSV files located in the working directory

# Training split: max-temperature and min-temperature tables
Tmax_train, Tmin_train = pd.read_csv(r'train_maxTemp.csv'), pd.read_csv(r'train_minTemp.csv')

# Test split: same two tables
Tmax_test, Tmin_test = pd.read_csv(r'test_maxTemp.csv'), pd.read_csv(r'test_minTemp.csv')

# Observed values (the regression targets) for each split
train_Tmax_obs, test_Tmax_obs = pd.read_csv(r'train_label.csv'), pd.read_csv(r'test_label.csv')

In [3]:
# Keep only the morning records: 'Time' is 1 for morning and 2 for evening,
# so every row whose 'Time' is not 1 is dropped from all four tables.

Tmax_train = Tmax_train[Tmax_train['Time'] == 1]
Tmin_train = Tmin_train[Tmin_train['Time'] == 1]
Tmax_test = Tmax_test[Tmax_test['Time'] == 1]
Tmin_test = Tmin_test[Tmin_test['Time'] == 1]

In [4]:
# Join the max- and min-temperature tables on the keys: Time, Validity date, City.
# The merge uses pandas' default inner join, so only key combinations present in
# both tables are kept. Non-key columns that exist in both tables receive the
# '_x' (max table) and '_y' (min table) suffixes, e.g. 'Basis date_x'.

joined_train = Tmax_train.merge(Tmin_train, on=['Time', 'Validity date', 'City'])
joined_test = Tmax_test.merge(Tmin_test, on=['Time', 'Validity date', 'City'])

In [5]:
# Convert the string columns to numeric values (the model can only be fitted on numeric features)

# Convert the date columns ('Basis date_x', 'Basis date_y', 'Validity date') to POSIX timestamps
def castDates(df, columns=('Basis date_x', 'Basis date_y', 'Validity date'), date_format='%d-%m-%y'):
    """Replace date strings with POSIX timestamps (floats), modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose date columns hold strings matching ``date_format``.
    columns : iterable of str, optional
        Names of the columns to convert. Defaults to the three date columns
        of the joined max/min temperature frame.
    date_format : str, optional
        ``datetime.strptime`` format of the date strings (day-month-2-digit-year
        by default).

    Returns
    -------
    None
        The listed columns of ``df`` are overwritten; all other columns are
        left untouched.

    Raises
    ------
    KeyError
        If a listed column is missing from ``df``.
    ValueError
        If a value does not match ``date_format``.

    Notes
    -----
    The parsed datetimes are naive, so ``timestamp()`` interprets them in the
    machine's local time zone. Decode the values with
    ``datetime.fromtimestamp`` (also local time) to get the original date back.
    The function is not idempotent: calling it twice on the same frame fails
    because the columns no longer contain strings.
    """
    def to_timestamp(text):
        # Midnight of the parsed day, expressed as seconds since the epoch
        return datetime.strptime(text, date_format).timestamp()

    for column in columns:
        df[column] = df[column].apply(to_timestamp)

# Encode city names as integer labels. The encoder learns its classes from the
# training cities only, so a city that appears only in the test set makes
# transform() raise.
leCity = preprocessing.LabelEncoder()

# Training frame: learn the city labels, encode them, then convert the dates
joined_train['City'] = leCity.fit_transform(joined_train['City'])
castDates(joined_train)

# Test frame: reuse the fitted encoder, then convert the dates
joined_test['City'] = leCity.transform(joined_test['City'])
castDates(joined_test)

In [6]:
# Fit a linear regression that uses every column of the joined training frame
# as a feature and the observed maximum temperature as the target.
# NOTE(review): this relies on the rows of train_label.csv being in the same
# order as the rows of joined_train - confirm against the data files.
reg = LinearRegression()
reg.fit(X=joined_train, y=train_Tmax_obs['observedMaxTemp'])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
# Predict the maximum temperature for every row of the joined test frame

predicted_values = reg.predict(X=joined_test)

In [8]:
# Accuracy of the model

# Root-mean-square error: the square root of the mean squared difference
# between the observed test values and the model's predictions
mse = metrics.mean_squared_error(y_true=test_Tmax_obs['observedMaxTemp'], y_pred=predicted_values)
rmse = np.sqrt(mse)

print(rmse)

1.75672391465


In [10]:
# Build the submission frame with 2 columns:
# 1. validityDate_city - the validity date (dd-mm-yy), an underscore, then the city name
# 2. predictedMaxTemp  - the model's predicted values

# Decode the timestamps back to date strings and the integer labels back to city names
dates = joined_test['Validity date'].apply(
    lambda stamp: datetime.fromtimestamp(stamp).strftime('%d-%m-%y') + '_'
)
cities = leCity.inverse_transform(joined_test['City'])

prediction_data = pd.DataFrame({
    'validityDate_city': dates + cities,
    'predictedMaxTemp': predicted_values,
})

prediction_data.to_csv(r'sampleSubmission.csv', index=False)