### Approach 
* Import the dataframes train and weather.
* Look at the available set of features.
* Build new features using feature interaction.(Couln't do because of time constraint)
* Merge the dataframes train and weather.
* Split the dataset into train and test for each grain

In [44]:
import pandas as pd
import math
import numpy as np

In [13]:
train_df = pd.read_csv('./csv_files/train.csv')
weather_df = pd.read_csv('./csv_files/weather_df.csv')
key_df = pd.read_csv("./csv_files/key.csv")

* Creating grain column

In [14]:
train_df['grain'] = train_df['store_nbr'].astype('str') + '_' + train_df['item_nbr'].astype('str')

In [15]:
train_df.head()

Unnamed: 0,date,store_nbr,item_nbr,units,grain
0,2012-01-01,1,1,0,1_1
1,2012-01-01,1,2,0,1_2
2,2012-01-01,1,3,0,1_3
3,2012-01-01,1,4,0,1_4
4,2012-01-01,1,5,0,1_5


In [22]:
weather_df.drop(columns=['codesum'],inplace=True)

In [24]:
weather_df.columns

Index(['station_nbr', 'date', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'snowfall',
       'preciptotal', 'stnpressure', 'sealevel', 'resultspeed', 'resultdir',
       'avgspeed', 'codesum_rev'],
      dtype='object')

### Merging the datasets

In [25]:
train_key_merge_df = pd.merge(train_df,key_df,on='store_nbr',how='left')
train_weather_df = pd.merge(train_key_merge_df, weather_df, on = ['station_nbr','date'], how='inner')

In [26]:
train_weather_df.head()

Unnamed: 0,date,store_nbr,item_nbr,units,grain,station_nbr,tmax,tmin,tavg,depart,...,sunrise,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,codesum_rev
0,2012-01-01,1,1,0,1_1,1,52.0,31.0,42.0,-503.825359,...,-1000,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,"['RA', 'FZ', 'FG', 'BR']"
1,2012-01-01,1,2,0,1_2,1,52.0,31.0,42.0,-503.825359,...,-1000,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,"['RA', 'FZ', 'FG', 'BR']"
2,2012-01-01,1,3,0,1_3,1,52.0,31.0,42.0,-503.825359,...,-1000,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,"['RA', 'FZ', 'FG', 'BR']"
3,2012-01-01,1,4,0,1_4,1,52.0,31.0,42.0,-503.825359,...,-1000,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,"['RA', 'FZ', 'FG', 'BR']"
4,2012-01-01,1,5,0,1_5,1,52.0,31.0,42.0,-503.825359,...,-1000,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,"['RA', 'FZ', 'FG', 'BR']"


### Encoding codesum_rev to create extreme and mild weather conditions

In [28]:
def check_weather_conditions(x,is_mild=True):
    if is_mild:
        mild_weather_cond = ['DZ','RA','SN','SG','BR','HZ']
        for weather_cond in mild_weather_cond:
            if weather_cond in x:
                return 1
            else:
                return 0 
    else:
        extreme_weather_cond = ['TS','FC','SS','DS','FG','VA','GR','FU','DU']
        for weather_cond in extreme_weather_cond:
            if weather_cond in x:
                return 1
            else:
                return 0

In [30]:
train_weather_df['extreme_weather'] = train_weather_df['codesum_rev'].apply(lambda x : check_weather_conditions(x,is_mild=False))
train_weather_df['mild_weather'] = train_weather_df['codesum_rev'].apply(lambda x : check_weather_conditions(x))
train_weather_df.drop(columns=['codesum_rev'],inplace=True)

In [31]:
train_weather_df.head()

Unnamed: 0,date,store_nbr,item_nbr,units,grain,station_nbr,tmax,tmin,tavg,depart,...,sunset,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,extreme_weather,mild_weather
0,2012-01-01,1,1,0,1_1,1,52.0,31.0,42.0,-503.825359,...,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,0,0
1,2012-01-01,1,2,0,1_2,1,52.0,31.0,42.0,-503.825359,...,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,0,0
2,2012-01-01,1,3,0,1_3,1,52.0,31.0,42.0,-503.825359,...,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,0,0
3,2012-01-01,1,4,0,1_4,1,52.0,31.0,42.0,-503.825359,...,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,0,0
4,2012-01-01,1,5,0,1_5,1,52.0,31.0,42.0,-503.825359,...,-1000,-201.753454,0.05,29.78,29.92,3.6,20.0,4.6,0,0


### Creating date time features

In [34]:
def create_individual_date_cols(ts):
    # converting to datetime
    ts['day'] = ts.date.dt.day
    ts['wday'] = ts.date.dt.weekday
    ts['year'] = ts.date.dt.year
    ts['month'] = ts.date.dt.month
    ts['week'] = ts.date.dt.isocalendar().week
    return ts

In [36]:
train_weather_df['date'] = pd.to_datetime(train_weather_df['date'])

In [37]:
train_weather_df = create_individual_date_cols(train_weather_df)

### Let's just first try with these data first

#### Splitting the data into train and test for each grain

In [54]:
train_data = pd.DataFrame()
val_data = pd.DataFrame()

In [None]:
grains = train_weather_df['grain'].unique()
index = 0
for grain in grains:
    grain_index = train_weather_df[train_weather_df['grain'] == grain].index 
    train_len = math.floor(len(grain_index)  * 0.9)
    
    train_index = grain_index[:train_len]
    val_index = grain_index[train_len:]
    
    if index == 0:
        train_data = train_weather_df.loc[train_index,:]
        val_data = train_weather_df.loc[val_index,:]
    else:
        train_data = pd.concat([train_data, train_weather_df.loc[train_index,:]])
        val_data = pd.concat([val_data, train_weather_df.loc[val_index,:]])
        
    index += 1
    print("Train data shape : ",train_data.shape)
    print("Validation data shape : ",val_data.shape)

* The Above process of splitting the data took three and a half hours

In [60]:
train_data.to_csv('./csv_files/train_data.csv')
val_data.to_csv('./csv_files/val_data.csv')