In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import gc
import os
import random
import lightgbm as lgb
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import datetime
%matplotlib inline

In [6]:
# normal reduce memory usage function
def reduce_mem_usage(df, use_float16 = False):
    """
    Iterate through all the columns of a dataframe and modify the data type to reduce memory usage.        
    """
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [10]:
# add timestamp and reduce memory
def df_process(df, reduce = True, use_float16 = False):
    try:
        df["timestamp"] = pd.to_datetime(df["timestamp"], format = '%Y-%m-%d %H:%M:%S')
    except:
        print('There is no timestamp column')
    # reduce memory
    if reduce:
        df = reduce_mem_usage(df, use_float16 = use_float16)
    return df

In [37]:
# fill weather by group of site day and month and use the mean to fill
def fill_weather_dataset(weather_df):
    
    # Find Missing Dates
    time_format = "%Y-%m-%d %H:%M:%S"
    start_date = datetime.datetime.strptime(weather_df['timestamp'].min(),time_format)
    end_date = datetime.datetime.strptime(weather_df['timestamp'].max(),time_format)
    total_hours = int(((end_date - start_date).total_seconds() + 3600) / 3600)
    hours_list = [(end_date - datetime.timedelta(hours=x)).strftime(time_format) for x in range(total_hours)]

    missing_hours = []
    for site_id in range(16):
        site_hours = np.array(weather_df[weather_df['site_id'] == site_id]['timestamp'])
        new_rows = pd.DataFrame(np.setdiff1d(hours_list,site_hours),columns=['timestamp'])
        new_rows['site_id'] = site_id
        weather_df = pd.concat([weather_df,new_rows])

        weather_df = weather_df.reset_index(drop=True)           

    # Add new Features
    weather_df["datetime"] = pd.to_datetime(weather_df["timestamp"])
    weather_df["day"] = weather_df["datetime"].dt.day
    weather_df["week"] = weather_df["datetime"].dt.week
    weather_df["month"] = weather_df["datetime"].dt.month
    

    weather_df["timestamp"] = pd.to_datetime(weather_df["timestamp"], format='%Y-%m-%d %H:%M:%S')

    weather_df['local_time'] = weather_df['timestamp']
    
    # Reset Index for Fast Update
    weather_df = weather_df.set_index(['site_id','day','month'])

    air_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['air_temperature'].mean(),columns=["air_temperature"])
    weather_df.update(air_temperature_filler,overwrite=False)

    # Step 1
    cloud_coverage_filler = weather_df.groupby(['site_id','day','month'])['cloud_coverage'].mean()
    # Step 2
    cloud_coverage_filler = pd.DataFrame(cloud_coverage_filler.fillna(method='ffill'),columns=["cloud_coverage"])

    weather_df.update(cloud_coverage_filler,overwrite=False)

    due_temperature_filler = pd.DataFrame(weather_df.groupby(['site_id','day','month'])['dew_temperature'].mean(),columns=["dew_temperature"])
    weather_df.update(due_temperature_filler,overwrite=False)

    # Step 1
    sea_level_filler = weather_df.groupby(['site_id','day','month'])['sea_level_pressure'].mean()
    # Step 2
    sea_level_filler = pd.DataFrame(sea_level_filler.fillna(method='ffill'),columns=['sea_level_pressure'])

    weather_df.update(sea_level_filler,overwrite=False)

    wind_direction_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_direction'].mean(),columns=['wind_direction'])
    weather_df.update(wind_direction_filler,overwrite=False)

    wind_speed_filler =  pd.DataFrame(weather_df.groupby(['site_id','day','month'])['wind_speed'].mean(),columns=['wind_speed'])
    weather_df.update(wind_speed_filler,overwrite=False)

    # Step 1
    precip_depth_filler = weather_df.groupby(['site_id','day','month'])['precip_depth_1_hr'].mean()
    # Step 2
    precip_depth_filler = pd.DataFrame(precip_depth_filler.fillna(method='ffill'),columns=['precip_depth_1_hr'])

    weather_df.update(precip_depth_filler,overwrite=False)

    weather_df = weather_df.reset_index()
    weather_df = weather_df.drop(['datetime','day','week','month'],axis=1)
        
    return weather_df

In [38]:
df_train_init_weather = df_process(fill_weather_dataset(pd.read_csv('../Resources/weather_train.csv')), use_float16=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


Memory usage of dataframe is 10.72 MB
Memory usage after optimization is: 4.16 MB
Decreased by 61.2%


In [12]:
# process the data
df_train = df_process(pd.read_csv('../Resources/train.csv'), use_float16 = True)
df_building = df_process(pd.read_csv('../Resources/building_metadata.csv'), use_float16 = True)
# df_train_weather = df_process(pd.read_csv('../Large_output/csv/train_weather_filled_1.csv'))

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 289.19 MB
Decreased by 53.1%
There is no timestamp column
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.8%


In [13]:
df_train = df_train.loc[df_train['building_id'] != 1099,:]
df_train = df_train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

In [14]:
df_train_merge = df_train.merge(df_building, on='building_id', how='left')

In [56]:
# add the hodliday and log the square feet and use label encoder to change primary_use to numeric
def prepare_data(X, test=False, has_month = True):
    X.sort_values("timestamp")
    X.reset_index(drop=True)
    gc.collect()
    X.local_time = pd.to_datetime(X.local_time, format="%Y-%m-%d %H:%M:%S")
    holidays = ["2016-01-01", "2016-01-18", "2016-02-15", "2016-05-30", "2016-07-04",
                    "2016-09-05", "2016-10-10", "2016-11-11", "2016-11-24", "2016-12-26",
                    "2017-01-02", "2017-01-16", "2017-02-20", "2017-05-29", "2017-07-04",
                    "2017-09-04", "2017-10-09", "2017-11-10", "2017-11-23", "2017-12-25",
                    "2018-01-01", "2018-01-15", "2018-02-19", "2018-05-28", "2018-07-04",
                    "2018-09-03", "2018-10-08", "2018-11-12", "2018-11-22", "2018-12-25",
                    "2019-01-01"]    
    X["is_holiday"] = (X.local_time.isin(holidays)).astype(int)
    le = LabelEncoder()
    X['primary_use'] = le.fit_transform(X['primary_use'])
    X.timestamp = pd.to_datetime(X.timestamp, format="%Y-%m-%d %H:%M:%S")
    X["hour"] = X.local_time.dt.hour
    X["weekend"] = X.local_time.dt.weekday
    X["month"] = X.local_time.dt.month
    X['square_feet']=np.log1p(X['square_feet'])
    drop_features = ['timestamp', 'floor_count', 'year_built', 'local_time',
                     'wind_speed', 'wind_direction', 'sea_level_pressure']
    if not has_month:
        drop_features.append('month')
    X.drop(drop_features, axis = 1, inplace = True)
    if test:
        row_ids = X.row_id
        X.drop("row_id", axis=1, inplace=True)
        return X, row_ids
    else:
        y = np.log1p(X.meter_reading)
        X.drop("meter_reading", axis=1, inplace=True)
        return X, y

In [57]:
df_train_1 = pd.merge(df_train_merge, df_train_init_weather,
                      on=['site_id', 'timestamp'], how='left')
X, y = prepare_data(df_train_1)
X.to_csv('../../../Large_output/init_train_merge.csv')

0           3.190624
1           0.318163
2           0.000000
3           5.171529
4           4.524668
              ...   
19852315    2.277267
19852316    1.762159
19852317    0.000000
19852318    5.078761
19852319    1.348073
Name: meter_reading, Length: 19852320, dtype: float32
