In [1]:
import os
import string
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns
from sklearn import datasets
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [2]:
sys.path.append("../../src/.")
from preprocess_data import data_transform_pipeline

In [3]:
# Relative path to the data directory.
DIR = "../../data/"
# Sub-folder plus filename prefix shared by the Subway USA files; used below
# as DIR + SUBWAYUS + "<name>.csv".
SUBWAYUS = "Subway USA/subway_usa_"

In [4]:
# Every Subway USA input file lives under the same folder/filename prefix.
subway_prefix = DIR + SUBWAYUS

# NOTE(review): `demographic` and `sister` are loaded here but are not
# referenced by the merge in the cells visible in this notebook.
demographic = pd.read_csv(subway_prefix + "demographic_variables.csv")
stores = pd.read_csv(subway_prefix + "stores.csv")
poi_variables = pd.read_csv(subway_prefix + "poi_variables.csv")
sister = pd.read_csv(subway_prefix + "competition_sister_variables.csv")
trade_area = pd.read_csv(subway_prefix + "trade_area_variables.csv")

In [5]:
# Join the POI and trade-area tables onto the store table, one after the
# other, matching rows on the "store" column (pandas' default join type).
merged = stores
for extra_table in (poi_variables, trade_area):
    merged = merged.merge(extra_table, on="store")

In [6]:
from preprocess_data import is_climate_feat

def drop_specific_columns(df):
    """Return ``df`` restricted to the columns that survive the exclusion rules.

    A column is excluded when any of the following holds (checked in order):

    * it is the raw counterpart of a percentage column, i.e. its name equals
      the name of some ``*_p_*`` column with ``_p_`` replaced by ``_``;
    * its name contains ``centerxy`` but neither ``full`` nor ``effective``;
    * ``is_climate_feat`` reports it as a climate feature;
    * its name contains ``sports_venues``;
    * its name starts with ``edu`` but not with ``edu_bachplus_p``.

    The number of removed columns is printed, and ``df`` indexed by the
    surviving column names (original order) is returned.
    """
    column_names = df.columns.tolist()

    # Names of raw columns that have a percentage twin ("x_p_y" -> "x_y").
    shadowed_by_percent = {
        name.replace("_p_", "_") for name in column_names if "_p_" in name
    }

    def _is_excluded(name):
        # Same rule order as the docstring; `or` short-circuits like the
        # original chain of `continue` statements.
        return bool(
            name in shadowed_by_percent
            or ("centerxy" in name and "full" not in name and "effective" not in name)
            or is_climate_feat(name)
            # sports venues columns seem to be all zeros
            or "sports_venues" in name
            or (name.startswith('edu') and not name.startswith('edu_bachplus_p'))
        )

    survivors = [name for name in column_names if not _is_excluded(name)]
    print(f'----- Removing {len(column_names) - len(survivors)} columns -----')
    return df[survivors]

In [7]:
# Prune columns using the rules in drop_specific_columns. This rebinds
# `merged`, so re-running the cell filters the already-filtered frame.
merged = drop_specific_columns(merged)

----- Removing 95 columns -----


In [8]:
# Hold out 10% of the rows as the test split; random_state is fixed so the
# split is the same on every run.
train_df, test_df = train_test_split(merged, test_size=0.1, random_state=42)

In [10]:
def agg_veh(df, columns=None):
    """Collapse the per-vehicle-count household shares into one feature.

    Adds ``hh_expected_vehicle_ta``, the weighted sum
    ``sum(k * hh_{k}vehicle_p_ta for k in 1..5)``, and then drops the source
    columns. ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hh_1vehicle_p_ta`` .. ``hh_5vehicle_p_ta`` and every
        column named in ``columns``.
    columns : list of str, optional
        Columns to drop after the aggregate is computed. When omitted or
        empty, the six ``hh_{0..5}vehicle_p_ta`` columns are dropped.

    Raises
    ------
    KeyError
        If a required column is missing (e.g. the function is run twice on
        the same frame, since the inputs are dropped on the first run).
    """
    if columns is None or len(columns) == 0:
        columns = [
            'hh_0vehicle_p_ta', 'hh_1vehicle_p_ta', 'hh_2vehicle_p_ta',
            'hh_3vehicle_p_ta', 'hh_4vehicle_p_ta', 'hh_5vehicle_p_ta',
        ]
    # The parentheses are required: without them the statement ends after the
    # first term and the remaining "+ ..." lines are evaluated and discarded,
    # leaving only the 1-vehicle share in the new column.
    # NOTE(review): if the *_p_* shares are on a 0-100 scale, the result is
    # 100x the expected vehicle count — confirm the intended units.
    df['hh_expected_vehicle_ta'] = (
        df['hh_1vehicle_p_ta'] * 1
        + df['hh_2vehicle_p_ta'] * 2
        + df['hh_3vehicle_p_ta'] * 3
        + df['hh_4vehicle_p_ta'] * 4
        + df['hh_5vehicle_p_ta'] * 5
    )
    # In-place drop is deliberate: callers rely on `df` itself being updated.
    df.drop(columns=columns, inplace=True)

# agg_veh modifies the frame it is given and drops its input columns, so it
# is applied to each split once; re-running this cell raises a KeyError.
agg_veh(train_df)
agg_veh(test_df)

In [11]:
def agg_hh_pers(df, columns=None):
    """Collapse the per-household-size shares into one feature.

    Adds ``hh_expected_pers_ta``, the weighted sum
    ``sum(k * hh_{k}pers_p_ta for k in 1..7)``, and then drops the source
    columns. ``df`` is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``hh_1pers_p_ta`` .. ``hh_7pers_p_ta`` and every column
        named in ``columns``.
    columns : list of str, optional
        Columns to drop after the aggregate is computed. When omitted or
        empty, the seven ``hh_{1..7}pers_p_ta`` columns are dropped.

    Raises
    ------
    KeyError
        If a required column is missing (e.g. the function is run twice on
        the same frame, since the inputs are dropped on the first run).
    """
    if columns is None or len(columns) == 0:
        columns = [
            'hh_1pers_p_ta', 'hh_2pers_p_ta', 'hh_3pers_p_ta',
            'hh_4pers_p_ta', 'hh_5pers_p_ta', 'hh_6pers_p_ta',
            'hh_7pers_p_ta'
        ]
    # The parentheses are required: without them the statement ends after the
    # first term and the remaining "+ ..." lines are evaluated and discarded,
    # leaving only the 1-person share in the new column.
    # NOTE(review): if the *_p_* shares are on a 0-100 scale, the result is
    # 100x the expected household size — confirm the intended units.
    df['hh_expected_pers_ta'] = (
        df['hh_1pers_p_ta'] * 1
        + df['hh_2pers_p_ta'] * 2
        + df['hh_3pers_p_ta'] * 3
        + df['hh_4pers_p_ta'] * 4
        + df['hh_5pers_p_ta'] * 5
        + df['hh_6pers_p_ta'] * 6
        + df['hh_7pers_p_ta'] * 7
    )
    # In-place drop is deliberate: callers rely on `df` itself being updated.
    df.drop(columns=columns, inplace=True)

# agg_hh_pers modifies the frame it is given and drops its input columns, so
# it is applied to each split once; re-running this cell raises a KeyError.
agg_hh_pers(train_df)
agg_hh_pers(test_df)

In [12]:
# Store identifiers of each split; handed to data_transform_pipeline below
# as its train/test index arguments.
train_index = train_df['store']
test_index = test_df['store']

In [13]:
# Columns removed from numeric_features and handed to data_transform_pipeline
# together with categorical_features.
drop_features = ["store", "longitude", "latitude"]
# Disabled candidates carried over from the original list (still not dropped):
#   "__store_latitude", "__store_longitude", "__batch_group",
#   "__errors", "__store_bg", "__success"

# Ordinal columns; presumably each one is paired by position with its
# category order in ordering_ordinal_oth — TODO confirm in preprocess_data.
ordinal_features_oth = ["market_size", "store_density"]
ordering_ordinal_oth = [
    # market_size
    [
        "Very Large Metro (1)",
        "Large Metro (2)",
        "Large City (3)",
        "Medium City (4)",
        "Small City (5)",
        "Small Town (6)",
        "Small Community (7)",
    ],
    # store_density
    # NOTE(review): "Suburban" is ranked before "Light Suburban" — confirm
    # this is the intended order.
    [
        "Rural",
        "Exurban",
        "Suburban",
        "Light Suburban",
        "Light Urban",
        "Urban",
        "Super Urban",
    ],
]

categorical_features = [
    "cbsa_name",
    "dma_name",
    "censusdivision",
    "censusregion",
]

# Numeric columns of the training frame minus drop_features, kept in the
# frame's own column order. The previous set difference produced an order
# that depends on per-process string hashing, so the feature order could
# change between kernel restarts. dict.fromkeys de-duplicates labels (as the
# set did) while preserving first-seen order.
numeric_features = [
    col
    for col in dict.fromkeys(train_df.select_dtypes(include=np.number).columns)
    if col not in drop_features
]

In [14]:
# Run the project's preprocessing pipeline on both splits. The role of each
# positional argument is inferred from the variable names only — TODO confirm
# against preprocess_data.data_transform_pipeline.
processed_train, processed_test = data_transform_pipeline(
    train_df, 
    test_df, 
    train_index,
    test_index,
    # categorical columns are passed in the same list as drop_features
    drop_features + categorical_features, 
    ordinal_features_oth, 
    ordering_ordinal_oth, 
    # NOTE(review): empty list for this slot; its meaning is not visible here
    [], 
    numeric_features
)

In [17]:
# Sanity check: display the dimensions of the processed training frame.
processed_train.shape

(12944, 198)

In [18]:
# Write both processed splits next to the raw Subway USA files; the frame
# index is written as well (to_csv default).
output_prefix = DIR + SUBWAYUS
processed_train.to_csv(output_prefix + "processed_train.csv")
processed_test.to_csv(output_prefix + "processed_test.csv")