In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
sales = pd.read_csv('../data/train.csv', parse_dates=['date'])
keys = pd.read_csv('../data/key.csv')
weather = pd.read_csv('../data/weather.csv', parse_dates=['date'])
df_1 = pd.merge(weather, keys)
df_1 = pd.merge(df_1, sales)

dates = df_1['date'].dt
df_1['year'] = dates.year
df_1['month'] = dates.month
df_1['day'] = dates.day

final_sample = pd.read_csv('../data/01. final_sample')
trimmed = final_sample.iloc[:, 1:]

In [41]:
def item_nbr_tendency_finely(store_nbr, year, month=-1, graph=True):
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year)]

    if month != -1:
        store = store[store['month'] == month]
        token = 'day'

    else:
        token = 'date'

    pivot = store.pivot_table(index='item_nbr',
                              columns=token,
                              values='units',
                              aggfunc=np.sum)

    zero_index = pivot == 0
    pivot = pivot[pivot != 0].dropna(axis=0, how='all')
    pivot[zero_index] = 0

    if graph:
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot, cmap="YlGnBu", annot=True, fmt='.0f')
        plt.show()

    return pivot

In [42]:
def weather_tendency(store_nbr, year, month=-1):
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year)]
    
    if month!=-1:
        store = store[store['month']==month]
    
    store = store.drop(labels=['item_nbr','units'],axis=1)
#     store = store.iloc[:,:]
    
    store = store.drop_duplicates(keep='first').reset_index(drop=True)
    
    if month != -1:
        store.index.name='day'
        store.index = store['day'].astype(int)

    else:
        store.index.name='date'
        store.index = store['date']
        
    return store
    

In [38]:
def get_correlation(store_nbr, year, month):
    correlation = dict()

    units_table = item_nbr_tendency_finely(store_nbr, year, month, graph=False)
    weather_table = weather_tendency(store_nbr, year, month)

    weather = list(weather_table.columns[3:20])
    weather.remove('codesum')

    item_nbr = units_table.index

    inner=dict()
    
    for feature in weather:
        for units in item_nbr:
            a = weather_table[feature].copy()
            b = units_table.loc[units].copy()

            missing_index = (a.str.contains('M')) | (a.str.contains('T')) | (a.str.contains('-')) | (a.str.contains(' '))

            a[missing_index] = np.nan
            b[missing_index] = np.nan

            a.dropna(axis=0, inplace=True)
            b.dropna(axis=0, inplace=True)

            inner[(units, feature)] = sp.stats.pearsonr(a.astype(float), b)
        
            correlation[(store_nbr, year,month)] = inner

    return correlation

In [78]:
def get_correlation_year(store_nbr,year):
    
    correlation = dict()

    units_table = item_nbr_tendency_finely(store_nbr,year,-1,graph=False)
    weather_table = weather_tendency(store_nbr,year,-1)
    
    weather = list(weather_table.columns[3:20])
    weather.remove('codesum')

    item_nbr = units_table.index

#     print(weather)
#     print(item_nbr)

    inner=dict()
    
    for feature in weather:
        for units in item_nbr:
            a = weather_table[feature].copy()
            b = units_table.loc[units].copy()

            missing_index = (a.str.contains('M')) | (a.str.contains('T')) | (a.str.contains('-')) | (a.str.contains(' '))

            a[missing_index] = np.nan
            b[missing_index] = np.nan

            a.dropna(axis=0, inplace=True)
            b.dropna(axis=0, inplace=True)

#             coef = list(sp.stats.pearsonr(a.astype(float), b))
#             coef[0] = round(coef[0],3)
#             coef[1] = round(coef[1],3)
            
            inner[(units, feature)] = sp.stats.pearsonr(a.astype(float), b)
        
            correlation[(store_nbr, year)] = inner

#             print(year, month, feature, units)
#             print('\t',sp.stats.pearsonr(a.astype(float),b))
    return correlation

In [79]:
tmp = get_correlation_year(7,2012)

  r = r_num / r_den


In [80]:
tmp

{(7, 2012): {(5, 'avgspeed'): (0.007701007672754401, 0.8835877232206184),
  (5, 'cool'): (0.15931682957566326, 0.04555753753336412),
  (5, 'depart'): (0.1405005791655457, 0.2972089836562284),
  (5, 'dewpoint'): (-0.07568296514313719, 0.14901153042300766),
  (5, 'heat'): (0.19739142794896797, 0.00016024297777646098),
  (5, 'preciptotal'): (0.005492044163217863, 0.9226570376007677),
  (5, 'resultdir'): (-0.02066557484459113, 0.6939479807728344),
  (5, 'resultspeed'): (-0.04791485592891599, 0.7092147208292058),
  (5, 'sealevel'): (0.11301900344383121, 0.030870363476181134),
  (5, 'snowfall'): (nan, 1.0),
  (5, 'stnpressure'): (0.1100190187502882, 0.03588900909169614),
  (5, 'sunrise'): (0.1079989467534099, 0.03918155102035344),
  (5, 'sunset'): (-0.013822858594614783, 0.7924025289760432),
  (5, 'tavg'): (-0.12137801589834839, 0.02107049389731149),
  (5, 'tmin'): (-0.07751359130214516, 0.13994389337994312),
  (5, 'wetbulb'): (-0.09331104635663716, 0.0754007127544108),
  (7, 'avgspeed'): (0

In [39]:
store_nbrs = range(7,8)
years=range(2012,2013)
months = range(1,13)

total_pearsonr = dict()

for store_nbr in store_nbrs:
    for year in years:
        for month in months:
#             print(year,month)
            pearsonr = get_correlation(store_nbr,year,month)
            total_pearsonr.update(pearsonr)

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den
