In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
sales = pd.read_csv('../data/train.csv', parse_dates=['date'])
keys = pd.read_csv('../data/key.csv')
weather = pd.read_csv('../data/weather.csv', parse_dates=['date'])
df_1 = pd.merge(weather, keys)
df_1 = pd.merge(df_1, sales)

dates = df_1['date'].dt
df_1['year'] = dates.year
df_1['month'] = dates.month
df_1['day'] = dates.day

final_sample = pd.read_csv('../data/01. final_sample')
trimmed = final_sample.iloc[:, 1:]

In [3]:
def item_nbr_tendency_finely(store_nbr, year, month,graph=True):
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year) &
                 (df_1['month'] == month)]

    pivot = store.pivot_table(index='item_nbr',
                              columns='day',
                              values='units',
                              aggfunc=np.sum)
    zero_index = pivot == 0
    pivot = pivot[pivot != 0].dropna(axis=0, how='all')
    pivot[zero_index] = 0

    if graph:
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot, cmap="YlGnBu", annot=True, fmt='.0f')
        plt.show()
    
    return pivot

In [4]:
def weather_tendency(store_nbr, year, month):
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year) &
                 (df_1['month'] == month)]
    
    store = store.drop(labels=['item_nbr','units'],axis=1)
#     store = store.iloc[:,:]
    
    store = store.drop_duplicates(keep='first').reset_index(drop=True)
    
    store.index.name='day'
    
    store.index = store['day'].astype(int)
    
    return store
    

In [279]:
weather = weather_tendency(7,2012,12)
units = item_nbr_tendency_finely(7,2012,12,graph=False)

In [281]:
a = weather['tmax'].copy()
b = units.loc[5].copy()

missing_index = (a.str.contains('M')) | (a.str.contains('T'))

a[missing_index]=np.nan
b[missing_index]=np.nan

a.dropna(axis=0,inplace=True)
b.dropna(axis=0,inplace=True)

sp.stats.pearsonr(a.astype(float),b)

(0.0086810018764450388, 0.9643520046708256)

In [76]:
def get_correlation(store_nbr, year, month):
    correlation = dict()

    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year) &
                 (df_1['month'] == month)]

    units_table = item_nbr_tendency_finely(store_nbr, year, month, graph=False)
    weather_table = weather_tendency(store_nbr, year, month)

    weather = list(weather_table.columns[3:20])
    weather.remove('codesum')

    item_nbr = units_table.index

#     print(weather)
#     print(item_nbr)

    inner=dict()
    
    for feature in weather:
        for units in item_nbr:
            a = weather_table[feature].copy()
            b = units_table.loc[units].copy()

            missing_index = (a.str.contains('M')) | (a.str.contains('T')) | (a.str.contains('-')) | (a.str.contains(' '))

            a[missing_index] = np.nan
            b[missing_index] = np.nan

            a.dropna(axis=0, inplace=True)
            b.dropna(axis=0, inplace=True)

#             coef = list(sp.stats.pearsonr(a.astype(float), b))
#             coef[0] = round(coef[0],3)
#             coef[1] = round(coef[1],3)
            
            inner[(units, feature)] = sp.stats.pearsonr(a.astype(float), b)
        
            correlation[(store_nbr, year,month)] = inner

#             print(year, month, feature, units)
#             print('\t',sp.stats.pearsonr(a.astype(float),b))
    return correlation

In [232]:
%%timeit
tmp = get_correlation(7,2012,1)

  r = r_num / r_den
  x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0


644 ms ± 5.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# 2012, 2013, 2014 총 3년
# 1~12월 총 12월
# 1~45 총 45개 store
# 3 * 12 * 45 = 1620

# 한번 도는데 644ms
# 1620 * 644ms = 1043s
# 대략 17분정도 걸림

In [77]:
store_nbrs = range(7,8)
years=range(2012,2013)
months = range(1,13)

total_pearsonr = dict()

for store_nbr in store_nbrs:
    for year in years:
        for month in months:
#             print(year,month)
            pearsonr = get_correlation(store_nbr,year,month)
            total_pearsonr.update(pearsonr)

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den
  x = np.where(x < 1.0, x, 1.0)  # if x > 1 then return 1.0


In [80]:
total_pearsonr

{(7, 2012, 1): {(5, 'avgspeed'): (0.042503717135183479, 0.82040189729590707),
  (5, 'cool'): (nan, nan),
  (5, 'depart'): (0.5119663121698399, 0.24013397151512353),
  (5, 'dewpoint'): (-0.038734594632172083, 0.83610364758253508),
  (5, 'heat'): (0.049238567017267074, 0.7961076749769358),
  (5, 'preciptotal'): (-0.23237068019040477, 0.26367087127759092),
  (5, 'resultdir'): (0.029941837657235609, 0.87296464327166134),
  (5, 'resultspeed'): (0.027809605650566827, 0.95829634513944617),
  (5, 'sealevel'): (0.21280438859026965, 0.25039967381534584),
  (5, 'snowfall'): (nan, 1.0),
  (5, 'stnpressure'): (0.19124688902855597, 0.30272299760175786),
  (5, 'sunrise'): (0.30414078916986109, 0.096218697876837678),
  (5, 'sunset'): (-0.44479194839150554, 0.012171401581882318),
  (5, 'tavg'): (-0.028323235385296647, 0.88189258527814574),
  (5, 'tmin'): (0.16571074870469474, 0.37298007072755035),
  (5, 'wetbulb'): (-0.028420239502636703, 0.87937231278533601),
  (7, 'avgspeed'): (0.072051447542321512, 

In [79]:
total_pearsonr[(7,2012,1)]

{(5, 'avgspeed'): (0.042503717135183479, 0.82040189729590707),
 (5, 'cool'): (nan, nan),
 (5, 'depart'): (0.5119663121698399, 0.24013397151512353),
 (5, 'dewpoint'): (-0.038734594632172083, 0.83610364758253508),
 (5, 'heat'): (0.049238567017267074, 0.7961076749769358),
 (5, 'preciptotal'): (-0.23237068019040477, 0.26367087127759092),
 (5, 'resultdir'): (0.029941837657235609, 0.87296464327166134),
 (5, 'resultspeed'): (0.027809605650566827, 0.95829634513944617),
 (5, 'sealevel'): (0.21280438859026965, 0.25039967381534584),
 (5, 'snowfall'): (nan, 1.0),
 (5, 'stnpressure'): (0.19124688902855597, 0.30272299760175786),
 (5, 'sunrise'): (0.30414078916986109, 0.096218697876837678),
 (5, 'sunset'): (-0.44479194839150554, 0.012171401581882318),
 (5, 'tavg'): (-0.028323235385296647, 0.88189258527814574),
 (5, 'tmin'): (0.16571074870469474, 0.37298007072755035),
 (5, 'wetbulb'): (-0.028420239502636703, 0.87937231278533601),
 (7, 'avgspeed'): (0.072051447542321512, 0.70010318746201294),
 (7, 'coo