In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
sales = pd.read_csv('../data/train.csv', parse_dates=['date'])
keys = pd.read_csv('../data/key.csv')
weather = pd.read_csv('../data/weather.csv', parse_dates=['date'])
df_1 = pd.merge(weather, keys)
df_1 = pd.merge(df_1, sales)

dates = df_1['date'].dt
df_1['year'] = dates.year
df_1['month'] = dates.month
df_1['day'] = dates.day

final_sample = pd.read_csv('../data/01. final_sample')
trimmed = final_sample.iloc[:, 1:]

In [3]:
def item_nbr_tendency_finely(store_nbr, year, month_start=-1, month_end=-1, graph=True):
    '''
    input
        1. store_nbr = 스토어 번호
        2. year = 연도
        3. month_start = 시작달
        4. month_start = 끝달
        5. graph = 위의 정보에 대한 item_nbr 그래프 출력여부
    
    output
        1. store_nbr, year, month로 filtering한 item_nbr의 pivot 테이블
    '''
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year)]

    if month_start != -1:
        if month_end == -1:
            month_end = month_start + 1
        store = store[(month_start <= store['month']) & (store['month'] < month_end)]

    pivot = store.pivot_table(index='item_nbr',
                              columns='date',
                              values='units',
                              aggfunc=np.sum)

    zero_index = pivot == 0
    pivot = pivot[pivot != 0].dropna(axis=0, how='all')
    pivot[zero_index] = 0

    if graph:
        plt.figure(figsize=(12, 8))
        sns.heatmap(pivot, cmap="YlGnBu", annot=True, fmt='.0f')
        plt.show()

    return pivot

In [4]:
def weather_tendency(store_nbr, year, month_start = -1, month_end = -1):
    '''
    input:
        위와 같음
        
    output:
        위의 정보로 filtering한 train,key,weather DataFrame
    '''
    store = df_1[(df_1['store_nbr'] == store_nbr) &
                 (df_1['year'] == year)]
    
    if month_start!=-1:
        if month_end == -1:
            month_end = month_start + 1
        store = store[(month_start <= store['month']) & (store['month'] < month_end)]
    
    store = store.drop(labels=['item_nbr','units'],axis=1)
#     store = store.iloc[:,:]
    
    store = store.drop_duplicates(keep='first').reset_index(drop=True)
    
    store.index.name='date'
    store.index = store['date']
        
    return store
    

In [103]:
def get_correlation(store_nbr, year, month_start=-1, month_end=-1):
    '''
    input:
        위와 같음
    output:
        missing, tracing data를 제외한
        팔린 item_nbr별 각 weather feature에 대한 pearsonr, pvalue를 담은 dictionary
    '''

    '''
    Use-case
        1. 7번 store, 2012년 1월부터 12월까지 item_nbr별 상관관계를 알고 싶다
        get_correlation(7,2012)
        
        2. 7번 store, 2012년 1월부터 3월까지 item_nbr별 상관관계를 알고 싶다
        get_correlation(7,2012,1,3)
        
        3. 7번 store, 2012년 5월만 item_nbr별 상관관계를 알고 싶다
        get_correlation(7,2012,5)
    '''
    correlation = dict()

    units_table = item_nbr_tendency_finely(
        store_nbr, year, month_start, month_end, graph=False)
    weather_table = weather_tendency(store_nbr, year, month_start, month_end)

    weather = list(weather_table.columns[3:20])
    weather.remove('codesum')

    item_nbr = units_table.index

    inner = dict()

    for feature in weather:
        for units in item_nbr:

#             print(feature, units)

            a = weather_table[feature].copy()
            b = units_table.loc[units].copy()

            a = a.apply(lambda x: x.replace(' ', ''))
            a = a.apply(lambda x: x.replace('-', ''))

            missing_index = (a.str.contains('M')) | (a.str.contains('T'))
            nothing_index = a.apply(lambda x: len(x)) == 0

            a[missing_index] = np.nan
            b[missing_index] = np.nan

            a[nothing_index] = np.nan
            b[nothing_index] = np.nan
            
            a.dropna(axis=0, inplace=True)
            b.dropna(axis=0, inplace=True)

            inner[(units, feature, len(a))] = sp.stats.pearsonr(a.astype(float), b)

            correlation[(store_nbr, year, month_start, month_end)] = inner

    return correlation

In [98]:
def show_me_pearson(pearson_dict):
    '''
    input:
        get_correlation의 return (상관관계 dictionary)
    output:
        dictionary를 직관적으로 출력함
    '''
    for key,val in pearson_dict.items():
        print(key)
        
        keys = list(val.keys())
        
        keys.sort()
        
        for each in keys:
            print('\t',each,val[each])

In [39]:
store_nbrs = range(7,8)
years=range(2012,2013)
months = range(1,13)

total_pearsonr = dict()

for store_nbr in store_nbrs:
    for year in years:
        for month in months:
#             print(year,month)
            pearsonr = get_correlation(store_nbr,year,month)
            total_pearsonr.update(pearsonr)

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den


In [104]:
show_me_pearson(get_correlation(1,2012))

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den


(1, 2012, -1, -1)
	 (9, 'avgspeed', 364) (0.029413892409414884, 0.5759077562060245)
	 (9, 'cool', 362) (-0.06507347742720462, 0.21678034409391772)
	 (9, 'depart', 0) (nan, nan)
	 (9, 'dewpoint', 363) (-0.16263173725428326, 0.0018798601109756173)
	 (9, 'heat', 362) (0.1962103722989204, 0.00017208151618507012)
	 (9, 'preciptotal', 303) (-0.07761708837800624, 0.17781222909816158)
	 (9, 'resultdir', 363) (0.05088556611367485, 0.33365257830459105)
	 (9, 'resultspeed', 363) (0.048113484264033224, 0.36068909407025174)
	 (9, 'sealevel', 361) (0.10930797729412822, 0.03790572490368731)
	 (9, 'snowfall', 0) (nan, nan)
	 (9, 'stnpressure', 363) (0.10580652394994502, 0.04394758191932755)
	 (9, 'sunrise', 0) (nan, nan)
	 (9, 'sunset', 0) (nan, nan)
	 (9, 'tavg', 362) (-0.1777482034736896, 0.0006805793222412667)
	 (9, 'tmin', 364) (-0.1657085413387121, 0.00151077966541003)
	 (9, 'wetbulb', 360) (-0.16325684039123894, 0.0018855999939363544)
	 (28, 'avgspeed', 364) (0.06836867776596853, 0.1931119899051

In [100]:
def get_correlation(store_nbr, year, month = -1):
    correlation = dict()

    units_table = item_nbr_tendency_finely(store_nbr, year, month, graph=False)
    weather_table = weather_tendency(store_nbr, year, month)

    weather = ['snowfall']

    item_nbr = [9]
    
    
    
    inner=dict()
    
    for feature in weather:
        for units in item_nbr:
            
            print(feature, units)
            
            a = weather_table[feature].copy()
            b = units_table.loc[units].copy()

            a = a.apply(lambda x: x.replace(' ', ''))
            a = a.apply(lambda x: x.replace('-', ''))
            
            print(a,b)
            
            
            missing_index = (a.str.contains('M')) | (a.str.contains('T')) 
            nothing_index = a.apply(lambda x: len(x))==0
            
            print(missing_index)
            
            a[missing_index] = np.nan
            b[missing_index] = np.nan

            a[nothing_index] = np.nan
            b[nothing_index] = np.nan
            
            a.dropna(axis=0, inplace=True)
            b.dropna(axis=0, inplace=True)

            
            print(len(a),len(b))
#             print('\t',a,b)
            
            
            inner[(units, feature)] = sp.stats.pearsonr(a.astype(float), b)
        
            correlation[(store_nbr, year,month)] = inner

    return correlation

In [101]:
tmp = get_correlation(1,2012)

snowfall 9
date
2012-01-01    M
2012-01-02    M
2012-01-03    M
2012-01-04    M
2012-01-05    M
2012-01-06    M
2012-01-07    M
2012-01-08    M
2012-01-09    M
2012-01-10    M
2012-01-11    M
2012-01-12    M
2012-01-13    M
2012-01-14    M
2012-01-15    M
2012-01-16    M
2012-01-17    M
2012-01-18    M
2012-01-19    M
2012-01-20    M
2012-01-21    M
2012-01-22    M
2012-01-23    M
2012-01-24    M
2012-01-25    M
2012-01-26    M
2012-01-27    M
2012-01-28    M
2012-01-29    M
2012-01-30    M
             ..
2012-12-01    M
2012-12-02    M
2012-12-03    M
2012-12-04    M
2012-12-05    M
2012-12-06    M
2012-12-07    M
2012-12-08    M
2012-12-09    M
2012-12-10    M
2012-12-11    M
2012-12-12    M
2012-12-13    M
2012-12-14    M
2012-12-15    M
2012-12-16    M
2012-12-17    M
2012-12-18    M
2012-12-19    M
2012-12-20    M
2012-12-21    M
2012-12-22    M
2012-12-23    M
2012-12-24    M
2012-12-26    M
2012-12-27    M
2012-12-28    M
2012-12-29    M
2012-12-30    M
2012-12-31    M
Name: sn

  mx = x.mean()
  ret = ret.dtype.type(ret / rcount)
  my = y.mean()
  r = r_num / r_den


In [77]:
tmp

{(1, 2012, -1): {(9, 'tmin'): (-0.1657085413387121, 0.00151077966541003)}}

In [54]:
tmp[0].loc['2012-01-01']

''

In [32]:
tmp ='234'
tmp.contains('')

AttributeError: 'str' object has no attribute 'contains'

In [55]:
aa = tmp[0]

In [59]:
aa.apply(lambda x: len(x))==0

date
2012-01-01    True
2012-01-02    True
2012-01-03    True
2012-01-04    True
2012-01-05    True
2012-01-06    True
2012-01-07    True
2012-01-08    True
2012-01-09    True
2012-01-10    True
2012-01-11    True
2012-01-12    True
2012-01-13    True
2012-01-14    True
2012-01-15    True
2012-01-16    True
2012-01-17    True
2012-01-18    True
2012-01-19    True
2012-01-20    True
2012-01-21    True
2012-01-22    True
2012-01-23    True
2012-01-24    True
2012-01-25    True
2012-01-26    True
2012-01-27    True
2012-01-28    True
2012-01-29    True
2012-01-30    True
              ... 
2012-12-01    True
2012-12-02    True
2012-12-03    True
2012-12-04    True
2012-12-05    True
2012-12-06    True
2012-12-07    True
2012-12-08    True
2012-12-09    True
2012-12-10    True
2012-12-11    True
2012-12-12    True
2012-12-13    True
2012-12-14    True
2012-12-15    True
2012-12-16    True
2012-12-17    True
2012-12-18    True
2012-12-19    True
2012-12-20    True
2012-12-21    True
2012-12

In [95]:
a = [(3,6), (1,2),(2,1)]
a.sort()
a

[(1, 2), (2, 1), (3, 6)]