In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def preprocessing(df):
    """Parse a raw request log into a time-indexed frame of flavor requests.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw log with exactly three columns, in this order: a record id, a
        flavor label and a timestamp. Existing column labels are ignored.
        The frame is not modified; the work is done on a copy.

    Returns
    -------
    pandas.DataFrame
        Columns ``flover`` (int), ``time`` (datetime64) and ``date``
        (``datetime.date``), indexed by ``time``.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly three columns, or a flavor label has
        no integer after its first ``'r'``.
    """
    # Work on a copy so the caller's frame keeps its original columns.
    log = df.copy()
    log.columns = ['uuid', 'flover_raw', 'time']
    log['time'] = pd.to_datetime(log['time'])
    # The flavor number is whatever follows the first 'r' in the label.
    # When there is no 'r', find() returns -1 and the whole label is parsed.
    log['flover'] = log['flover_raw'].apply(lambda x: int(x[x.find('r') + 1:]))
    # `.dt.date` yields datetime.date objects; `pd.datetime` no longer exists
    # in current pandas releases.
    log['date'] = log['time'].dt.date
    ndf = log[['flover', 'time', 'date']].copy()
    ndf.index = log['time']
    return ndf

In [3]:
# Request-log files for each split; paths are relative to the working directory.
list_of_files_train = [
    'data_2015_1.txt',
    'data_2015_2.txt',
    'data_2015_3.txt',
    'data_2015_4.txt',
    'data_2015_5.txt',
]
list_of_files_validation = ['data_2015_12.txt', 'data_2016_1.txt']

# The files are read with header=None, so columns are numbered 0..n-1.
lists_of_dataframe_train = [
    pd.read_table(path, header=None) for path in list_of_files_train
]
lists_of_dataframe_validation = [
    pd.read_table(path, header=None) for path in list_of_files_validation
]

In [4]:
# Stack the per-file frames of each split into one log (fresh 0..n-1 row
# labels), then parse it with preprocessing().
df_train = preprocessing(
    pd.concat(lists_of_dataframe_train, axis=0, ignore_index=True)
)
df_validation = preprocessing(
    pd.concat(lists_of_dataframe_validation, axis=0, ignore_index=True)
)

In [5]:
def transform(df):
    """Count requests per flavor for every calendar day covered by ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``flover`` column (integer flavor ids) and a ``date``
        column (``datetime.date`` values). The first and last rows define the
        covered date span, so rows are expected in chronological order.

    Returns
    -------
    pandas.DataFrame
        One row per day from the first to the last row's date (daily
        DatetimeIndex), one float column per distinct flavor in ascending
        order. Each cell is the number of rows for that flavor on that day;
        days without requests hold 0.
    """
    # Builtin int: the `np.int` alias has been removed from NumPy.
    flavors_unique = np.sort(np.array(df['flover'].unique(), dtype=int))

    # Positional access: the frame's index is not 0..n-1, so `[0]` would be a
    # label lookup that only works through a deprecated fallback.
    start_date = df['date'].iloc[0]
    end_date = df['date'].iloc[-1]
    calendar = pd.date_range(start_date, end_date)

    # Convert the date objects to midnight timestamps once, so the per-day
    # counts share the calendar's index type and align exactly.
    days = pd.to_datetime(df['date']).to_numpy()
    flavors = df['flover'].to_numpy()

    observation = pd.DataFrame(0.0, index=calendar, columns=flavors_unique)
    for flavor in flavors_unique:
        per_day = pd.Series(days[flavors == flavor]).value_counts()
        # Days outside the calendar are dropped, missing days become 0.
        observation[flavor] = per_day.reindex(calendar, fill_value=0).astype(float)
    return observation

In [6]:
# Daily per-flavor request counts for the training split.
training_data = transform(df_train)
# Same for validation, restricted to the training flavors in training order.
validation_data = transform(df_validation).loc[:, training_data.columns]

In [7]:
# Roll the daily counts up into weekly totals ('W' resampling bins).
training_data, validation_data = (
    frame.resample('W').sum() for frame in (training_data, validation_data)
)
# Keep only the flavor columns that also appear in the training data.
validation_data = validation_data[training_data.columns]

In [32]:
# Plain NumPy copies of the weekly tables: rows are weeks, columns are flavors.
A, B = np.array(training_data), np.array(validation_data)

In [38]:
# Pairwise Pearson correlation between the columns of A. np.corrcoef treats
# each row as one variable, so the matrix is transposed first.
np.corrcoef(np.transpose(A))

array([[ 1.        ,  0.33129769,  0.33016814,  0.19155354,  0.70891461,
         0.61670384,  0.34175466,  0.32564942,  0.25128425,  0.28414469,
         0.29137978,  0.19281501,  0.47742019, -0.00998223, -0.25540678,
         0.11576546, -0.14670447, -0.17580539,  0.19277428, -0.19155354],
       [ 0.33129769,  1.        ,  0.02194666, -0.01199397,  0.41295423,
         0.29361189,  0.41168127,  0.31867436,  0.47661378, -0.04737817,
         0.04200767, -0.16509541, -0.12050601,  0.07653419, -0.15537925,
         0.06948808, -0.13997382, -0.20112201,  0.05060574, -0.15763504],
       [ 0.33016814,  0.02194666,  1.        ,  0.06494688,  0.28533621,
         0.3541134 , -0.03363254,  0.13021838,  0.10109506,  0.10126093,
         0.09250055,  0.1803598 ,  0.17755128,  0.03326699, -0.11703461,
        -0.08622573,  0.0480586 , -0.04910451, -0.15753306,  0.02823777],
       [ 0.19155354, -0.01199397,  0.06494688,  1.        ,  0.097012  ,
         0.23181779, -0.28008176,  0.24359054, -

In [33]:
from sklearn.preprocessing import normalize

In [34]:
# Rescale every column of A (axis=0) to unit L2 norm.
A_new = normalize(A, norm='l2', axis=0)

In [35]:
# Column-wise correlation of the normalized matrix. Pearson correlation does
# not change when a column is multiplied by a positive constant, so this is
# expected to match the matrix computed from the unnormalized A.
np.corrcoef(np.transpose(A_new))

array([[ 1.        ,  0.33129769,  0.33016814,  0.19155354,  0.70891461,
         0.61670384,  0.34175466,  0.32564942,  0.25128425,  0.28414469,
         0.29137978,  0.19281501,  0.47742019, -0.00998223, -0.25540678,
         0.11576546, -0.14670447, -0.17580539,  0.19277428, -0.19155354],
       [ 0.33129769,  1.        ,  0.02194666, -0.01199397,  0.41295423,
         0.29361189,  0.41168127,  0.31867436,  0.47661378, -0.04737817,
         0.04200767, -0.16509541, -0.12050601,  0.07653419, -0.15537925,
         0.06948808, -0.13997382, -0.20112201,  0.05060574, -0.15763504],
       [ 0.33016814,  0.02194666,  1.        ,  0.06494688,  0.28533621,
         0.3541134 , -0.03363254,  0.13021838,  0.10109506,  0.10126093,
         0.09250055,  0.1803598 ,  0.17755128,  0.03326699, -0.11703461,
        -0.08622573,  0.0480586 , -0.04910451, -0.15753306,  0.02823777],
       [ 0.19155354, -0.01199397,  0.06494688,  1.        ,  0.097012  ,
         0.23181779, -0.28008176,  0.24359054, -

In [36]:
# Rescale every column of A (axis=0) to unit L1 norm; this overwrites the
# earlier L2-normalized A_new.
A_new = normalize(A, norm='l1', axis=0)

In [37]:
# Column-wise correlation after L1 scaling. As with any positive per-column
# rescaling, the Pearson correlations are expected to be unchanged.
np.corrcoef(np.transpose(A_new))

array([[ 1.        ,  0.33129769,  0.33016814,  0.19155354,  0.70891461,
         0.61670384,  0.34175466,  0.32564942,  0.25128425,  0.28414469,
         0.29137978,  0.19281501,  0.47742019, -0.00998223, -0.25540678,
         0.11576546, -0.14670447, -0.17580539,  0.19277428, -0.19155354],
       [ 0.33129769,  1.        ,  0.02194666, -0.01199397,  0.41295423,
         0.29361189,  0.41168127,  0.31867436,  0.47661378, -0.04737817,
         0.04200767, -0.16509541, -0.12050601,  0.07653419, -0.15537925,
         0.06948808, -0.13997382, -0.20112201,  0.05060574, -0.15763504],
       [ 0.33016814,  0.02194666,  1.        ,  0.06494688,  0.28533621,
         0.3541134 , -0.03363254,  0.13021838,  0.10109506,  0.10126093,
         0.09250055,  0.1803598 ,  0.17755128,  0.03326699, -0.11703461,
        -0.08622573,  0.0480586 , -0.04910451, -0.15753306,  0.02823777],
       [ 0.19155354, -0.01199397,  0.06494688,  1.        ,  0.097012  ,
         0.23181779, -0.28008176,  0.24359054, -