In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def preprocessing(df):
    """Turn a raw three-column request log into a time-indexed frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table with exactly three columns, in this order: record id,
        flavor label, timestamp. The flavor label must be a string whose
        text after the first ``'r'`` is an integer (presumably labels such
        as ``'flavor15'`` -- confirm against the data files).

    Returns
    -------
    pandas.DataFrame
        Columns ``'flover'`` (int), ``'time'`` (datetime64) and ``'date'``
        (``datetime.date`` objects), indexed by the ``'time'`` values.

    Raises
    ------
    ValueError
        If ``df`` does not have exactly three columns, or a flavor label
        has no integer after its first ``'r'``.
    """
    # Work on a copy: the caller's frame keeps its original columns, so the
    # function can safely be called again on the same input.
    df = df.copy()
    df.columns = ['uuid', 'flover_raw', 'time']
    df['time'] = pd.to_datetime(df['time'])
    # Everything after the first 'r' is the numeric flavor id.
    df['flover'] = df['flover_raw'].apply(lambda x: int(x[x.find('r') + 1:]))
    # `pd.datetime` no longer exists in pandas; the `.dt` accessor yields
    # the same `datetime.date` objects without a Python-level lambda.
    df['date'] = df['time'].dt.date
    ndf = df[['flover', 'time', 'date']].copy()
    ndf.index = df['time']
    return ndf

In [3]:
# Raw log files feeding each split, in the order they are concatenated later.
list_of_files_train = [
    'data_2015_1.txt',
    'data_2015_2.txt',
    'data_2015_3.txt',
    'data_2015_4.txt',
    'data_2015_5.txt',
]
list_of_files_validation = [
    'data_2015_12.txt',
    'data_2016_1.txt',
]

# One DataFrame per file; the files carry no header row.
lists_of_dataframe_train = [
    pd.read_table(path, header=None) for path in list_of_files_train
]
lists_of_dataframe_validation = [
    pd.read_table(path, header=None) for path in list_of_files_validation
]

In [4]:
# Stack the per-file frames of each split into a single frame with fresh
# 0..n-1 row labels, then normalise it with preprocessing().
df_train = preprocessing(
    pd.concat(lists_of_dataframe_train, axis=0, ignore_index=True)
)
df_validation = preprocessing(
    pd.concat(lists_of_dataframe_validation, axis=0, ignore_index=True)
)

In [5]:
def transform(df):
    """Build a day-by-flavor table of request counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'flover'`` column (integer flavor ids) and a
        ``'date'`` column (``datetime.date`` or datetime-like values), one
        row per request. Rows may be in any order.

    Returns
    -------
    pandas.DataFrame
        Float counts with one row per calendar day from the earliest to the
        latest date in ``df`` (days without requests are all zeros) and one
        column per distinct flavor id, sorted ascending. An empty ``df``
        yields an empty frame.
    """
    # `np.int` was removed from NumPy; the builtin `int` is the same dtype.
    flavors_unique = np.sort(np.asarray(df['flover'].unique(), dtype=int))
    if df.shape[0] == 0:
        return pd.DataFrame(index=pd.DatetimeIndex([]),
                            columns=pd.Index(flavors_unique), dtype=float)

    # Convert to real datetimes so the counts line up with the DatetimeIndex
    # of the output; `datetime.date` labels do not match Timestamps on
    # current pandas, which would silently turn every count into zero.
    days = pd.to_datetime(df['date']).dt.normalize()
    events = pd.DataFrame({
        'day': days.to_numpy(),
        'flover': df['flover'].to_numpy(dtype=int),
    })
    counts = events.groupby(['day', 'flover']).size().unstack(fill_value=0)

    # min/max instead of first/last row: no assumption that rows are sorted,
    # and no label-vs-position ambiguity on the time index.
    full_range = pd.date_range(days.min(), days.max())
    observation = counts.reindex(index=full_range,
                                 columns=pd.Index(flavors_unique),
                                 fill_value=0)
    # Counts are returned as floats, as callers already expect.
    return observation.astype(float)

In [32]:
# Daily request counts per flavor for each split.
training_data = transform(df_train)
# Give the validation table exactly the training flavors, in training order:
# flavors seen only in validation are dropped, and flavors seen only in
# training become all-zero columns (plain [] selection would raise KeyError).
validation_data = transform(df_validation).reindex(
    columns=training_data.columns, fill_value=0.0
)

In [23]:
# Collapse the per-day counts into weekly totals ('W' bins).
# NOTE(review): this cell's execution count (In [23]) is lower than that of
# the transform cell above (In [32]), and the shapes recorded further down
# ((150, 20) and (62, 20)) may therefore come from un-resampled data --
# re-run the notebook top to bottom to confirm.
training_data = training_data.resample('W').sum()
validation_data = validation_data.resample('W').sum()
# Keep exactly the flavors present in the training data, in the same order;
# a flavor missing from validation becomes a zero column instead of raising.
validation_data = validation_data.reindex(
    columns=training_data.columns, fill_value=0.0
)

In [37]:
# Independent NumPy copies of the two count tables (same row/column layout).
A = np.array(training_data, copy=True)
B = np.array(validation_data, copy=True)

In [38]:
# Sanity check: (rows, columns) of the training matrix.
print(A.shape)

(150, 20)


In [39]:
# Sanity check: (rows, columns) of the validation matrix; the column count
# must equal A's for the vertical stack below to work.
B.shape

(62, 20)

In [41]:
# Training rows followed by validation rows; both are 2-D with equal width.
C = np.concatenate((A, B), axis=0)

In [42]:
# Display the stacked matrix (NumPy abbreviates large arrays with "...").
C

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 2.,  1.,  0., ...,  0.,  0.,  0.],
       [ 3.,  1.,  0., ...,  0.,  0.,  0.],
       [ 3.,  0.,  0., ...,  0.,  0.,  0.]])

In [46]:
# Pairwise Pearson correlation between the columns of C. np.corrcoef treats
# each *row* as a variable by default, hence the transpose. .tolist() turns
# the square matrix into nested Python lists for display.
np.corrcoef(C.T).tolist()

[[1.0,
  0.3864598855912417,
  -0.009544644574521201,
  0.274886324285082,
  0.1978978632724188,
  0.02830907228726231,
  0.12588200781942965,
  0.23625952125201502,
  0.13805464668177433,
  0.15103521290669467,
  0.181011690746071,
  0.32080895989257024,
  0.010326963687308855,
  0.014664592608102756,
  0.11434157725677735,
  0.0851520551784896,
  0.10833250808525338,
  0.10924299096972448,
  0.5238607885469417,
  0.22718731111391788],
 [0.38645988559124167,
  0.9999999999999999,
  0.059362525118935326,
  0.1865750135065568,
  0.16138208059982895,
  0.021291894849950536,
  0.10132564101677756,
  0.29323759035646646,
  0.31903470502413145,
  0.01712156830284112,
  0.16282226300988334,
  0.3970952876060367,
  -0.050003176784202456,
  -0.018088607525179833,
  0.1163802465354962,
  0.161369505127079,
  0.03641942264688228,
  0.09525376879751193,
  0.20100286829743624,
  0.021668866959631505],
 [-0.009544644574521201,
  0.059362525118935326,
  1.0,
  -0.021889581289532133,
  -0.05996440404