In [1]:
%matplotlib inline
import glob
import os
import pandas as pd
import json
import numpy as np
import scipy.stats as ss
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')


# Influence Graph 

Figure below represts the impact graph out-degree and in-degree probability distribution which we will use below for influence metric.

![](image/impact_graph.png)

# Processing CSV from influence graph

In [211]:
all_files = glob.glob(os.path.join(os.getcwd() + "/graphs/", "power*.csv"))
dfs = []
for filename in all_files:
        dfs.append(pd.read_csv(filename))

In [212]:
df = pd.DataFrame(columns=['in','out', 'influence'])
for dt_frame in dfs:
    df = df.merge(dt_frame.loc[:,['in','out','influence']], how = 'outer', on = ['in','out'])

Find all columns with NAN

In [213]:
df.isna().all()

influence_x     True
in             False
out            False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence      False
dtype: bool

Remove the 0 columns where all values are NAN

In [218]:
df = df.iloc[:,1:]

Change NaN values to 0

In [219]:
df = df.fillna(0)


In [220]:
df.head()

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,...,influence_y.4,influence_x.4,influence_y.5,influence_x.5,influence_y.6,influence_x.6,influence_y.7,influence_x.7,influence_y.8,influence
0,2710170995,2502987507,0.360535,0.143726,0.0,0.0,0.200852,0.0,0.0,0.160888,...,0.0,0.0,0.0,0.127478,0.0,0.0,0.0,0.0,0.212275,0.0
1,2710170995,2502987507,0.360535,0.143726,0.0,0.0,0.200852,0.0,0.0,0.160888,...,0.0,0.0,0.0,0.127478,0.0,0.0,0.0,0.0,0.212275,0.0
2,2710170995,5391525088,0.26825,0.081559,0.0,0.0,0.204614,0.0,0.0,0.148925,...,0.0,0.0,0.0,0.100639,0.0,0.0,0.0,0.0,0.155618,0.0
3,2710170995,5391525088,0.26825,0.081559,0.0,0.0,0.204614,0.0,0.0,0.148925,...,0.0,0.0,0.0,0.100639,0.0,0.0,0.0,0.0,0.155618,0.0
4,5391525088,2502987507,0.319732,0.25492,0.421865,0.145326,0.178121,0.781884,0.0,0.14268,...,0.319217,0.238871,0.5,0.113051,0.0,0.237681,0.313837,0.246049,0.0,0.238871


In [222]:
df.drop_duplicates(inplace=True)

Get the lines which have more than 5 non zero values.

In [229]:
sample = df[(df.iloc[:,2:] > 0).sum(axis=1) >2]

In [230]:
sample

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,...,influence_y.4,influence_x.4,influence_y.5,influence_x.5,influence_y.6,influence_x.6,influence_y.7,influence_x.7,influence_y.8,influence
0,2710170995,2502987507,0.360535,0.143726,0.000000,0.000000,0.200852,0.000000,0.0,0.160888,...,0.000000,0.000000,0.000000,0.127478,0.0,0.000000,0.000000,0.000000,0.212275,0.000000
2,2710170995,5391525088,0.268250,0.081559,0.000000,0.000000,0.204614,0.000000,0.0,0.148925,...,0.000000,0.000000,0.000000,0.100639,0.0,0.000000,0.000000,0.000000,0.155618,0.000000
4,5391525088,2502987507,0.319732,0.254920,0.421865,0.145326,0.178121,0.781884,0.0,0.142680,...,0.319217,0.238871,0.500000,0.113051,0.0,0.237681,0.313837,0.246049,0.000000,0.238871
16,5391525088,2502987507,0.319732,0.254920,0.421865,0.145326,0.178121,0.781884,0.0,0.142680,...,0.319217,0.238871,0.500000,0.226101,0.0,0.237681,0.313837,0.246049,0.000000,0.238871
221188,5391525088,2504506706,0.299195,0.338414,0.241845,0.230015,0.557856,0.000000,0.0,0.557856,...,0.000000,0.155130,0.000000,0.557856,0.0,0.000000,0.188465,0.000000,0.000000,0.241845
221189,5391525088,2710170995,0.129209,0.338158,0.000000,0.000000,0.000000,0.000000,0.0,0.332250,...,0.000000,0.000000,0.000000,0.332250,0.0,0.000000,0.000000,0.000000,0.185155,0.000000
221191,226615262,2710170995,0.152075,0.398002,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.329862
221193,226615262,227542955,0.107133,0.098210,0.000000,0.099713,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.082561,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
221194,226615262,167875833,0.280131,0.711418,0.000000,0.000000,0.224878,0.000000,0.0,0.256841,...,0.345280,0.215963,0.000000,0.000000,0.0,0.000000,0.000000,0.267131,0.000000,0.000000
221196,2504506708,4961765347,0.283472,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.331454,0.153702,0.000000,0.000000,0.0,0.000000,0.200354,0.000000,0.000000,0.000000


# Augmented Dickey-Fuller test
We interpret this result using the p-value from the test. A p-value below a threshold (such as 5% or 1%) suggests we reject the null hypothesis (stationary), otherwise a p-value above the threshold suggests we fail to reject the null hypothesis (non-stationary).

p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

The code below select the stationary ones

In [231]:
sample[sample.iloc[:,2:].apply(lambda x: adfuller(x.values)[1], axis=1) <= 0.05]

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,...,influence_y.4,influence_x.4,influence_y.5,influence_x.5,influence_y.6,influence_x.6,influence_y.7,influence_x.7,influence_y.8,influence
0,2710170995,2502987507,0.360535,0.143726,0.000000,0.000000,0.200852,0.000000,0.000000,0.160888,...,0.000000,0.000000,0.000000,0.127478,0.0,0.000000,0.000000,0.000000,0.212275,0.000000
2,2710170995,5391525088,0.268250,0.081559,0.000000,0.000000,0.204614,0.000000,0.000000,0.148925,...,0.000000,0.000000,0.000000,0.100639,0.0,0.000000,0.000000,0.000000,0.155618,0.000000
221188,5391525088,2504506706,0.299195,0.338414,0.241845,0.230015,0.557856,0.000000,0.000000,0.557856,...,0.000000,0.155130,0.000000,0.557856,0.0,0.000000,0.188465,0.000000,0.000000,0.241845
221189,5391525088,2710170995,0.129209,0.338158,0.000000,0.000000,0.000000,0.000000,0.000000,0.332250,...,0.000000,0.000000,0.000000,0.332250,0.0,0.000000,0.000000,0.000000,0.185155,0.000000
221191,226615262,2710170995,0.152075,0.398002,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.329862
221193,226615262,227542955,0.107133,0.098210,0.000000,0.099713,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.082561,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
221194,226615262,167875833,0.280131,0.711418,0.000000,0.000000,0.224878,0.000000,0.000000,0.256841,...,0.345280,0.215963,0.000000,0.000000,0.0,0.000000,0.000000,0.267131,0.000000,0.000000
221196,2504506708,4961765347,0.283472,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.331454,0.153702,0.000000,0.000000,0.0,0.000000,0.200354,0.000000,0.000000,0.000000
221197,4961765347,5391525088,0.009073,0.002759,0.004883,0.003030,0.006921,0.000000,0.000000,0.005037,...,0.000000,0.006636,0.000000,0.003404,0.0,0.004025,0.003111,0.000000,0.000000,0.008405
221200,4961765347,167875833,0.011115,0.014113,0.015431,0.012264,0.008922,0.000000,0.000000,0.010190,...,0.013699,0.017137,0.045729,0.008798,0.0,0.023922,0.015020,0.010599,0.034201,0.019341


# Granger Causality Test
The Null hypothesis for grangercausalitytests is that the time series in the second column, x2, does NOT Granger cause the time series in the first column, x1. Grange causality means that past values of x2 have a statistically significant effect on the current value of x1, taking past values of x1 into account as regressors. We reject the null hypothesis that x2 does not Granger cause x1 if the pvalues are below a desired size of the test.

In [232]:

def granger_total(df_g, t_slice, x, y):
    x_tmp = df_g[(df_g.poi_id==x) ].groupby(pd.Grouper(freq='W'))['hour'].count()
    y_tmp = df_g[(df_g.poi_id==y) ].groupby(pd.Grouper(freq='W'))['hour'].count()
    x1 = np.nan_to_num(ss.zscore(x_tmp[t_slice].fillna(0)))
    y1 = np.nan_to_num(ss.zscore(y_tmp[t_slice].fillna(0)))
    tuples = list(zip(x1,y1))
    gc = grangercausalitytests(tuples, 2, addconst=True, verbose=False)
    return gc[1][0]['params_ftest'][1]

In [233]:
df_g = pd.read_pickle('poi_tw_pgh.pkl')
group = df_g.groupby(['user']).count()
df_g = df_g[df_g.user.isin(group[(group.poi_id>6)][3:].index)]
w = df_g.groupby(pd.Grouper(freq='W'))['hour'].count().index

## Removing irrelevants
Removing irrelevant POI using a Poisson distribution with mean two time the number os weeks. In a conservative way, we will consider POI that in average had at least two visitors per week.

In [234]:
(df_g.index.max() - df_g.index.min()).days / 7

24.857142857142858

In [235]:
pt = ss.poisson(11)
pt.cdf(6)

0.07861437209313321

In [236]:
pt.cdf(5)

0.037519814101927236

## Processing granger for number of posts per POI

In [237]:
sample['p-value'] = sample.apply(lambda row: granger_total(df_g, w, row['in'], row['out']), axis=1)


In [238]:
sample.loc[sample['p-value'] <= 0.05, ['in','out','p-value']]


Unnamed: 0,in,out,p-value
221188,5391525088,2504506706,0.001222
230416,2502987507,2710170995,0.028761
241077,167875833,1371572669,0.014472
241411,2502987507,261990580,0.00382
241412,2502987507,2504506706,0.006957
241413,5391525088,261990580,0.000848
241420,535066420,227542955,0.025835
241465,166903180,4961765347,0.001304


## Processing granger for number of unique users per POI

In [245]:

def granger_unique(df_g, t_slice, x, y):
    x_tmp = df_g[(df_g.poi_id==x) ].groupby(pd.Grouper(freq='W'))['user'].nunique()
    y_tmp = df_g[(df_g.poi_id==y) ].groupby(pd.Grouper(freq='W'))['user'].nunique()
    x1 = np.nan_to_num(ss.zscore(x_tmp[t_slice].fillna(0)))
    y1 = np.nan_to_num(ss.zscore(y_tmp[t_slice].fillna(0)))
    tuples = list(zip(x1,y1))
    gc = grangercausalitytests(tuples, 2, addconst=True, verbose=False)
    return gc[1][0]['params_ftest'][1]

In [246]:
sample['p-value'] = sample.apply(lambda row: granger_unique(df_g, w, row['in'], row['out']), axis=1)

ValueError: ('wrong shape for coefs', 'occurred at index 221188')

In [62]:
sample.loc[sample['p-value'] <= 0.05, ['in','out','p-value']]

Unnamed: 0,in,out,p-value
