In [2]:
%matplotlib inline
import glob
import os
import pandas as pd
import json
import numpy as np
import scipy.stats as ss
from statsmodels.tsa.stattools import grangercausalitytests, adfuller
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')


# Influence Graph 

The figure below represents the out-degree and in-degree probability distributions of the impact graph, which we use below for the influence metric.

![](image/impact_graph.png)

# Processing CSV from influence graph

In [7]:
# Hour of day whose influence graphs are analysed; also reused by the Granger cells further down.
hour = 18
# Collect every per-graph CSV for that hour. glob returns paths in arbitrary
# order, and the order of the files becomes the order of the influence columns
# once the frames are merged, so sort the paths to make the result reproducible.
all_files = sorted(glob.glob(os.path.join(os.getcwd(), "graphs", "power*%sH.csv" % hour)))
dfs = [pd.read_csv(filename) for filename in all_files]

In [8]:
# Start from an empty frame and outer-join the influence values of every graph
# on the ('in', 'out') pair, one merge per loaded CSV.
# NOTE(review): each merge reuses the default _x/_y suffixes, so the result
# carries repeated column names and one all-NaN column from the empty seed
# frame; later cells address the columns by position.
merge_keys = ['in', 'out']
df = pd.DataFrame(columns=merge_keys + ['influence'])
for dt_frame in dfs:
    influence_part = dt_frame[merge_keys + ['influence']]
    df = pd.merge(df, influence_part, how='outer', on=merge_keys)

Find the columns in which every value is NaN

In [9]:
df.isna().all()

influence_x     True
in             False
out            False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
influence_x    False
influence_y    False
dtype: bool

Remove the column where all values are NaN (the first column, index 0)

In [10]:
# Drop every column whose values are all NaN (the placeholder 'influence'
# column left over from the empty seed frame). Selecting by content instead of
# by position keeps this cell idempotent: re-running it cannot drop 'in'.
# A boolean array is used because the frame has repeated column names.
df = df.loc[:, df.notna().any().values]

Change NaN values to 0

In [11]:
df = df.fillna(0)


In [12]:
df.head()

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,influence_y.4,influence_x.4,influence_y.5
0,1236745500,5389246700,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4039511540,808884817,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,111586252,3202476750,0.3672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,111586252,298086258,0.245238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,111586252,1393986237,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Get the rows that have more than 2 non-zero (positive) influence values.

In [13]:
# Keep the rows with more than 2 strictly positive influence values (the
# columns after 'in' and 'out'). The explicit copy makes `sample` an
# independent frame, so adding a column to it later does not write to a
# slice of `df` (SettingWithCopyWarning).
sample = df[(df.iloc[:, 2:] > 0).sum(axis=1) > 2].copy()

In [14]:
sample

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,influence_y.4,influence_x.4,influence_y.5
100,111586252,4039511540,0.0,0.0,0.0,0.0,0.319487,0.118272,0.190113,0.0,0.098551,0.080409,0.100484


# Augmented Dickey-Fuller test
The null hypothesis (H0) of the test is that the series has a unit root, i.e. it is non-stationary. We interpret the result using the p-value from the test: a p-value below a threshold (such as 5% or 1%) suggests we reject H0 (the series is stationary), whereas a p-value above the threshold means we fail to reject H0 (the series is non-stationary).

p-value > 0.05: Fail to reject the null hypothesis (H0), the data has a unit root and is non-stationary.
p-value <= 0.05: Reject the null hypothesis (H0), the data does not have a unit root and is stationary.

The code below selects the stationary series.

In [15]:
# ADF p-value of each row's influence series (every column after 'in'/'out');
# rows at or below the 5% level are displayed as stationary.
adf_pvalues = sample.iloc[:, 2:].apply(lambda series: adfuller(series.values)[1], axis=1)
sample[adf_pvalues <= 0.05]

Unnamed: 0,in,out,influence_y,influence_x,influence_y.1,influence_x.1,influence_y.2,influence_x.2,influence_y.3,influence_x.3,influence_y.4,influence_x.4,influence_y.5
100,111586252,4039511540,0.0,0.0,0.0,0.0,0.319487,0.118272,0.190113,0.0,0.098551,0.080409,0.100484


# Granger Causality Test
The null hypothesis for `grangercausalitytests` is that the time series in the second column, x2, does NOT Granger-cause the time series in the first column, x1. Granger causality means that past values of x2 have a statistically significant effect on the current value of x1, taking past values of x1 into account as regressors. We reject the null hypothesis that x2 does not Granger-cause x1 if the p-values are below the desired size (significance level) of the test.

In [16]:

def granger_total(df_g, t_slice, x, y, hour):
    """Granger-causality p-value between the weekly post counts of two POIs.

    Parameters
    ----------
    df_g : DataFrame with `poi_id` and `hour` columns. It is grouped with
        `pd.Grouper(freq='W')` on its index, so the index must be datetime-like.
    t_slice : index of week labels on which both series are aligned.
    x, y : POI ids. The series of `x` is the first column handed to the test
        and the series of `y` the second one, so the null hypothesis is that
        `y` does not Granger-cause `x`.
    hour : only rows whose `hour` column equals this value are counted.

    Returns
    -------
    float
        p-value of the parameter F-test at lag 1. The test is run up to lag 2,
        but only the lag-1 result is returned.
    """
    def weekly_posts(poi):
        rows = df_g[(df_g.poi_id == poi) & (df_g.hour == hour)]
        counts = rows.groupby(pd.Grouper(freq='W'))['hour'].count()
        # reindex instead of counts[t_slice]: weeks outside this POI's own date
        # range are missing labels, and label-indexing with missing labels
        # raises KeyError on pandas >= 1.0. Missing weeks count as 0 posts.
        return counts.reindex(t_slice).fillna(0).values

    # A constant series has zero variance, so zscore yields NaN; map those to 0.
    x1 = np.nan_to_num(ss.zscore(weekly_posts(x)))
    y1 = np.nan_to_num(ss.zscore(weekly_posts(y)))
    series = np.column_stack([x1, y1])
    gc = grangercausalitytests(series, 2, addconst=True, verbose=False)
    return gc[1][0]['params_ftest'][1]

In [17]:
# NOTE(review): unpickling can execute arbitrary code; only load 'poi_tw.pkl' from a trusted source.
df_g = pd.read_pickle('poi_tw.pkl')
# Labels of the weekly bins over the whole dataset; passed to the Granger helpers as the common time axis.
# pd.Grouper(freq='W') without a key groups on the frame's index, so df_g is presumably indexed by timestamp — TODO confirm.
w = df_g.groupby(pd.Grouper(freq='W'))['hour'].count().index

## Removing irrelevants
Remove irrelevant POIs using a Poisson distribution with mean two times the number of weeks. To be conservative, we only consider POIs that had, on average, at least two visitors per week.

In [18]:
(df_g.index.max() - df_g.index.min()).days / 7

10.142857142857142

In [19]:
pt = ss.poisson(11)
pt.cdf(6)

0.07861437209313321

In [20]:
pt.cdf(5)

0.037519814101927236

## Processing granger for number of posts per POI

In [22]:
sample['p-value'] = sample.apply(lambda row: granger_total(df_g, w, row['in'], row['out'], hour), axis=1)



In [23]:
sample.loc[sample['p-value'] <= 0.05, ['in','out','p-value']]


Unnamed: 0,in,out,p-value


## Processing granger for number of unique users per POI

In [24]:

def granger_unique(df_g, t_slice, x, y, hour):
    """Granger-causality p-value between the weekly unique-user counts of two POIs.

    Parameters
    ----------
    df_g : DataFrame with `poi_id`, `hour` and `user` columns. It is grouped
        with `pd.Grouper(freq='W')` on its index, so the index must be
        datetime-like.
    t_slice : index of week labels on which both series are aligned.
    x, y : POI ids. The series of `x` is the first column handed to the test
        and the series of `y` the second one, so the null hypothesis is that
        `y` does not Granger-cause `x`.
    hour : only rows whose `hour` column equals this value are considered.

    Returns
    -------
    float
        p-value of the parameter F-test at lag 1. The test is run up to lag 2,
        but only the lag-1 result is returned.
    """
    def weekly_users(poi):
        rows = df_g[(df_g.poi_id == poi) & (df_g.hour == hour)]
        uniques = rows.groupby(pd.Grouper(freq='W')).user.nunique()
        # reindex instead of uniques[t_slice]: weeks outside this POI's own
        # date range are missing labels, and label-indexing with missing labels
        # raises KeyError on pandas >= 1.0. Missing weeks count as 0 users.
        return uniques.reindex(t_slice).fillna(0).values

    # A constant series has zero variance, so zscore yields NaN; map those to 0.
    x1 = np.nan_to_num(ss.zscore(weekly_users(x)))
    y1 = np.nan_to_num(ss.zscore(weekly_users(y)))
    series = np.column_stack([x1, y1])
    gc = grangercausalitytests(series, 2, addconst=True, verbose=False)
    return gc[1][0]['params_ftest'][1]

In [25]:
sample['p-value'] = sample.apply(lambda row: granger_unique(df_g, w, row['in'], row['out'], hour), axis=1)

In [26]:
sample.loc[sample['p-value'] <= 0.05, ['in','out','p-value']]

Unnamed: 0,in,out,p-value
