# Связь в линейной регрессии

In [None]:
import numpy as np
import networkx as nx
import matplotlib.pylab as plt
%matplotlib inline

Смоделируем зависимости, соответствующие типу Fork. Заметим, что здесь есть только одна экзогенная переменная (Z), то есть переменная, которая не зависит ни от какой другой переменной в системе (не зависит от эндогенных переменных X и Y).
```
Z->X, Z->Y, X->Y
```

In [None]:
# Simulate the system Z -> X, Z -> Y, X -> Y.
# A dedicated, seeded RandomState makes the draws reproducible.
rs = np.random.RandomState(42)
N = 10000  # sample size; also used by the later collider simulation
# Every draw is sized by N, so the sample size is changed in exactly one place.
Z = rs.randn(N)                    # exogenous variable
X = 0.5*Z + rs.randn(N)            # X is driven by Z plus independent noise
Y = 0.3*Z + 0.4*X + rs.randn(N)    # Y is driven by Z and X plus independent noise

Сделайте две регрессии Y ~ X, Y ~ Z + X. Что можно сказать о 95% доверительном интервале (confint) на коэффициент регрессии перед X в двух этих моделях?

In [None]:
from statsmodels.regression.linear_model import OLS
# Regress Y on X and Z together. The design matrix holds only these two
# columns; no intercept column is added.
design_xz = np.array([X, Z]).T   # one column per regressor: X first, then Z
l1 = OLS(Y, design_xz).fit()
l1.summary()

In [None]:
from statsmodels.regression.linear_model import OLS
# Regress Y on X alone (Z is left out); no intercept column is added.
# NOTE: this rebinds l1, so the previous fit is no longer reachable by name.
design_x = np.atleast_2d(X).T   # X as a single-column design matrix
l1 = OLS(Y, design_x).fit()
l1.summary()

Рассмотрим collider:
```
X -> Z
X -> Y
Y -> Z
```

In [None]:
# Simulate the collider structure X -> Y, X -> Z, Y -> Z.
# Draw order from rs matters for reproducibility: X first, then the noise
# term of Y, then the noise term of Z.
X = rs.randn(N)
noise_y = rs.randn(N)
Y = 0.7*X + noise_y
noise_z = rs.randn(N)
Z = 1.2*X + 0.6*Y + noise_z

In [None]:
from statsmodels.regression.linear_model import OLS
# Regress Y on X and Z together, i.e. with Z included among the regressors.
# No intercept column is added to the design matrix.
design_xz = np.array([X, Z]).T   # one column per regressor: X first, then Z
l1 = OLS(Y, design_xz).fit()
l1.summary()

In [None]:
from statsmodels.regression.linear_model import OLS
# Regress Y on X alone, leaving Z out; no intercept column is added.
# NOTE: this rebinds l1, so the previous fit is no longer reachable by name.
design_x = np.atleast_2d(X).T   # X as a single-column design matrix
l1 = OLS(Y, design_x).fit()
l1.summary()

# DAG’s

In [None]:
from causalgraphicalmodels import CausalGraphicalModel

In [None]:
# Five-node DAG used by the path and adjustment-set queries that follow.
dag_nodes = ['X', 'Y', 'Z', 'W', 'T']
dag_edges = [
    ('X', 'Y'),
    ('Y', 'Z'),
    ('Z', 'T'),
    ('X', 'W'),
    ('W', 'Y'),
    ('Y', 'T'),
    ('W', 'T'),
]
G = CausalGraphicalModel(nodes=dag_nodes, edges=dag_edges)
G.draw()

Пути из X в T

In [None]:
# Every simple path from X to T that follows the edge directions of the DAG.
[path for path in nx.all_simple_paths(G.dag, source='X', target='T')]

In [None]:
# Same query on the undirected version of the graph: edge direction is ignored.
skeleton = G.dag.to_undirected()
[path for path in nx.all_simple_paths(skeleton, source='X', target='T')]

In [None]:
# Display all independence relationships the model reports for G.
G.get_all_independence_relationships()

In [None]:
# Display the result of get_distribution() for G.
# NOTE(review): presumably the factorisation of the joint distribution implied
# by the DAG — confirm against the causalgraphicalmodels documentation.
G.get_distribution()

In [None]:
# Apply do('W') to the model and draw the graph it returns.
G_do_w = G.do('W')
G_do_w.draw()

In [None]:
# Draw G again: the result of do('W') was never assigned back to G, so this
# is still the original five-node model.
G.draw()

In [None]:
# Display all backdoor adjustment sets for the pair ('Y', 'X').
# NOTE(review): the graph contains the edge X -> Y, while the arguments are
# passed as ('Y', 'X') — confirm this order matches the intended
# treatment/outcome roles.
G.get_all_backdoor_adjustment_sets('Y','X')

In [None]:
# Display all frontdoor adjustment sets for the pair ('Y', 'X').
# NOTE(review): the graph contains the edge X -> Y, while the arguments are
# passed as ('Y', 'X') — confirm this order matches the intended
# treatment/outcome roles.
G.get_all_frontdoor_adjustment_sets('Y','X')

# Inductive search

Допустим, у нас есть несколько событий:

1. Продажа мороженого (ICE)
2. Количество преступлений (CRIMES)
3. Количество полицейских на тысячу человек (POLICE)
4. Средняя температура (TEMP)
5. Количество скачиваний браузера IE (IE)
6. Количество зараженных компьютеров (COMP)
7. Уровень загрязнения воздуха (AIR)

In [None]:
# Causal graph over the seven listed variables.
# NOTE: this rebinds G, replacing the five-node DAG defined earlier.
event_nodes = ['ICE', 'CRIMES', 'POLICE', 'TEMP', 'IE', 'COMP', 'AIR']
event_edges = [
    ('TEMP', 'ICE'),
    ('TEMP', 'CRIMES'),
    ('IE', 'COMP'),
    ('POLICE', 'CRIMES'),
]
G = CausalGraphicalModel(nodes=event_nodes, edges=event_edges)
G.draw()

In [None]:
from causality.inference.search import IC
from causality.inference.independence_tests import MutualInformationTest
import pandas as pd
import seaborn as sns


In [None]:
# Synthetic observations for the seven variables, 1000 rows each.
# The order of the rs.uniform calls (police, temp, air, ie) is part of the
# result: changing it would change which random numbers each variable gets.
n_obs = 1000

# Independent inputs.
police = rs.uniform(low=1, high=100, size=n_obs)
temp = rs.uniform(low=-40, high=40, size=n_obs)
air = rs.uniform(size=n_obs)

# Derived variables: crimes depends on temp and police, ice on temp only.
crimes = (temp + 40) / police
ice = temp + 40

# ie is an independent input; comp is a deterministic function of ie.
ie = rs.uniform(size=n_obs)
comp = ie**2

dataframe = pd.DataFrame({
    'police': police,
    'temp': temp,
    'air': air,
    'crimes': crimes,
    'ice': ice,
    'ie': ie,
    'comp': comp,
})
sns.pairplot(dataframe)

In [None]:
import statsmodels.api as sm
class OLS_test():
    """Conditional-independence test based on an ordinary least squares fit.

    The first variable in ``y`` is regressed on all variables in ``x`` and
    ``z``. ``y`` is then declared independent of ``x`` given ``z`` when an
    F-test of "all coefficients of the ``x`` variables are zero" is not
    rejected at level ``alpha``.

    Parameters
    ----------
    y : sequence of str
        Response variable names; only ``y[0]`` is used.
    x : sequence of str
        Names of the variables whose coefficients are tested.
    z : sequence of str
        Names of the conditioning variables added to the regression.
    data : object accepted by ``sm.OLS.from_formula``
        The data set the regression is fitted on.
    alpha : float
        Threshold the F-test p-value is compared against.
    """
    def __init__(self, y, x, z, data, alpha):
        regressors = list(x) + list(z)
        formula = '{0}~{1}'.format(y[0], u'+'.join(regressors))
        # Fit on the data handed in by the caller rather than on whatever
        # happens to be bound to a notebook-level variable at call time.
        self.regression = sm.OLS.from_formula(formula, data)
        self.result = self.regression.fit()
        self.x = x
        self.y = y
        self.z = z
        self.alpha = alpha

    def independent(self):
        """Return whether the F-test p-value is greater than ``alpha``."""
        # One "name=0" constraint per tested variable; multiple constraints
        # must be comma-separated to form a valid joint hypothesis string.
        to_fisher = u', '.join([x_ + '=0' for x_ in self.x])
        return self.result.f_test(to_fisher).pvalue > self.alpha

In [None]:
# Type code for every dataframe column, passed to the IC search.
# NOTE(review): police, temp, ice and crimes are built from uniform float
# draws but are tagged 'd' here; presumably 'c'/'d' stand for
# continuous/discrete — confirm against the causality documentation.
# OLS_test does not take these codes as an argument.
variable_types = dict(
    police='d',
    temp='d',
    air='c',
    ice='d',
    crimes='d',
    ie='c',
    comp='c',
)

# Run the IC search with OLS_test as the independence test at level 0.05.
ic_algorithm = IC(OLS_test, alpha=0.05)
graph = ic_algorithm.search(dataframe, variable_types)

In [None]:
# Draw the graph returned by the IC search, with node labels.
nx.draw_networkx(graph)

In [None]:
# Print each edge of the discovered graph together with its attribute dict.
for edge in graph.edges(data=True):
    print(edge)

# Задача про курение

In [None]:
# Observed counts for every (smoke, tar, cancer) combination.
# The order of the entries fixes the row order of the resulting dataframe.
strata = [
    # smoke, tar, cancer, count
    (1, 1, 0, 323),
    (0, 1, 0, 1),
    (1, 1, 1, 57),
    (0, 1, 1, 19),
    (1, 0, 0, 18),
    (0, 0, 0, 38),
    (1, 0, 1, 2),
    (0, 0, 1, 342),
]

# Expand the counts into one 0/1 entry per individual.
smoke, tar, cancer = [], [], []
for smoke_flag, tar_flag, cancer_flag, count in strata:
    smoke.extend([smoke_flag]*count)
    tar.extend([tar_flag]*count)
    cancer.extend([cancer_flag]*count)

# NOTE: this rebinds `dataframe`, replacing the simulated data set built earlier.
dataframe = pd.DataFrame({'smoke': smoke, 'tar': tar, 'cancer': cancer})
dataframe.sample(5)






In [None]:
# Pairwise plots of the smoke / tar / cancer columns.
sns.pairplot(dataframe)

In [None]:
# Empty directed graph that will hold the smoking causal diagram.
g = nx.DiGraph()

In [None]:
# Causal diagram: smoke -> tar -> cancer (no direct smoke -> cancer edge).
smoking_nodes = ['smoke', 'cancer', 'tar']
smoking_edges = [
    ('tar', 'cancer'),
    ('smoke', 'tar'),
]
g.add_nodes_from(smoking_nodes)
g.add_edges_from(smoking_edges)


In [None]:
# Draw the smoking causal diagram with node labels.
nx.draw_networkx(g)

In [None]:
from causality.estimation.adjustments import AdjustForDirectCauses
from causality.estimation.nonparametric import CausalEffect
# Ask AdjustForDirectCauses for an admissible adjustment set in graph g,
# with 'smoke' as the cause and 'cancer' as the effect, and display it.
adjustment = AdjustForDirectCauses()
admissable_set = adjustment.admissable_set(g,['smoke'], ['cancer'])
admissable_set

In [None]:
# Build the estimator of the effect of 'smoke' on 'cancer' from a 299-row
# subsample of the smoking data. random_state pins the subsample so that
# re-running the notebook reproduces the same estimate.
# The adjustment set is passed as an empty list, i.e. nothing is adjusted for.
effect = CausalEffect(dataframe.sample(299, random_state=42), ['smoke'], ['cancer'],
                      variable_types={'smoke':'u', 'cancer':'u', 'tar':'u'},
                      admissable_set=[])

In [None]:
# One-row query frames that differ only in the smoke value (1 vs 0), both
# with cancer fixed at 1.
# NOTE: `smoke` held a plain list earlier; from here on it is a DataFrame.
smoke, nsmoke = (
    pd.DataFrame({'smoke': [smoke_value], 'cancer': [1]})
    for smoke_value in (1, 0)
)

In [None]:
# Difference between the estimator's pdf at the smoke=1 and smoke=0 query
# points (cancer=1 in both).
effect.pdf(smoke) - effect.pdf(nsmoke)