In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# function for calculating entropy in bits

def entropy(a):
    """Return the Shannon entropy, in bits, of the distribution described by `a`.

    Parameters
    ----------
    a : array-like of numbers
        Counts or probabilities, of any shape. The values are divided by their
        total before use, so they do not need to sum to 1.

    Returns
    -------
    float
        -sum(p * log2(p)) taken over every entry of the normalized input.
    """
    p = np.asarray(a, dtype=float)
    p = p / p.sum()
    # Convention 0*log2(0) = 0: an outcome with zero probability contributes
    # nothing. Evaluating the product directly would give 0 * -inf = NaN and
    # poison the whole sum.
    p = p[p != 0]
    return -np.sum(p * np.log2(p))

# 1a

For a standard six-sided die, there are 6 possible outcomes, each with probability 1/6:

$H(X) = -\sum_x P(x)log_2P(x) = 6\times (-\frac{1}{6}log_2\frac{1}{6})=2.58 (bits)$

# 1b
$H(X) = -\sum_x P(x)log_2P(x) = 2\times (-0.35\times log_2 0.35)+2\times (-0.15\times log_2 0.15) = 1.88 (bits)$

# 1c

Since the two dice are independent, there are 6*6=36 outcomes with equal probability 1/36

$H(X,Y) = -\sum_{x,y} P(x,y)\log_2 P(x,y) = 36\times (-\frac{1}{36}\log_2\frac{1}{36}) = \log_2 36 \approx 5.17 (bits) $

# 1d
By the following properties: 1. $\sum_{x}P(x) = \sum_{y}P(y) = 1$ for all random variables. 2. $P(x,y) = P(x)P(y)$ for independent variables X,Y, we can derive $H(X,Y) = H(X) + H(Y)$ as follows:

\begin{align}
H(X, Y) & = -\sum_{x, y} p(x, y)\log_2(p(x, y)) \\ 
& = -\sum_{x, y} p(x) p(y)\log_2(p(x)p(y)) \\ 
& = -\sum_{x, y} p(x) p(y)\left[\log_2(p(x)) + \log_2(p(y))\right] \\ 
& = -\sum_{x, y} p(x) p(y)\log_2(p(x)) -\sum_{x, y} p(x) p(y) \log_2(p(y)) \\ 
& = -\sum_{x} p(x) \log_2(p(x)) -\sum_{y}p(y) \log_2(p(y))\\ 
& = H(X) + H(Y)
\end{align}

As a result, $I(X;Y) = H(X)+H(Y)-H(X,Y) = 0$

# 1e
Similarly to 1d, because the rolls are independent we can separate $H(x_1,x_2,...,x_N)$ into $H(x_1) + H(x_2) + ... + H(x_N)$ as follows:

\begin{align}
H(x_1,x_2,...,x_N) &= -\sum_{x_1,x_2,...,x_N} P(x_1,x_2,...,x_N)log_2 P(x_1,x_2,...,x_N) \\
       &= -\sum_{x_1,x_2,...,x_N}P(x_1)P(x_2)...P(x_N)log_2 (P(x_1)P(x_2)...P(x_N)) \\
       & = -\sum_{x_1,x_2,...,x_N}P(x_1)P(x_2)...P(x_N)\left[\log_2(p(x_1)) + \log_2(p(x_2)) +...+ \log_2(p(x_N))\right] \\ 
       &= -\sum_{x_2}P(x_2)...\sum_{x_N}P(x_N)\sum_{x_1}P(x_1)log_2 P(x_1) -\sum_{x_1}P(x_1)...\sum_{x_N}P(x_N)\sum_{x_2}P(x_2)log_2 P(x_2) - ... -\sum_{x_1}P(x_1)\sum_{x_2}P(x_2)...\sum_{x_N}P(x_N)log_2 P(x_N) \\
       &= H(x_1) + H(x_2) +...+H(x_N)
\end{align}

As a result, the joint entropy for N independent rolls of a fair die is $N\times(-\log_2\frac{1}{6}) \approx 2.58N$ (bits)


In [3]:
# 1f

# The first file column (the letter) becomes the index; column 1 is the numeric
# column that gets normalized into a probability.
mono = pd.read_table('english_monograms.txt', sep=' ', header=None, index_col=0)
letter_counts = mono[1]
mono['Prob'] = letter_counts / sum(letter_counts)

# Per-letter contribution -P*log2(P); adding these up gives the entropy in bits.
mono['-PlogP'] = -(mono['Prob'] * np.log2(mono['Prob']))
mono_entropy = sum(mono['-PlogP'])
# The previously defined entropy() applied to the count column alone,
# entropy(mono[1]), gives the same value up to floating-point rounding.

entropy_note = "bits. This makes sense because the maximum entropy for a letter is -log2(1/26) = 4.7 bits when all the alphabet are with equal probability"
print("The entropy per letter is ", mono_entropy, entropy_note)

The entropy per letter is  4.184600828019136 bits. This makes sense because the maximum entropy for a letter is -log2(1/26) = 4.7 bits when all the alphabet are with equal probability


In [4]:
# 1g

# Under the independence assumption the joint entropy of two letters is simply
# twice the single-letter entropy; the message below states that result.
independence_note = (
    "Since the assumtion is that every letter is independent, H(L1,L2) = H(L1) + H(L2) = 4.184*2 = 8.368 (bits)."
    "Note that H(L) means entropy for a single letter"
)
print(independence_note)

Since the assumtion is that every letter is independent, H(L1,L2) = H(L1) + H(L2) = 4.184*2 = 8.368 (bits).Note that H(L) means entropy for a single letter


In [5]:
# 1h

print(mono)
print("Base on the table above, we can calculate the probability of getting randomly generated combination: ")

# Independent-letter model: a bigram's probability is the product of the
# probabilities of its two letters, so the order of the letters does not matter.
p_qu_independent = mono.loc['Q', 'Prob'] * mono.loc['U', 'Prob']
p_th_independent = mono.loc['T', 'Prob'] * mono.loc['H', 'Prob']
print("QU and UQ is the P(U)*P(Q) = P(Q)*P(U) = 0.001040*0.026816 = {0: .2g}".format(p_qu_independent))
print("HT and TH is the P(H)*P(T) = P(T)*P(H) = 0.089381*0.049557 = {0: .2g}".format(p_th_independent))

# Observed bigram probabilities: normalize column 1 of the bigram table.
bigram = pd.read_table('english_bigrams.txt', sep=' ', header=None, index_col=0)
bigram_counts = bigram[1]
bigram['Prob'] = bigram_counts / sum(bigram_counts)

observed = [bigram.loc[pair, 'Prob'] for pair in ('TH', 'QU', 'HT', 'UQ')]
print("The actual probabilties are P(TH) ={0: .3g}, P(QU) ={1: .3g}, which are enriched and P(HT) ={2: .3g}, P(UQ) ={3: .3g},these are depleted."
      .format(*observed))

           1      Prob    -PlogP
0                               
E  529117365  0.120965  0.368622
T  390965105  0.089381  0.311394
A  374061888  0.085517  0.303384
O  326627740  0.074673  0.279520
I  320410057  0.073251  0.276231
N  313720540  0.071722  0.272647
S  294300210  0.067282  0.261972
R  277000841  0.063327  0.252107
H  216768975  0.049557  0.214818
L  183996130  0.042065  0.192288
D  169330528  0.038712  0.181600
C  138416451  0.031644  0.157649
U  117295780  0.026816  0.139999
M  110504544  0.025263  0.134067
F   95422055  0.021815  0.120387
G   91258980  0.020863  0.116478
P   90376747  0.020662  0.115641
W   79843664  0.018254  0.105427
Y   75294515  0.017214  0.100877
B   70195826  0.016048  0.095669
V   46337161  0.010593  0.069500
K   35373464  0.008087  0.056206
J    9613410  0.002198  0.019406
X    8369915  0.001914  0.017278
Z    4975847  0.001138  0.011125
Q    4550166  0.001040  0.010308
Base on the table above, we can calculate the probability of getting randoml

In [6]:
# 1i

# Pass only the count column. `bigram` also carries the derived 'Prob' column,
# and entropy() normalizes over every value it receives, so handing it the
# whole frame would mix the probabilities in with the counts.
bi_entropy = entropy(bigram[1])
print("The actual entropy for bigram is{0: .3g} (bits) which is smaller than the independent assumtion calculated in 1g,{1: .2g} (bits)."
      "It's because that in real world, the letters in the bigram are not independent.".format(bi_entropy, 2*mono_entropy))

The actual entropy for bigram is 7.84 (bits) which is smaller than the independent assumtion calculated in 1g, 8.4 (bits).It's because that in real world, the letters in the bigram are not independent.


In [7]:
# 1j

# I(L1;L2) = H(L1) + H(L2) - H(L1,L2); both single-letter entropies equal mono_entropy.
letter_mutual_info = 2*mono_entropy-bi_entropy
mutual_info_message = (
    "Mutual information I(L1;L2), amount of information you can get for L2(L1) by knowing L1(L2),"
    " is defined as H(L1) + H(L2) - H(L1,L2) ={0: .2g} (bits)"
)
print(mutual_info_message.format(letter_mutual_info))

Mutual information I(L1;L2), amount of information you can get for L2(L1) by knowing L1(L2), is defined as H(L1) + H(L2) - H(L1,L2) = 0.53 (bits)


In [8]:
# 1k
# Reuse the monogram table: under the independent-letter model the predicted
# probability of a word is the product of the probabilities of its letters.

def _letters_product(text):
    """Multiply mono['Prob'] over the letters of `text`, left to right."""
    product = 1.0
    for letter in text:
        product = product * mono.loc[letter, 'Prob']
    return product

# Observed word probabilities: normalize column 1 of the word table.
word = pd.read_table('english_words.txt', sep=' ', header=None, index_col=0)
word_counts = word[1]
word['Prob'] = word_counts / sum(word_counts)

neu_pre = _letters_product('NEURON')
neu_act = word.loc['NEURON', 'Prob']
ucsd_pre = _letters_product('UCSD')
ucsd_act = word.loc['UCSD', 'Prob']

# The third line reports log2(actual / predicted) for each word.
neu_log_ratio = np.log2(neu_act/neu_pre)
ucsd_log_ratio = np.log2(ucsd_act/ucsd_pre)

print("1.The prediction value for probability of word NEURON is{0: .2g} and the actual value is{1: .2g}".format(neu_pre, neu_act))
print("2.The prediction value for probability of word UCSD is{0: .2g} and the actual value is{1: .2g}".format(ucsd_pre, ucsd_act))
print("3.The log likelihood of word NEURON is{0: .2g} and UCSD is {1: .2g}".format(neu_log_ratio, ucsd_log_ratio))

1.The prediction value for probability of word NEURON is 7.9e-08 and the actual value is 9.4e-07
2.The prediction value for probability of word UCSD is 2.2e-06 and the actual value is 4.6e-07
3.The log likelihood of word NEURON is 3.6 and UCSD is -2.3


# 2a
$p(n=1) = \frac{1}{2}p(n=1|x=1) + \frac{1}{2}p(n=1|x=2) = \frac{1}{2}\times0.04 + \frac{1}{2} \times 0.01 = 0.025$

In [9]:
# 2b
dt = 0.01        # time-bin width
r1, r2 = 4, 1    # rates for x=1 and x=2; rate*dt is used as P(n=1|x)
P_x1 = P_x2 = 0.5                        # P(x=1), P(x=2)
P_n1x1, P_n1x2 = r1*dt, r2*dt            # P(n=1|x=1), P(n=1|x=2)
P_n0x1, P_n0x2 = 1-P_n1x1, 1-P_n1x2      # P(n=0|x) = 1 - P(n=1|x)

# Symbolic layout of the joint table, with the marginals on its border.
layout_labels = [
    ["P(x1,n0)", "P(x2,n0)", "P(n=0)"],
    ["P(x1,n1)", "P(x2,n1)", "P(n=1)"],
    ["P(x=1)", "P(x=2)", ""],
]
print(pd.DataFrame(layout_labels, columns=['x=1', 'x=2', ""], index=['n=0', 'n=1', ""]))

# Joint probabilities P(x, n) = P(x) * P(n|x); rows are n, columns are x.
stimulus_priors = (P_x1, P_x2)
conditional_rows = [(P_n0x1, P_n0x2), (P_n1x1, P_n1x2)]
df = pd.DataFrame(
    [[prior * conditional for prior, conditional in zip(stimulus_priors, row)]
     for row in conditional_rows],
    columns=['x=1', 'x=2'],
    index=['n=0', 'n=1'],
)
df

          x=1       x=2        
n=0  P(x1,n0)  P(x2,n0)  P(n=0)
n=1  P(x1,n1)  P(x2,n1)  P(n=1)
       P(x=1)    P(x=2)        


Unnamed: 0,x=1,x=2
n=0,0.48,0.495
n=1,0.02,0.005


In [10]:
# 2c
# The joint table `df` has n on the rows and x on the columns, so summing down
# the rows (axis=0) leaves P(x) and summing across the columns (axis=1) leaves P(n).

stimulus_marginal = df.sum(axis=0)   # P(x1), P(x2)
spike_marginal = df.sum(axis=1)      # P(n0), P(n1)

H_x = entropy(stimulus_marginal)
H_n = entropy(spike_marginal)
H_nx = entropy(df)

entropy_report = [
    ("H(x) = -P(x1)log2(P(x1))-P(x2)log2(P(x2)) ={0: .3g} (bits)", H_x),
    ("H(n) = -P(n0)log2(P(n0))-P(n1)log2(P(n1)) ={0: .3g} (bits)", H_n),
    ("H(x,n) = -P(x1,n0)log2(P(x1,n0))-P(x2,n0)log2(P(x2,n0))-P(x1,n1)log2(P(x1,n1))-P(x2,n1)log2(P(x2,n1)) ={0: .3g} (bits)", H_nx),
]
for template, value in entropy_report:
    print(template.format(value))

H(x) = -P(x1)log2(P(x1))-P(x2)log2(P(x2)) = 1 (bits)
H(n) = -P(n0)log2(P(n0))-P(n1)log2(P(n1)) = 0.169 (bits)
H(x,n) = -P(x1,n0)log2(P(x1,n0))-P(x2,n0)log2(P(x2,n0))-P(x1,n1)log2(P(x1,n1))-P(x2,n1)log2(P(x2,n1)) = 1.16 (bits)


In [11]:
# Sanity check: the column sums of the joint table are the stimulus marginals P(x).
df.sum(axis="index")

x=1    0.5
x=2    0.5
dtype: float64

In [12]:
# 2d

# I(x;n) = H(x) + H(n) - H(x,n), using the entropies computed in 2c.
I_xn = H_x+H_n-H_nx
print("Mutual information I(x;n) = H(x) + H(n) - H(x,n) ={0: .3g} (bits)".format(I_xn))

Mutual information I(x;n) = H(x) + H(n) - H(x,n) = 0.00712 (bits)


In [13]:
# 2e
# dt is now 5 ms (it was 10 ms in the earlier parts); every quantity derived
# from it is recomputed here.
dt = 0.005
r1, r2 = 4, 1
P_x1 = P_x2 = 0.5                        # P(x=1), P(x=2)
P_n1x1, P_n1x2 = r1*dt, r2*dt            # P(n=1|x=1), P(n=1|x=2) for one bin
P_n0x1, P_n0x2 = 1-P_n1x1, 1-P_n1x2      # P(n=0|x) = 1 - P(n=1|x)

response_labels = ['n=(0,0)', 'n=(1,0)', 'n=(0,1)', 'n=(1,1)']

# Symbolic layout of the two-bin joint table, with the marginals on its border.
layout_labels = [
    ["P(x1,n00)", "P(x2,n00)", "P(n=0,0)"],
    ["P(x1,n10)", "P(x2,n10)", "P(n=1,0)"],
    ["P(x1,n01)", "P(x2,n01)", "P(n=0,1)"],
    ["P(x1,n11)", "P(x2,n11)", "P(n=1,1)"],
    ["P(x=1)", "P(x=2)", ""],
]
print(pd.DataFrame(layout_labels, columns=['x=1', 'x=2', ""], index=response_labels + [""]))

# Each cell is P(x) times one single-bin factor per bin; the factor pairs are
# listed in the same order as the row labels and multiplied left to right.
stimulus_priors = (P_x1, P_x2)
p_silent = (P_n0x1, P_n0x2)
p_spike = (P_n1x1, P_n1x2)
factor_pairs = [(p_silent, p_silent), (p_silent, p_spike), (p_spike, p_silent), (p_spike, p_spike)]
df2 = pd.DataFrame(
    [[prior * first[k] * second[k] for k, prior in enumerate(stimulus_priors)]
     for first, second in factor_pairs],
    columns=['x=1', 'x=2'],
    index=response_labels,
)
df2

               x=1        x=2          
n=(0,0)  P(x1,n00)  P(x2,n00)  P(n=0,0)
n=(1,0)  P(x1,n10)  P(x2,n10)  P(n=1,0)
n=(0,1)  P(x1,n01)  P(x2,n01)  P(n=0,1)
n=(1,1)  P(x1,n11)  P(x2,n11)  P(n=1,1)
            P(x=1)     P(x=2)          


Unnamed: 0,x=1,x=2
"n=(0,0)",0.4802,0.495013
"n=(1,0)",0.0098,0.002488
"n=(0,1)",0.0098,0.002488
"n=(1,1)",0.0002,1.3e-05


In [14]:
# continue 2e
# Joint entropy of the stimulus and the two-bin response, taken over every cell of df2.
# NOTE(review): the formula text in the message names only the four single-bin
# terms, while the value is computed from all eight cells of df2.
H_xn12 = entropy(df2)
joint_entropy_template = "H(x,n12) = -P(x1,n0)log2(P(x1,n0))-P(x2,n0)log2(P(x2,n0))-P(x1,n1)log2(P(x1,n1))-P(x2,n1)log2(P(x2,n1)) ={0: .5g} (bits)"
print(joint_entropy_template.format(H_xn12))

H(x,n12) = -P(x1,n0)log2(P(x1,n0))-P(x2,n0)log2(P(x2,n0))-P(x1,n1)log2(P(x1,n1))-P(x2,n1)log2(P(x2,n1)) = 1.1869 (bits)


In [15]:
# 2f

# Summing df2 across its columns (axis=1) removes x and leaves the
# distribution over the four two-bin responses.
response_marginal = df2.sum(axis=1)
H_n12 = entropy(response_marginal)
print("H(n12) ={0: .5} (bits)".format(H_n12))

H(n12) = 0.19388 (bits)


In [16]:
# 2g

# I(x;n12) = H(x) + H(n12) - H(x,n12), reusing H(x) from 2c.
I_xn12 = H_x+H_n12-H_xn12
print("Mutual information I(x;n12) = H(x) + H(n12) - H(x,n12) ={0: .3g} (bits)".format(I_xn12))

Mutual information I(x;n12) = H(x) + H(n12) - H(x,n12) = 0.00702 (bits)


# 2h
Mutual information $I(x;n)$ measures how much knowing one variable (x or n) tells you about the other (n or x). According to the results, $I(x;n) = 0.00712$ bits is close to $I(x;n_{12}) = 0.00702$ bits. This means that $n$ and $n_{12}$ give nearly the same amount of information about the stimulus x: counting spikes in a single time bin (10 ms) gives about the same information as counting spikes in two time bins (5 ms each). This implies that the neurons encode the stimulus by rate coding.