In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [4]:
# Read the shopping-basket table from disk.
# NOTE(review): apriori presumably needs a one-hot (0/1 or boolean) item table;
# confirm the CSV is already in that form.
market_df = pd.read_csv(
    '../Evaluation of the frequent itemsets/shopping_basket.csv'
)

# Mine frequent itemsets with a minimum support of 0.005, labelling each
# itemset with the DataFrame's column names (use_colnames=True).
market_frequent_itemsets = apriori(
    market_df,
    use_colnames=True,
    min_support=0.005,
)

In [5]:
# Build association rules from the frequent itemsets, filtering on lift with a
# threshold of 0.
# NOTE(review): lift is non-negative, so this threshold presumably keeps every
# candidate rule; confirm that is intended.
interestingness_measurements = association_rules(
    market_frequent_itemsets,
    min_threshold=0,
    metric="lift",
)

# Preview the first rows of the rule table.
interestingness_measurements.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(burgers),(almonds),0.087188,0.020397,0.005199,0.059633,2.923577,0.003421,1.041724
1,(almonds),(burgers),0.020397,0.087188,0.005199,0.254902,2.923577,0.003421,1.225089
2,(chocolate),(almonds),0.163845,0.020397,0.005999,0.036615,1.795099,0.002657,1.016834
3,(almonds),(chocolate),0.020397,0.163845,0.005999,0.294118,1.795099,0.002657,1.184553
4,(eggs),(almonds),0.179709,0.020397,0.006532,0.03635,1.782108,0.002867,1.016555


We are going to implement an interestingness measure, the (full) mutual information, and add it to the data frame as a mutual information column. The measure is defined as

$$I(X;Y)=\sum_{x\in\mathcal{X}}\sum_{y\in\mathcal{Y}} P(X=x, Y=y)\log_2\frac{P(X=x,Y=y)}{P(X=x)P(Y=y)}.$$

Note that the logarithm requires the joint probability $P(X=x, Y=y) > 0$, which does not hold for some $(x, y)$. However, since a term with $P(X=x, Y=y) = 0$ does not contribute to the sum (by the convention $0\log 0 = 0$), you may assume $P(X=x, Y=y)\log_2\frac{P(X=x,Y=y)}{P(X=x)P(Y=y)} = 0$ in that case.

$x$, $y$ are possible values of $X$ and $Y$; in the case of appearance or absence of an item, 1 or 0. Therefore, we need to consider all possible combinations of $x$ and $y$, that is, $(X=1, Y=1)$, $(X=1, Y=0)$, $(X=0, Y=1)$, $(X=0, Y=0)$.

In [6]:
def mi(antecedent_support, consequent_support, support):
    """Mutual information, in bits, between the presence of X and the presence of Y.

    X and Y are treated as binary variables (present = 1, absent = 0). The
    2x2 joint distribution is reconstructed from the three supports and

        I(X;Y) = sum_{x,y} P(x,y) * log2( P(x,y) / (P(x) * P(y)) )

    is evaluated over its four cells.

    Parameters
    ----------
    antecedent_support : float
        P(X=1).
    consequent_support : float
        P(Y=1).
    support : float
        P(X=1, Y=1).

    Returns
    -------
    float
        The mutual information; 0.0 when no cell has positive probability.
    """
    px1 = antecedent_support
    py1 = consequent_support
    px0 = 1 - px1
    py0 = 1 - py1

    # Each entry is (joint probability, marginal of x, marginal of y) for one
    # cell, in the order (1,1), (1,0), (0,1), (0,0).
    cells = (
        (support, px1, py1),
        (px1 - support, px1, py0),
        (py1 - support, px0, py1),
        (1 - px1 - py1 + support, px0, py0),
    )

    mutual_information = 0.0
    for joint, p_x, p_y in cells:
        # A cell with zero probability contributes nothing (0 * log 0 := 0).
        # Using `> 0` rather than `!= 0` also skips cells that come out
        # slightly negative from rounding in the subtractions above, which
        # would otherwise make log2 return NaN.
        if joint > 0:
            mutual_information += joint * np.log2(joint / (p_x * p_y))

    return mutual_information

In [7]:
# Evaluate mi() row by row from each rule's three support columns.
mi_per_rule = interestingness_measurements.apply(
    lambda rule: mi(
        rule['antecedent support'],
        rule['consequent support'],
        rule['support'],
    ),
    axis=1,
)
interestingness_measurements['mi'] = mi_per_rule

# Show the five rules with the highest mutual information.
interestingness_measurements.sort_values(by='mi', ascending=False).head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,mi
676,(spaghetti),(ground_beef),0.17411,0.098254,0.039195,0.225115,2.291162,0.022088,1.163716,0.022631
677,(ground_beef),(spaghetti),0.098254,0.17411,0.039195,0.398915,2.291162,0.022088,1.373997,0.022631
655,(ground_beef),(herb_&_pepper),0.098254,0.04946,0.015998,0.162822,3.291994,0.011138,1.13541,0.014731
654,(herb_&_pepper),(ground_beef),0.04946,0.098254,0.015998,0.32345,3.291994,0.011138,1.33286,0.014731
1772,"(mineral_water, spaghetti)",(ground_beef),0.059725,0.098254,0.017064,0.285714,2.907928,0.011196,1.262445,0.013063


Jaccard similarity is a simple but powerful measurement of itemset similarity, defined as follows:
$$\text{Jaccard_similarity(A, B)} = \frac{|A\cap B|}{|A\cup B|}$$

In [8]:
def jaccard_similarity(set_a, set_b):
    """Jaccard similarity |A ∩ B| / |A ∪ B| of two sets.

    Parameters
    ----------
    set_a, set_b : set
        The two sets to compare.

    Returns
    -------
    float
        A value in [0, 1]. Two empty sets are treated as identical and give
        1.0, since the ratio would otherwise be 0/0.
    """
    union = set_a | set_b
    if not union:
        # Both sets are empty: avoid ZeroDivisionError.
        return 1.0
    return len(set_a & set_b) / len(union)

In [10]:

# Load the tweets: the file is read as tab-separated with no header row, and
# its single column is named 'text'.
tweets_df = pd.read_csv("../Evaluation of the frequent itemsets/food_drink_emoji_tweets.txt", sep="\t", header=None)
tweets_df.columns = ['text']

# The food/drink emojis of interest; a set gives O(1) membership tests.
emoji_list = "🍇🍈🍉🍊🍋🍌🍍🥭🍎🍏🍐🍑🍒🍓🥝🍅🥥🥑🍆🥔🥕🌽🌶🥒🥬🥦🍄🥜🌰🍞🥐🥖🥨🥯🥞🧀🍖🍗🥩🥓🍔🍟🍕🌭🥪🌮🌯🥙🥚🍳🥘🍲🥣🥗🍿🧂🥫🍱🍘🍙🍚🍛🍜🍝🍠🍢🍣🍤🍥🥮🍡🥟🥠🥡🦀🦞🦐🦑🍦🍧🍨🍩🍪🎂🍰🧁🥧🍫🍬🍭🍮🍯🍼🥛☕🍵🍶🍾🍷🍸🍹🍺🍻🥂🥃"
emoji_set = set(emoji_list)

def extract_uniq_emojis(text):
    """Return the distinct emojis from ``emoji_set`` that occur in ``text``.

    The text is scanned one character (code point) at a time, so only emojis
    that appear as a single character in ``emoji_set`` are matched.

    Parameters
    ----------
    text : str
        The tweet text to scan.

    Returns
    -------
    numpy.ndarray
        The matched characters with duplicates removed, in sorted order
        (as produced by ``np.unique``); empty if nothing matches.
    """
    return np.unique([char for char in text if char in emoji_set])

tweets_df['emojis'] = tweets_df.text.apply(extract_uniq_emojis)

# The first tweet (index label 0) is the reference every other tweet is
# compared with; build its emoji set once instead of once per row.
reference_emojis = set(tweets_df.loc[0, 'emojis'])
tweets_df['jaccard'] = tweets_df.emojis.apply(
    lambda emojis: jaccard_similarity(reference_emojis, set(emojis))
)

# Show the ten tweets most similar to the reference tweet.
tweets_df.sort_values('jaccard', ascending=False).head(n=10)

Unnamed: 0,text,emojis,jaccard
0,RT @CalorieFixess: 🍗🌯🍔🍒 400 Calories https://t...,"[🌯, 🍒, 🍔, 🍗]",1.0
6800,RT @levelscafeabuja: Chow! 🤩💦🍗🌯🍔 #LevelsCafeAb...,"[🌯, 🍔, 🍗]",0.75
9158,RT @AStateRedWolves: ✅ Countertops: Installed ...,"[🍔, 🍗]",0.5
777,@SunnyAnderson @rosannascotto I don’t think KF...,"[🍔, 🍗]",0.5
6226,RT @yooojax: 3. Free Food 🍗🍔 will ready by 2PM...,"[🍔, 🍗]",0.5
7428,Kicking off the weekend with a cheeky BBQ? Her...,"[🍔, 🍗]",0.5
7877,@tafarireid07 Did you say bbq? 🔥🍔🍗🚙,"[🍔, 🍗]",0.5
5328,RT @MAPSTTU: @EtaUpAlphas is starting the seme...,"[🍔, 🍗]",0.5
5334,RT @thatssochioma: You don’t want to miss this...,"[🍔, 🍗]",0.5
7788,I’m hungry for chicken 🍗 wings or burrito 🌯,"[🌯, 🍗]",0.5
