In [2]:
import networkx as nx
import numpy as np
import pandas as pd
import datacleaning
import itertools
from nltk.probability import FreqDist
from ast import literal_eval
import seaborn as sns
import matplotlib.pyplot as plt
import random

In [3]:
def do_centrality(G: nx.Graph):
    """Return ``Id``/``title`` from the module-level ``data`` joined with degree centrality.

    The join is ``pd.merge(..., on="Id")``, so only ids present on both sides are kept.
    """
    # eig = pd.DataFrame(nx.eigenvector_centrality_numpy(G, weight="Weight").items(), columns=['Id', 'eigenvector_centrality'])
    centrality_by_node = nx.degree_centrality(G)
    degree_table = pd.DataFrame(centrality_by_node.items(), columns=['Id', 'degree_centrality'])
    titles = data[['Id','title']]
    return pd.merge(titles, degree_table, on="Id")

# Load the recipe table; both word-list columns are parsed with literal_eval,
# and the unnamed first CSV column becomes the recipe "Id".
data = (
    pd.read_csv(
        "data_small.csv",
        converters={"ingredient_words": literal_eval, "instruction_words": literal_eval},
    )
    .rename(columns={"Unnamed: 0": "Id"})
)

In [8]:
# Show the raw instruction text of one recipe, looked up by its Id.
data.set_index("Id").loc["WPIBK.zCMtomTe4JVnBcdFDOCfryhly", 'instructions']

'Preheat oven to 350 degrees F (175 degrees C). Grease and flour a 9x13 inch pan. Sift together the flour, cocoa, baking soda and salt. Set aside.\nIn a large bowl, cream together the margarine and sugar until light and fluffy. Beat in the eggs one at a time, then stir in the vanilla. Beat in the flour mixture alternately with the milk, mixing just until incorporated. Pour batter into prepared pan.\nBake in the preheated oven for 40 to 45 minutes, or until a toothpick inserted into the center of the cake comes out clean. Allow to cool.\n'

In [None]:
# Each construction is stored as an instruction graph and an ingredient graph;
# the files are read in the same order as before (instruction first, then ingredient).
graph_instruction, graph_ingredient = (
    nx.read_gexf(path)
    for path in ("recipe_instruction_small.gexf", "recipe_ingredient_small.gexf")
)

graph_recipe_instruction_allwords, graph_recipe_ingredient_allwords = (
    nx.read_gexf(path)
    for path in ("recipe_instruction_nofilter.gexf", "recipe_ingredient_nofilter.gexf")
)

graph_recipe_instruction_tfidf, graph_recipe_ingredient_tfidf = (
    nx.read_gexf(path)
    for path in ("recipe_instruction_tfidf.gexf", "recipe_ingredient_tfidf.gexf")
)

graph_recipe_instruction_ranked_tfidf, graph_recipe_ingredient_ranked_tfidf = (
    nx.read_gexf(path)
    for path in ("recipe_instruction_ranked_tfidf.gexf", "recipe_ingredient_ranked_tfidf.gexf")
)

In [None]:
def count_components(G: nx.Graph):
    """Tabulate how many connected components of ``G`` have each size.

    Returns
    -------
    numpy.ndarray
        Two rows: row 0 holds the distinct component sizes in ascending order,
        row 1 holds the number of components with that size.
    """
    # Only the sizes are needed, so measure the node sets directly instead of
    # materialising (and copying) a subgraph per component.
    sizes = [len(component) for component in nx.connected_components(G)]
    # np.unique returns the distinct values already sorted.
    distinct_sizes, counts = np.unique(sizes, return_counts=True)
    return np.asarray((distinct_sizes, counts))

def plot_components(data, axes):
    """Scatter ``data[1]`` against ``data[0]`` on ``axes`` with both axes on a log scale."""
    x_values, y_values = data[0], data[1]
    scatter_ax = sns.scatterplot(x=x_values, y=y_values, ax=axes, s=10)
    for set_scale in (scatter_ax.set_xscale, scatter_ax.set_yscale):
        set_scale('log')

# Side-by-side component-size distributions for the original and TFIDF ingredient graphs.
fig, axes = plt.subplots(1,2, figsize= (6,3), sharey=True)
panels = (
    (graph_ingredient, "Original Graph"),
    (graph_recipe_ingredient_tfidf, "TFIDF Graph"),
)
for panel_ax, (panel_graph, _) in zip(axes, panels):
    plot_components(count_components(panel_graph), axes=panel_ax)
for panel_ax, (_, panel_title) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Size")
# The y axis is shared, so only the left panel is labelled.
axes[0].set_ylabel("Count")
fig.suptitle("Size of Connected Components using Original and TFIDF Constructions")

fig.tight_layout()
plt.savefig('figs/original_tfidf_components.pdf')

In [None]:
# (do_centrality(graph_ingredient)).sort_values(by = 'degree_centrality', ascending = False)

In [None]:
# (do_centrality(graph_instruction)).sort_values(by = 'degree_centrality', ascending = False)

In [None]:
# (do_centrality(graph_recipe_ingredient_tfidf)).sort_values(by = 'degree_centrality', ascending = False)


In [None]:
# All graph constructions, ingredient/instruction interleaved per construction.
graphs = [
    graph_ingredient,
    graph_instruction,
    graph_recipe_ingredient_allwords,
    graph_recipe_instruction_allwords,
    graph_recipe_ingredient_tfidf,
    graph_recipe_instruction_tfidf,
    graph_recipe_ingredient_ranked_tfidf,
    graph_recipe_instruction_ranked_tfidf,
]

In [None]:
# Lazy iterator over every unordered pair of graphs (consumed at most once).
graph_pairs = itertools.combinations(graphs, 2)

In [None]:
# Import the submodule explicitly: older SciPy releases do not expose
# scipy.sparse.linalg after a bare `import scipy`.
import scipy.sparse.linalg

names = ["Original Ingredient", "Original Instruction", "All Ingredient Words", "All Instruction Words", "TFIDF Ingredient", "TFIDF Instruction", "Ranked TFIDF Ingredient", "Ranked TFIDF Instruction"]

# Square table of pairwise distances; only the strict lower triangle (i > j) is
# filled, the rest stays 0. The index keeps the name 'i' it had before.
graph_distance = pd.DataFrame(
    np.zeros(shape=(len(graphs), len(graphs))),
    index=pd.Index(names, name='i'),
    columns=names,
)

# One shared node ordering for every matrix. Without it each adjacency matrix
# follows its own graph's node order and the element-wise difference would
# compare unrelated rows/columns.
all_nodes = list(dict.fromkeys(node for g in graphs for node in g.nodes))


def _unit_adjacency(G):
    """Adjacency matrix of ``G`` over ``all_nodes``, scaled to unit norm (scipy's default matrix norm)."""
    missing = [node for node in all_nodes if node not in G]
    if missing:
        # Nodes absent from this graph become isolated nodes (all-zero rows);
        # work on a copy so the loaded graph is not modified.
        G = G.copy()
        G.add_nodes_from(missing)
    M = nx.adjacency_matrix(G, nodelist=all_nodes)
    return M / scipy.sparse.linalg.norm(M)


# Normalise each graph once instead of once per pair.
unit_matrices = [_unit_adjacency(g) for g in graphs]

for i in range(len(graphs)):
    for j in range(i):
        graph_distance.iloc[i, j] = scipy.sparse.linalg.norm(unit_matrices[i] - unit_matrices[j])

#     for j in range(len(graphs)):
#         if i<j:
#             graph_distance[i,j] = 1
            # graph_distance[i,j] = 

In [None]:
# Heatmap of the pairwise distances; cells equal to 0 (unfilled entries) are masked out.
fig, ax = plt.subplots(1,1, figsize= (6,6), sharey=True)
unfilled_cells = graph_distance == 0
a = sns.heatmap(
    graph_distance,
    mask=unfilled_cells,
    annot=True,
    fmt='.3g',
    cmap="Blues",
    cbar=False,
    ax=ax,
)
a.set_ylabel("")
# fig.suptitle("Distance Between Alternate Constructions")
fig.tight_layout()
plt.savefig('figs/all_graphdistance.pdf')



In [None]:
def do_centralities(G: nx.Graph):
    """Join the module-level ``data`` table with degree, PageRank and betweenness centrality of ``G``.

    PageRank and betweenness are computed with ``weight="Weight"``; each table is
    merged onto the result by ``Id`` in turn.
    """
    centrality_tables = (
        pd.DataFrame(nx.degree_centrality(G).items(), columns=['Id', 'degree_centrality']),
        pd.DataFrame(nx.pagerank(G, weight="Weight").items(), columns=['Id', 'pagerank_centrality']),
        pd.DataFrame(nx.betweenness_centrality(G, weight="Weight").items(), columns=['Id', 'betweenness_centrality']),
    )
    merged = data
    for table in centrality_tables:
        merged = pd.merge(merged, table, on="Id")
    return merged

In [None]:
# Display whatever is currently bound to `data`.
data

In [None]:
# instruction_ranked_tfidf_cent = do_centralities(graph_recipe_instruction_ranked_tfidf)

In [None]:
# Word-weight tables for instruction and ingredient words, without the unnamed CSV index column.
instructions_idf = pd.read_csv("instructions_idf.csv").drop(columns='Unnamed: 0')
ingredients_idf = pd.read_csv("ingredients_idf.csv").drop(columns='Unnamed: 0')

def calculate_idf_sum(instructions, idfs):
    """Sum the weights of the whitespace-separated words in ``instructions``.

    Parameters
    ----------
    instructions : str
        Text to score. A non-string value (for example a missing value) scores 0.
    idfs : pandas.DataFrame
        Table with a ``word`` column and a ``weight`` column (the column name
        used when this notebook merges ``instructions_idf`` with word counts).

    Returns
    -------
    float
        Total weight of all words found in ``idfs``; repeated words count each
        time and unknown words contribute 0.
    """
    if not isinstance(instructions, str):
        return 0.0
    # A plain dict gives constant-time lookups and a scalar per word; the
    # previous row lookup returned a whole Series and shadowed the builtin `sum`.
    weight_by_word = dict(zip(idfs['word'], idfs['weight']))
    return float(sum(weight_by_word.get(word, 0.0) for word in instructions.split()))

# Example call; the result is not displayed because this is not the cell's last expression.
calculate_idf_sum("preheat oven degrees", instructions_idf)

def all_idf_sum(dat, idfs):
    """Add a ``sum_idf`` column to ``dat`` (modifying it in place) and return ``dat``.

    Each row's ``clean_instructions_masked`` text is scored with ``calculate_idf_sum``.
    """
    def _row_idf_sum(row):
        return calculate_idf_sum(row['clean_instructions_masked'], idfs=idfs)

    dat['sum_idf'] = dat.apply(_row_idf_sum, axis=1)
    return dat

In [None]:
# instructions_idf = pd.concat([ingredients_idf,instructions_idf])

# instruction_ranked_tfidf_cent['sum_idf'] = instruction_ranked_tfidf_cent['clean_instructions_masked']



In [None]:
# fig, axes = plt.subplots(1,1, figsize= (6,6))
# sns.scatterplot(x=instruction_ranked_tfidf_cent['betweenness_centrality'], y=instruction_ranked_tfidf_cent['sum_idf'], size = .1, alpha = .1)
# fig.tight_layout()
# instruction_ranked_tfidf_cent.to_csv('instruction_ranked_tfidf_cent.csv')
# instruction_ranked_tfidf_cent = all_idf_sum(instruction_ranked_tfidf_cent, instructions_idf)

In [None]:
# Reload the previously saved centrality table instead of recomputing it;
# the word-list columns are parsed back with literal_eval.
instruction_ranked_tfidf_cent = pd.read_csv(
    'instruction_ranked_tfidf_cent.csv',
    converters={
        "ingredient_words": literal_eval,
        "instruction_words": literal_eval,
    },
)

In [None]:

# Ten rows with the highest betweenness centrality.
instruction_ranked_tfidf_cent.sort_values(by='betweenness_centrality', ascending=False).head(10)

In [None]:
import statistics
betweenness = instruction_ranked_tfidf_cent['betweenness_centrality']
stddv = statistics.stdev(betweenness)
mean = statistics.mean(betweenness)
# sns.histplot(data=instruction_ranked_tfidf_cent, x='betweenness_centrality')

# A: instruction words of recipes below the mean betweenness.
# B: instruction words of recipes more than 4 standard deviations above it.
A = FreqDist(list(itertools.chain.from_iterable(
    instruction_ranked_tfidf_cent.loc[betweenness < mean, "instruction_words"]
)))
B = FreqDist(list(itertools.chain.from_iterable(
    instruction_ranked_tfidf_cent.loc[betweenness > mean + 4*stddv, "instruction_words"]
)))


def _top_words(freq):
    """Frequency table (columns 'index' = word, 0 = count) of the 20 most common words."""
    table = pd.DataFrame({word: [count] for word, count in freq.items()}).T.reset_index()
    return table.sort_values(by=0, ascending=False).head(20)


inst1 = _top_words(A)
inst2 = _top_words(B)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))

for panel_ax, table in zip(axes, (inst1, inst2)):
    bars = sns.barplot(y=table[0], x=table['index'], ax=panel_ax)
    bars.tick_params(axis='x', rotation=90)
    bars.set_xlabel("")
    bars.set_ylabel("")
    bars.set_title("Instructions")

fig.suptitle("Word Frequencies")
fig.tight_layout()

In [None]:
import statistics
betweenness = instruction_ranked_tfidf_cent['betweenness_centrality']
stddv = statistics.stdev(betweenness)
mean = statistics.mean(betweenness)
# sns.histplot(data=instruction_ranked_tfidf_cent, x='betweenness_centrality')

# A: instruction words of recipes below the mean betweenness.
# B: instruction words of recipes more than 5 standard deviations above it.
A = FreqDist(list(itertools.chain.from_iterable(
    instruction_ranked_tfidf_cent.loc[betweenness < mean, "instruction_words"]
)))
B = FreqDist(list(itertools.chain.from_iterable(
    instruction_ranked_tfidf_cent.loc[betweenness > mean + 5*stddv, "instruction_words"]
)))


def _weighted_top_words(freq):
    """Top 20 words ranked by count * weight**2, using the weights in ``instructions_idf``."""
    table = (
        pd.DataFrame({word: [count] for word, count in freq.items()})
        .T.reset_index()
        .rename(columns={'index':"word",0:'count'})
        .merge(instructions_idf, on='word')
    )
    table['weighted_frequency'] = table['count'] * (table['weight']**2)
    return table.sort_values(by='weighted_frequency', ascending=False).head(20)


inst1 = _weighted_top_words(A)
inst2 = _weighted_top_words(B)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(6, 3))

for panel_ax, table in zip(axes, (inst1, inst2)):
    bars = sns.barplot(y=table['weighted_frequency'], x=table['word'], ax=panel_ax)
    bars.tick_params(axis='x', rotation=90)
    bars.set_xlabel("")
    bars.set_ylabel("")
    bars.set_title("Instructions")

fig.suptitle("Word Frequencies")
fig.tight_layout()

In [None]:
# Louvain communities of both ranked-TFIDF graphs, with identical settings and a fixed seed.
inst_communities, ingr_communities = (
    nx.community.louvain_communities(graph, resolution=2, seed=2024)
    for graph in (graph_recipe_instruction_ranked_tfidf, graph_recipe_ingredient_ranked_tfidf)
)
# for level in inst_communities:
#     print(sorted([len(com) for com in level])[::-1])

In [None]:
# Largest community first. The ascending sort is reversed (rather than sorting
# with reverse=True) so that equally sized communities keep the same relative order as before.
inst_communities = list(reversed(sorted(inst_communities, key=len)))
ingr_communities = list(reversed(sorted(ingr_communities, key=len)))

In [None]:
# Sizes of the instruction communities, in their current order.
list(map(len, inst_communities))

In [None]:
# datacleaning.data_for_nodes(inst_communities[0])
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
n=10
k=5
def title_dist_for_nodes(nodes, axes):
    """Bar-plot the ``k`` most common title words of the recipes in ``nodes`` on ``axes``.

    Titles are stripped of non-letters, lower-cased and split; words of three
    characters or fewer and English stop words are dropped. Bar heights are the
    counts divided by their Euclidean norm. Returns the flat list of kept words.
    """
    dats = datacleaning.data_for_nodes(nodes)
    cleaned_titles = (
        dats['title']
        .replace(r'[^a-zA-Z\s]', '', regex=True)
        .str.lower()
        .str.split()
    )
    dats['title_words'] = cleaned_titles.apply(
        lambda words: [word for word in words if len(word) > 3 and word not in stop_words]
    )
    title_words = list(itertools.chain.from_iterable(dats["title_words"]))
    most_common = FreqDist(title_words).most_common(k)
    most_common = sorted(most_common, key=lambda pair: pair[1], reverse=True)
    # Column 0 holds the word, column 1 its count.
    title_freqs = pd.DataFrame(most_common)
    bar_heights = title_freqs[1]/np.linalg.norm(title_freqs[1])
    sns.barplot(x=title_freqs[0], y=bar_heights, ax= axes)
    return title_words


# One panel per community with at least 100 members, up to n panels.
# `i` is the community's 1-based position in inst_communities.
fig, ax = plt.subplots(1,n, figsize= (12,3), sharey=True)
plt_i = 0
for i, community in enumerate(inst_communities, start=1):
    if len(community) < 100:
        continue
    if plt_i >= n:
        break

    panel = ax[plt_i]
    title_dist_for_nodes(community, panel)
    panel.tick_params('x', labelrotation=90)
    panel.set_xlabel("Community " + str(i) + "\n n = " + str(len(community)))
    panel.set_ylabel("")

    plt_i += 1

fig.align_xlabels()
fig.tight_layout()



In [None]:
n=10
k=5
def title_dist_for_nodes(nodes, axes, top_k=None):
    """Bar-plot the most frequent title words of the recipes in ``nodes``.

    Parameters
    ----------
    nodes : node ids handed to ``datacleaning.data_for_nodes``.
    axes : matplotlib Axes to draw on.
    top_k : int, optional
        Number of words to plot. Defaults to the module-level ``k`` (previously
        a hard-coded 5 that ignored ``k``).

    Returns
    -------
    list of str
        Every retained title word (not just the plotted ones).
    """
    if top_k is None:
        top_k = k
    dats = datacleaning.data_for_nodes(nodes)
    dats['title_words'] = dats['title'].replace(r'[^a-zA-Z\s]', '', regex=True)
    dats['title_words'] = dats['title_words'].str.lower()
    dats['title_words'] = dats['title_words'].str.split()
    # Keep words longer than three characters that are not English stop words.
    dats['title_words'] = dats['title_words'].apply(
        lambda x: [word for word in x if len(word) > 3 and word not in stop_words]
    )

    title_words = list(itertools.chain.from_iterable(dats["title_words"]))
    title_freqs = pd.DataFrame(list(FreqDist(title_words).items()), columns = ["word","frequency"])

    # No weighting in this variant: the plotted value is the raw count.
    title_freqs['weighted_frequency'] = title_freqs['frequency']
    title_freqs = title_freqs.sort_values(by = 'weighted_frequency', ascending= False).head(top_k)

    if title_freqs.empty:
        # Nothing to draw (and the norm below would be 0).
        return title_words

    # Heights are scaled by the Euclidean norm of the plotted counts.
    bar_heights = title_freqs['weighted_frequency']/np.linalg.norm(title_freqs['weighted_frequency'])
    sns.barplot(x=title_freqs['word'], y=bar_heights, ax= axes, errorbar=('ci', 0))
    return title_words

def plot_communities(communities, max_panels=None):
    """Draw title-word bar charts for the first communities on a two-row grid.

    Parameters
    ----------
    communities : iterable of node collections, in display order.
    max_panels : int, optional
        Maximum number of communities to draw. Defaults to the module-level ``n``.

    The grid has two rows and ``ceil(max_panels / 2)`` columns and is filled row
    by row. Panels without a community are hidden.
    """
    if max_panels is None:
        max_panels = n
    # Ceiling division, so an odd panel count gets a spare cell instead of
    # overwriting an earlier panel.
    ncols = max(1, -(-max_panels // 2))
    # squeeze=False keeps `ax` two-dimensional even when there is one column.
    fig, ax = plt.subplots(2, ncols, figsize= (7,6), sharey=True, squeeze=False)
    panels = ax.ravel()  # row-major: top row first, then bottom row

    used = 0
    for i, (community, panel) in enumerate(zip(communities, panels[:max_panels]), start=1):
        title_dist_for_nodes(community, panel)
        panel.tick_params('x', labelrotation=90)
        panel.set_title("Community " + str(i) + "\n n = " + str(len(community)))
        panel.set_ylabel("")
        panel.set_xlabel("")
        used = i

    for panel in panels[used:]:
        panel.set_visible(False)

    fig.text(-.02, 0.5, 'Frequency', va='center', rotation='vertical', size='large')
    # fig.suptitle("Title Word Frequency by Instruction Community")
    fig.align_xlabels()
    fig.tight_layout()


# Title-word panels for the instruction-graph communities.
plot_communities(inst_communities)

# plt.savefig writes the current pyplot figure, i.e. the one plot_communities just created.
plt.savefig('figs/tfidf_instcommunity_titles.pdf')

In [None]:
# Sizes of the instruction communities, in their current order.
list(map(len, inst_communities))

In [None]:
n=10
k=5
def title_dist_for_nodes(nodes, axes, top_k=None):
    """Bar-plot the most frequent title words of the recipes in ``nodes``.

    Parameters
    ----------
    nodes : node ids handed to ``datacleaning.data_for_nodes``.
    axes : matplotlib Axes to draw on.
    top_k : int, optional
        Number of words to plot. Defaults to the module-level ``k`` (previously
        a hard-coded 5 that ignored ``k``).

    Returns
    -------
    list of str
        Every retained title word (not just the plotted ones).
    """
    if top_k is None:
        top_k = k
    dats = datacleaning.data_for_nodes(nodes)
    dats['title_words'] = dats['title'].replace(r'[^a-zA-Z\s]', '', regex=True)
    dats['title_words'] = dats['title_words'].str.lower()
    dats['title_words'] = dats['title_words'].str.split()
    # Keep words longer than three characters that are not English stop words.
    dats['title_words'] = dats['title_words'].apply(
        lambda x: [word for word in x if len(word) > 3 and word not in stop_words]
    )

    title_words = list(itertools.chain.from_iterable(dats["title_words"]))
    title_freqs = pd.DataFrame(list(FreqDist(title_words).items()), columns = ["word","frequency"])

    # No weighting in this variant: the plotted value is the raw count.
    title_freqs['weighted_frequency'] = title_freqs['frequency']
    title_freqs = title_freqs.sort_values(by = 'weighted_frequency', ascending= False).head(top_k)

    if title_freqs.empty:
        # Nothing to draw (and the norm below would be 0).
        return title_words

    # Heights are scaled by the Euclidean norm of the plotted counts.
    bar_heights = title_freqs['weighted_frequency']/np.linalg.norm(title_freqs['weighted_frequency'])
    sns.barplot(x=title_freqs['word'], y=bar_heights, ax= axes, errorbar=('ci', 0))
    return title_words

def plot_communities(communities, max_panels=None):
    """Draw title-word bar charts for the first communities on a two-row grid.

    Parameters
    ----------
    communities : iterable of node collections, in display order.
    max_panels : int, optional
        Maximum number of communities to draw. Defaults to the module-level ``n``.

    The grid has two rows and ``ceil(max_panels / 2)`` columns and is filled row
    by row. Panels without a community are hidden.
    """
    if max_panels is None:
        max_panels = n
    # Ceiling division, so an odd panel count gets a spare cell instead of
    # overwriting an earlier panel.
    ncols = max(1, -(-max_panels // 2))
    # squeeze=False keeps `ax` two-dimensional even when there is one column.
    fig, ax = plt.subplots(2, ncols, figsize= (7,6), sharey=True, squeeze=False)
    panels = ax.ravel()  # row-major: top row first, then bottom row

    used = 0
    for i, (community, panel) in enumerate(zip(communities, panels[:max_panels]), start=1):
        title_dist_for_nodes(community, panel)
        panel.tick_params('x', labelrotation=90)
        panel.set_title("Community " + str(i) + "\n n = " + str(len(community)))
        panel.set_ylabel("")
        panel.set_xlabel("")
        used = i

    for panel in panels[used:]:
        panel.set_visible(False)

    fig.text(-.02, 0.5, 'Frequency', va='center', rotation='vertical', size='large')
    # fig.suptitle("Title Word Frequency by Ingredient Community")
    fig.align_xlabels()
    fig.tight_layout()

# why write functions if you just copy them and change the definition? who can stop me
# Title-word panels for the ingredient-graph communities.
plot_communities(ingr_communities)

# plt.savefig writes the current pyplot figure, i.e. the one plot_communities just created.
plt.savefig('figs/tfidf_ingrcommunity_titles.pdf')

In [None]:
fig, ax = plt.subplots(2,3, figsize= (10,7), sharey=True)

# (community number, grid cell), drawn in this order. Community numbers are
# 1-based, so community `com` is inst_communities[com - 1].
# NOTE: inst_communities_df is built in a later cell of this notebook.
panel_layout = [
    (13, (1,2)),
    (46, (1,0)),
    (14, (0,2)),
    (45, (1,1)),
    (47, (0,1)),
    (48, (0,0)),
]
for com, a in panel_layout:
    members = inst_communities[com - 1]
    title_dist_for_nodes(members, ax[a])
    ax[a].tick_params('x', labelrotation=90)
    ax[a].set_title("Community " + str(com) + "\n n = " + str(len(members)) + "\n c = " + str(round(inst_communities_df.loc[com]['average'], 4)))

for row in range(2):
    for col in range(3):
        ax[row, col].set_xlabel("")

for row in range(2):
    ax[row, 0].set_ylabel("Frequency")
plt.tight_layout()
plt.savefig('figs/tfidf_instcommunity_centrality_titles.pdf')

In [None]:
# The six community numbers that are plotted individually in this notebook.
[48,47,14,46,45,13]
# datacleaning.data_for_nodes(inst_communities[48 - 1])

In [None]:
fig, ax = plt.subplots(1,6, figsize= (12,3), sharey=True)

# (community number, panel position), drawn in this order.
# Community numbers are 1-based: community `com` is inst_communities[com - 1],
# and inst_communities_df is indexed by that same 1-based number. The average
# is therefore read from row `com`; reading row `com - 1` reported the
# neighbouring community's value under this community's title.
panel_layout = [(13, 1), (46, 4), (14, 5), (45, 2), (47, 3), (48, 0)]
for com, a in panel_layout:
    members = inst_communities[com - 1]
    average_betweenness = inst_communities_df.loc[com, 'average']
    title_dist_for_nodes(members, ax[a])
    ax[a].tick_params('x', labelrotation=90)
    ax[a].set_title(
        "Community " + str(com)
        + "\n n = " + str(len(members))
        + "\n c = " + str(round(average_betweenness, 4))
    )

In [None]:
n=10
k=5
def title_dist_for_nodes(nodes, axes, column, weights, top_k=None):
    """Bar-plot the highest weighted words from ``column`` for the recipes in ``nodes``.

    Parameters
    ----------
    nodes : node ids handed to ``datacleaning.data_for_nodes``.
    axes : matplotlib Axes to draw on.
    column : str
        Name of a column whose cells are lists of words.
    weights : pandas.DataFrame
        Table with ``word`` and ``weight`` columns; words missing from it are
        dropped by the merge.
    top_k : int, optional
        Number of words to plot. Defaults to the module-level ``k`` (previously
        a hard-coded 5 that ignored ``k``).

    Returns
    -------
    list of str
        Every retained word (not just the plotted ones).
    """
    if top_k is None:
        top_k = k
    dats = datacleaning.data_for_nodes(nodes)
    # Keep words longer than three characters that are not English stop words.
    dats['title_words'] = dats[column].apply(
        lambda x: [word for word in x if len(word) > 3 and word not in stop_words]
    )

    title_words = list(itertools.chain.from_iterable(dats["title_words"]))
    title_freqs = pd.DataFrame(list(FreqDist(title_words).items()), columns = ["word","frequency"])

    title_freqs = title_freqs.merge(weights, on = 'word')

    title_freqs['weighted_frequency'] = title_freqs['frequency'] * title_freqs['weight']
    title_freqs = title_freqs.sort_values(by = 'weighted_frequency', ascending = False).head(top_k)

    if title_freqs.empty:
        # Nothing to draw (and the norm below would be 0).
        return title_words

    # Heights are scaled by the Euclidean norm of the plotted values.
    bar_heights = title_freqs['weighted_frequency']/np.linalg.norm(title_freqs['weighted_frequency'])
    sns.barplot(x=title_freqs['word'], y=bar_heights, ax= axes, errorbar=('ci', 0))
    return title_words

def plot_communities(communities, column, weights, title, max_panels=None):
    """Draw weighted word bar charts for the first communities on a two-row grid.

    Parameters
    ----------
    communities : iterable of node collections, in display order.
    column, weights : forwarded to ``title_dist_for_nodes``.
    title : str
        Figure title. Currently unused because the ``suptitle`` call is commented out.
    max_panels : int, optional
        Maximum number of communities to draw. Defaults to the module-level ``n``.

    The grid has two rows and ``ceil(max_panels / 2)`` columns and is filled row
    by row. Panels without a community are hidden.
    """
    if max_panels is None:
        max_panels = n
    # Ceiling division, so an odd panel count gets a spare cell instead of
    # overwriting an earlier panel.
    ncols = max(1, -(-max_panels // 2))
    # squeeze=False keeps `ax` two-dimensional even when there is one column.
    fig, ax = plt.subplots(2, ncols, figsize= (7,6), sharey=True, squeeze=False)
    panels = ax.ravel()  # row-major: top row first, then bottom row

    used = 0
    for i, (community, panel) in enumerate(zip(communities, panels[:max_panels]), start=1):
        title_dist_for_nodes(community, panel, column, weights)
        panel.tick_params('x', labelrotation=90)
        panel.set_title("Community " + str(i) + "\n n = " + str(len(community)))
        panel.set_ylabel("")
        panel.set_xlabel("")
        used = i

    for panel in panels[used:]:
        panel.set_visible(False)

    fig.text(-.02, 0.5, 'Weighted Count', va='center', rotation='vertical', size='large')
    # fig.suptitle(title)
    fig.align_xlabels()
    fig.tight_layout()

# Instruction words, weighted by instructions_idf, for the instruction-graph communities.
plot_communities(inst_communities,'instruction_words', instructions_idf,title = "Instruction Words by Instruction Community")

# plt.savefig writes the current pyplot figure, i.e. the one plot_communities just created.
plt.savefig('figs/tfidf_instcommunity_instructions.pdf')

In [None]:
# Ingredient words, weighted by ingredients_idf, for the instruction-graph communities.
plot_communities(inst_communities,'ingredient_words', ingredients_idf,title = "Ingredient Words by Instruction Community")
plt.savefig('figs/tfidf_instcommunity_ingredients.pdf')

In [None]:
def count_communities(communities):
    """Return the size of every community as a list, largest first."""
    # s, count = np.unique(np.sort(lens), return_counts=True)
    return sorted((len(members) for members in communities), reverse=True)

def plot_components(data, axes):
    """Draw a histogram of the values in ``data`` on ``axes`` with bins 25 wide."""
    sns.histplot(x=data, binwidth=25, ax=axes)

# Side-by-side histograms of community sizes for the two ranked-TFIDF graphs.
fig, axes = plt.subplots(1,2, figsize= (6,3), sharex=True, sharey=True)
panels = (
    (inst_communities, "Instruction Communities"),
    (ingr_communities, "Ingredient Communities"),
)
for panel_ax, (communities, _) in zip(axes, panels):
    plot_components(count_communities(communities), axes=panel_ax)
for panel_ax, (_, panel_title) in zip(axes, panels):
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Size")
# The y axis is shared, so only the left panel is labelled.
axes[0].set_ylabel("Count")
# fig.suptitle("Size of Communities in TFIDF Graphs")

fig.tight_layout()
plt.savefig('figs/tfidf_community_sizes.pdf')

In [None]:
# Instruction words, weighted by instructions_idf, for the ingredient-graph communities.
# The title now names what is plotted (it previously said "Ingredient Words by Instruction Community").
plot_communities(ingr_communities,'instruction_words', instructions_idf,title = "Instruction Words by Ingredient Community")
plt.savefig('figs/tfidf_ingrcommunity_instructions.pdf')

In [None]:
# Ingredient words, weighted by ingredients_idf, for the ingredient-graph communities.
# The title now names what is plotted (it previously said "Ingredient Words by Instruction Community").
plot_communities(ingr_communities,'ingredient_words', ingredients_idf,title = "Ingredient Words by Ingredient Community")
plt.savefig('figs/tfidf_ingrcommunity_ingredients.pdf')

In [None]:
# How many more communities the ingredient graph has than the instruction graph.
len(ingr_communities) - len(inst_communities)

In [None]:
# Display the centrality table indexed by Id (set_index returns a new frame; the original is unchanged).
instruction_ranked_tfidf_cent.set_index('Id')

In [None]:
# Per-community summary table indexed by the 1-based community number
# (community i is inst_communities[i - 1]); columns: nodes, size, average, max.

# Index the centrality table by Id once, rather than once per community.
betweenness_by_id = instruction_ranked_tfidf_cent.set_index('Id')[['betweenness_centrality']]

community_records = []
for i, com in enumerate(inst_communities, start=1):
    member_index = pd.DataFrame({"Id": list(com)}).set_index('Id')
    com_data = member_index.join(betweenness_by_id)['betweenness_centrality']
    community_records.append({
        'community_num': i,
        'nodes': com,
        'size': len(com),
        # Builtin sum/max are kept so the values match the earlier computation exactly.
        'average': sum(com_data)/len(com),
        'max': max(com_data),
    })

inst_communities_df = pd.DataFrame(
    community_records,
    columns=['community_num', 'nodes', 'size', 'average', 'max'],
).set_index('community_num')

# Derived views kept under their previous names.
inst_communities_size = inst_communities_df[['size']]
com_betweenness = inst_communities_df[['average', 'max']].sort_values(by=['average', 'max'], ascending=False)

In [None]:
# Boolean flag marking six hand-picked community numbers; adds a column to inst_communities_df in place.
inst_communities_df['color'] = inst_communities_df.index.isin([48,47,14,46,45,13])

In [None]:
# Community size against average betweenness, coloured by the 'color' flag.
# The former leading `inst_communities_df.sort_values(by='max', ...)` was removed:
# its result was neither assigned nor displayed, so it had no effect.
fig, ax = plt.subplots()
sns.scatterplot(data=inst_communities_df, x='size', y='average', hue = 'color', legend=False, ax=ax)
ax.set_xlabel("Community Size")
ax.set_ylabel("Average Betweenness")
fig.savefig('figs/tfidf_instcommunity_betweenness.pdf')


In [None]:
# Communities with 48 < size < 54, ranked by average betweenness.
_sizes = inst_communities_df['size']
A = inst_communities_df[(_sizes > 48) & (_sizes < 54)].reset_index()

A.sort_values('average', ascending=False)

In [None]:
# Communities with 100 < size < 115, ranked by average betweenness.
_sizes = inst_communities_df['size']
A = inst_communities_df[(_sizes > 100) & (_sizes < 115)].reset_index()

A.sort_values('average', ascending=False)

In [None]:
# Display all communities ordered by average betweenness, highest first.
inst_communities_df.sort_values(by='average', ascending=False)

In [None]:

fig, ax = plt.subplots(1,4, figsize= (12,3), sharey=True)

def title_dist_for_nodes(nodes, axes):
    """Print the full title-word frequency table for ``nodes`` and bar-plot its top ten words.

    Titles are stripped of non-letters, lower-cased and split; words of three
    characters or fewer and English stop words are dropped. Bar heights are the
    counts divided by their Euclidean norm. Returns the flat list of kept words.
    """
    dats = datacleaning.data_for_nodes(nodes)
    cleaned_titles = (
        dats['title']
        .replace(r'[^a-zA-Z\s]', '', regex=True)
        .str.lower()
        .str.split()
    )
    dats['title_words'] = cleaned_titles.apply(
        lambda words: [word for word in words if len(word) > 3 and word not in stop_words]
    )

    title_words = list(itertools.chain.from_iterable(dats["title_words"]))
    title_freqs = pd.DataFrame(list(FreqDist(title_words).items()), columns = ["word","frequency"])

    # No weighting in this variant: the plotted value is the raw count.
    title_freqs['weighted_frequency'] = title_freqs['frequency']

    title_freqs = title_freqs.sort_values(by = 'weighted_frequency', ascending= False)
    print(title_freqs)
    top_words = title_freqs.head(10)
    bar_heights = top_words['weighted_frequency']/np.linalg.norm(top_words['weighted_frequency'])
    sns.barplot(x=top_words['word'], y=bar_heights, ax= axes, errorbar=('ci', 0))
    axes.tick_params('x', labelrotation=90)
    return title_words


# Title-word charts for communities 1, 7, 18 and 33, one per panel from left to right.
# The last call is the cell's final expression, so its returned word list is also displayed.
title_dist_for_nodes(inst_communities_df.loc[1]['nodes'], ax[0])
title_dist_for_nodes(inst_communities_df.loc[7]['nodes'], ax[1])
title_dist_for_nodes(inst_communities_df.loc[18]['nodes'], ax[2])
title_dist_for_nodes(inst_communities_df.loc[33]['nodes'], ax[3])

# inst_communities_df.loc[list(com_betweenness.index)[1]]['nodes']

In [None]:
# Display what datacleaning.data_for_nodes returns for the nodes of community 33.
datacleaning.data_for_nodes(inst_communities_df.loc[33]['nodes'])

In [None]:
# Tag every node with the 1-based number of its community as the "class" node attribute.
thing = {
    node: community_num
    for community_num, com in enumerate(inst_communities, start=1)
    for node in com
}
nx.set_node_attributes(graph_recipe_instruction_ranked_tfidf, thing, "class")

thing = {
    node: community_num
    for community_num, com in enumerate(ingr_communities, start=1)
    for node in com
}
nx.set_node_attributes(graph_recipe_ingredient_ranked_tfidf, thing, "class")


In [None]:
# Write both graphs back out. These are the same file names the graphs were
# read from at the top of the notebook, so the input files are overwritten.
for _graph, _path in (
    (graph_recipe_instruction_ranked_tfidf, "recipe_instruction_ranked_tfidf.gexf"),
    (graph_recipe_ingredient_ranked_tfidf, "recipe_ingredient_ranked_tfidf.gexf"),
):
    nx.write_gexf(_graph, _path)

In [None]:
G_mm = graph_recipe_instruction_ranked_tfidf

# Collapse every community ("class" node attribute) into a single node.
# - node attribute "internal_edges": summed weight of edges inside the community
# - edge attribute "weight": summed weight of edges between two communities
# NOTE(review): every (node, neighbor) pair is visited from each node's side, so
# if G_mm is undirected each edge contributes its weight twice — confirm this
# doubling is intended.
merged_G_mm = nx.Graph()
added_nodes = set()
# The attribute dict is called `node_attrs`, not `data`, so this loop no longer
# overwrites the module-level `data` DataFrame that other cells rely on.
for node, node_attrs in G_mm.nodes(data = True):
    node_class = node_attrs["class"]
    if node_class not in added_nodes:
        added_nodes.add(node_class)
        # The class may already exist as a bare node created by add_edge below;
        # add_node then just sets its counter.
        merged_G_mm.add_node(node_class, internal_edges = 0)

    for neighbor in nx.all_neighbors(G_mm, node):
        neighbor_class = G_mm.nodes[neighbor]["class"]
        edge_weight = G_mm.get_edge_data(node, neighbor).get("weight")

        if neighbor_class == node_class:
            merged_G_mm.nodes[node_class]["internal_edges"] += edge_weight
        elif merged_G_mm.has_edge(neighbor_class, node_class):
            merged_G_mm[neighbor_class][node_class]["weight"] += edge_weight
        else:
            merged_G_mm.add_edge(neighbor_class, node_class, weight = edge_weight)


In [None]:
# Save the community-level graph (merged_G_mm) as GEXF.
nx.write_gexf(merged_G_mm, "com_recipe_instruction_ranked_tfidf.gexf")

In [None]:
# An absurdly dumb way to calculate adjacency matrix difference for particular nodes
def dict_distance(dict1, dict2):
    """Sum of absolute differences between two numeric dicts.

    A key missing from one dict counts as 0 there.
    """
    return sum(
        abs(dict1.get(key, 0) - dict2.get(key, 0))
        for key in set(dict1.keys()) | set(dict2.keys())
    )


def node_distance(node, graph1, graph2):
    """Distance between the weighted neighbourhoods of ``node`` in two graphs.

    A graph that does not contain ``node`` is treated as giving it no edges
    (previously this raised ``KeyError``).
    """
    def _neighbor_weights(graph):
        if node not in graph:
            return {}
        return {n: graph[node][n]["weight"] for n in graph[node]}

    return dict_distance(_neighbor_weights(graph1), _neighbor_weights(graph2))

# node_distance(list(graph_recipe_instruction_ranked_tfidf.nodes)[1], graph_recipe_ingredient_ranked_tfidf, graph_recipe_instruction_ranked_tfidf)
from random import sample

def total_distance(G1, G2, normalizer=5000):
    """Sum of ``node_distance`` over every node of ``G1``, divided by ``normalizer``.

    Parameters
    ----------
    G1, G2 : graphs supporting ``G.nodes``, ``node in G`` and ``G[node][neighbor]["weight"]``.
    normalizer : number, optional
        Divisor applied to the summed distance. Defaults to 5000, the constant
        that was previously hard-coded.
    """
    return sum(node_distance(node, G1, G2) for node in list(G1.nodes)) / normalizer

In [None]:
# Normalised distance between graph_instruction and the ranked-TFIDF ingredient graph.
print(total_distance(graph_instruction, graph_recipe_ingredient_ranked_tfidf))

In [None]:
# Normalised distance between graph_instruction and the ranked-TFIDF instruction graph.
print(total_distance(graph_instruction, graph_recipe_instruction_ranked_tfidf))

In [None]:
# `node_distances` only ever existed as a local inside total_distance, so this
# cell raised NameError. Rebuild the per-node table here.
# NOTE(review): the graph pair mirrors the most recent total_distance call in
# this notebook — confirm it is the comparison this table was meant to show.
node_distances = pd.DataFrame(
    [
        (node, node_distance(node, graph_instruction, graph_recipe_instruction_ranked_tfidf))
        for node in graph_instruction.nodes
    ],
    columns=['Id', 'distance'],
)
node_distances.set_index('Id').join(data.set_index('Id')).sort_values(by='distance', ascending = False)

In [12]:
# Fetch the data for a single recipe id through the project helper.
Q = datacleaning.data_for_nodes(set(["hLowosT.nUfY72goOAjACSHrGaDZtjW"]))

In [13]:
# First row's instruction words, selected by position. `.iloc` says so
# explicitly; plain `[0]` on a Series without an integer label 0 relies on
# deprecated positional fallback (the echoed line in this cell's saved output
# looks like that warning).
Q['instruction_words'].iloc[0]

  (Q['instruction_words'])[0]


['whisk',
 'together',
 'bowl',
 'gradually',
 'whisk',
 'alternating',
 'make',
 'smooth',
 'batter',
 'whisk',
 'batter',
 'refrigerate',
 'least',
 'hour',
 'pour',
 'small',
 'bowl',
 'heat',
 'small',
 'skillet',
 'medium',
 'heat',
 'brush',
 'skillet',
 'using',
 'pastry',
 'brush',
 'ladle',
 'enough',
 'batter',
 'cover',
 'bottom',
 'skillet',
 'swirl',
 'skillet',
 'cover',
 'bottom',
 'completely',
 'cook',
 'crepe',
 'turns',
 'golden',
 'brown',
 'bottom',
 'flip',
 'cook',
 'side',
 'small',
 'brown',
 'spots',
 'repeat',
 'remaining',
 'batter',
 'brushing',
 'skillet',
 'prevent',
 'sticking',
 'set',
 'cooked',
 'aside',
 'layers',
 'waxed',
 'paper',
 'preheat',
 'oven',
 'degrees',
 'degrees',
 'mix',
 'bowl',
 'thoroughly',
 'combined',
 'spread',
 'bottom',
 'xinch',
 'baking',
 'dish',
 'place',
 'crepe',
 'onto',
 'work',
 'surface',
 'spoon',
 'line',
 'center',
 'crepe',
 'roll',
 'crepe',
 'set',
 'pan',
 'repeat',
 'remaining',
 'laying',
 'filled',
 'pan',


In [None]:
# Row at position 2024. The trailing empty `[]` was a syntax error; add a
# column label after `.iloc[2024]` to pick a single field.
data.iloc[2024]

In [None]:
# Display whatever is currently bound to `data`.
data