In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx

# Six hex colours ordered from dark red, through orange and light blue, to dark blue.
colors_6 = [
    "#7d1407",
    "#c9583a",
    "#f5a659",
    "#62c6da",
    "#1a85b3",
    "#085991",
]
# Five-colour variant: the same sequence without its first (darkest red) entry.
colors_5 = colors_6[1:]

In [51]:
def _monthly_counts_by(frame, column):
    '''Count the rows of frame per month number and per value of column.

    frame must have a datetime-like 'timestamp' column (the .dt accessor is used on it).
    Returns a DataFrame with the columns 'month', column and 'count'.
    '''
    month = frame['timestamp'].dt.month.rename('month')
    return frame.groupby([month, column]).size().reset_index(name='count')


def plot_through_time(info1, info2, info3):
    '''Draw three side-by-side line plots of the monthly number of messages.

    info1 must be 'sender_dpt' or 'receiver_dpt', info2 must be 'sender_loc' or
    'receiver_loc' and info3 must be 'sender_age' or 'receiver_age'. Each argument
    names the column of the global messages DataFrame that is used as the hue
    (one line per value) of the corresponding panel, from left to right.

    Months are taken as the month number of 'timestamp', so the same month of
    different years is counted together.
    '''
    columns = (info1, info2, info3)
    # Build every table before opening the figure so that a bad column name
    # does not leave an empty figure behind.
    tables = [_monthly_counts_by(messages, column) for column in columns]

    fig, axes = plt.subplots(1,3, figsize=(15,5))
    for ax, column, table in zip(axes, columns, tables):
        sns.lineplot(data = table, ax=ax, x = 'month', y='count', hue = column, palette = "RdBu")
    plt.show()

In [52]:
def diff_countries():
    '''Plot the monthly number of messages whose 'sender_loc' differs from 'receiver_loc'.

    Reads the global messages DataFrame and draws one line per 'sender_loc' value.
    Months are the month number of the 'timestamp' column.
    '''
    is_cross_location = messages['sender_loc'] != messages['receiver_loc']
    cross_location = messages[is_cross_location][['sender_loc', 'timestamp']]
    by_time = cross_location.set_index('timestamp')
    monthly = (
        by_time.groupby([by_time.index.month, 'sender_loc'])
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(columns = {'timestamp' : 'month', 0:'count'})
    )
    ax = sns.lineplot(data = monthly, x = 'month', y='count', hue = 'sender_loc', palette = "RdBu")
    ax.set_title('Messages sent where the sender and the receiver\n are from the different countries')
    plt.show()

The `x` DataFrame holds the sender and receiver IDs; duplicates are dropped so that each connection is counted only once, as mentioned before. The `y` DataFrame is the same data with the two columns swapped, so the receiver's ID is on the left. Appending `y` to `x` (and dropping duplicates again) gives a connections DataFrame in which every employee in the left column (A) has a connection to the employee in the right column (B).

In [2]:
def emp_interactions(df, string = ''):
    '''Plot two top-5 bar charts about who talks to whom.

    Left panel: the five senders that wrote to the largest number of distinct
    receivers. Right panel: the five employees with the largest number of
    distinct contacts, counting a contact whether the employee was the sender
    or the receiver.

    df: DataFrame with 'sender' and 'receiver' columns.
    string: text appended to both panel titles.
    '''
    sns.set_style('whitegrid')
    # One row per distinct (sender, receiver) pair, so repeated messages count once.
    x = df[['sender','receiver']].drop_duplicates()
    # Name the id and count columns explicitly: the column name that
    # value_counts().to_frame() produces differs between pandas 1.x and 2.x.
    x_graph = (
        x['sender'].value_counts(sort=True).head(5)
        .rename_axis('employee')
        .reset_index(name='receivers')
    )
    fig, axes = plt.subplots(1,2, figsize=(15,5))

    ax = sns.barplot(data = x_graph, x = 'employee', y = 'receivers', ax=axes[0],
                     order = x_graph['employee'].tolist())
    ax.set(xlabel="Employee ID", ylabel = "messages send to different employees")
    title1 = 'Top-5 employees that send messages to\n different employees ' + string
    ax.set_title(title1)

    # y is x with the roles swapped; stacking both makes every connection
    # appear once from each participant's side.
    y = x.rename(columns = {'receiver' : 'sender', 'sender' : 'receiver'})
    connections = (
        pd.concat([x, y])
        .drop_duplicates()
        .rename(columns = {'sender' : 'A', 'receiver' : 'B'})
    )
    connect_graph = (
        connections['A'].value_counts(sort=True).head(5)
        .rename_axis('employee')
        .reset_index(name='connections')
    )
    ax = sns.barplot(data = connect_graph, x = 'employee', y = 'connections', ax = axes[1],
                     order = connect_graph['employee'].tolist())
    ax.set(xlabel="Employee ID", ylabel = "Connections")
    title2 = 'Top-5 connected employees ' + string
    ax.set_title(title2)
    plt.show()

In [64]:
def employee_z_through_time(z):
    '''Bar chart of how many messages employee z (an integer id) sent in each month.

    Rows are taken from the global messages DataFrame; months are the month
    number of its 'timestamp' column.
    '''
    sns.set_style('whitegrid')
    fig, ax = plt.subplots(figsize=(5,5))
    sent_by_z = messages[messages['sender'] == z][['sender', 'timestamp']]
    by_time = sent_by_z.set_index('timestamp')
    monthly = (
        by_time.groupby(by_time.index.month)
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(columns = {'timestamp' : 'month', 0:'count'})
    )
    ax = sns.barplot(data = monthly, x = 'month', y='count', palette = "RdBu")
    ax.set_title('Messages sent by employee ' + str(z) + ' through time')
    plt.show()

In [55]:
def plot_interaction(df,info1,info2):
    '''Bubble chart of how often each (info1, info2) combination occurs in df.

    info1 is drawn on the y-axis and info2 on the x-axis; both marker size and
    colour encode the number of rows with that combination.
    '''
    pair_counts = df[[info1,info2]].groupby([info1,info2], as_index=False).value_counts()

    sns.set_style('white')
    fig, ax = plt.subplots(figsize=(6,6))
    sns.scatterplot(
        data = pair_counts,
        x=info2,
        y=info1,
        size = 'count',
        hue = 'count',
        palette="RdBu",
        sizes=(1,600),
        ax = ax,
    )
    # Anchor the legend outside the plotting area, to the right.
    ax.legend(bbox_to_anchor=(1.2,1), loc='upper right', borderaxespad = 0)
    plt.show()

In [56]:
def plot_network(df, info1, info2, number_messages, seed):
    '''Draw the directed graph of the frequent info1 -> info2 pairs in df.

    df: DataFrame containing the columns info1 and info2.
    info1, info2: column names used as edge source and edge target.
    number_messages: a pair becomes an edge only when it occurs strictly more
        often than this.
    seed: passed to nx.random_layout so that the node placement is repeatable.
    '''
    # Keep the row order produced by groupby(...).value_counts(): edges are
    # inserted in that order, which fixes the node order used by the layout.
    pair_counts = df[[info1,info2]].groupby(info1).value_counts()
    strong_connections = pair_counts[pair_counts > number_messages]
    edge_list = list(zip(strong_connections.index.get_level_values(0),
                         strong_connections.index.get_level_values(1)))
    G = nx.DiGraph()
    G.add_edges_from(edge_list)
    fig, ax = plt.subplots()
    pos = nx.random_layout(G, seed = seed)
    nx.draw_networkx_nodes(G, pos, ax=ax)
    nx.draw_networkx_labels(G, pos, ax=ax)
    # An edge whose opposite direction also exists is drawn curved so that the
    # two arrows do not lie on top of each other.
    curved_edges = [(u, v) for u, v in G.edges() if G.has_edge(v, u)]
    straight_edges = [(u, v) for u, v in G.edges() if not G.has_edge(v, u)]
    nx.draw_networkx_edges(G, pos, ax=ax, edgelist=straight_edges)
    nx.draw_networkx_edges(G, pos, ax=ax, edgelist=curved_edges, connectionstyle='arc3, rad = 0.1')
    plt.show()

In [57]:
def ploting_id_x_month(hue, col, param = 1.35, pal = "RdBu"):
    '''Scatter the ids that sent messages in each month, coloured by an employee attribute.

    hue: column of the global messages DataFrame that is grouped together with
        month and sender; one point is drawn per (month, sender, hue) group.
    col: column of the global employees DataFrame whose value colours each point
        (looked up through employees['id']).
    param: horizontal bbox_to_anchor position of the legend.
    pal: palette passed to seaborn.

    The month coordinate gets a small horizontal jitter so that points do not
    overlap; the jitter is reproducible and does not touch NumPy's global
    random state.
    '''
    sns.set_style('white')
    fig, ax = plt.subplots(figsize=(5,5))
    msg = messages[['sender',hue,'timestamp']].set_index('timestamp')
    dt = (
        msg.groupby([msg.index.month, 'sender', hue])
        .value_counts()
        .to_frame()
        .reset_index()
        .rename(columns = {'timestamp' : 'month', 'sender':'id'})
    )
    # Work on an explicit copy so the column assignments below are not chained
    # assignments into a slice.
    dt = dt[['month','id']].copy()
    dict_used = dict(zip(employees['id'],employees[col]))
    dt[col] = dt['id'].map(dict_used)
    # A local generator seeded with 0 yields the same draws as
    # np.random.seed(0) followed by np.random.normal, without reseeding globally.
    rng = np.random.RandomState(0)
    s = rng.normal(0, 0.1, dt.shape[0])
    dt['month'] = dt['month'] + s
    sns.scatterplot(data = dt, x = 'month', y='id', hue = col, palette = pal, ax = ax)
    ax.legend(bbox_to_anchor=(param,1), loc='upper right', borderaxespad = 0)
    ax.set_title('Ids that sended messages')
    plt.show()

In [63]:
def dpt_to_emp(mes):
    '''Print, for each sender department, how many distinct receivers it reached.

    mes: DataFrame with 'receiver' and 'sender_dpt' columns; it is not modified.

    Receivers whose id is not greater than 0 are counted in a separate row of
    the same department. At most six rows are printed. Returns None.
    '''
    # One row per distinct (receiver, sender department) pair.
    msg = mes[['receiver','sender_dpt']].drop_duplicates()
    # The helper column must not be called 'count': pandas 2.x gives that name
    # to the value_counts() result and reset_index() would then fail.
    msg = msg.assign(_positive_receiver = np.where(msg['receiver'] > 0, 1, 0))
    x = (
        msg[['sender_dpt', '_positive_receiver']]
        .groupby('sender_dpt')
        .value_counts()
        .rename('number of employees reached')
        .reset_index()
    )
    y = x[['sender_dpt', 'number of employees reached']]
    print(y.head(6))

In [65]:
def plot_dpt_to_emp(mes, ex = "sender_dpt", string=""):
    '''Bar chart of the number of distinct receivers reached, split by receiver department.

    mes: DataFrame with 'receiver', 'receiver_dpt' and the column named by ex;
        it is not modified.
    ex: column shown on the x-axis.
    string: text appended to the plot title.
    '''
    # One row per distinct (receiver, ex, receiver department) combination.
    msg = mes[['receiver', ex, 'receiver_dpt']].drop_duplicates()
    # The helper column must not be called 'count': pandas 2.x gives that name
    # to the value_counts() result and reset_index() would then fail.
    msg = msg.assign(_positive_receiver = np.where(msg['receiver'] > 0, 1, 0))
    x = (
        msg[[ex, 'receiver_dpt', '_positive_receiver']]
        .groupby([ex, 'receiver_dpt'])
        .value_counts()
        .rename('number of employees reached')
        .reset_index()
    )
    y = x[[ex, 'receiver_dpt', 'number of employees reached']]
    sns.set_style('whitegrid')
    sns.barplot(data=y, x=ex, y='number of employees reached', hue="receiver_dpt", palette="RdBu")
    plt.legend(bbox_to_anchor=(1.3,1), loc='upper right', borderaxespad = 0)
    # NOTE(review): the x label is fixed even when ex is not a sender column, and
    # the title speaks of messages while the bars count distinct receivers - confirm wording.
    plt.xlabel("sender department")
    title = "number of messages sent by department" + string
    plt.title(title)
    plt.show()

In [60]:
def messages_sent(mes):
    '''Print the five senders with the most messages.

    mes: DataFrame with a 'sender' column.

    The printed table has the columns 'sender id' and 'number of messages sent',
    ordered by decreasing number of messages. Returns None.
    '''
    # Name both columns explicitly instead of renaming the defaults, which
    # differ between pandas 1.x ('index', 'sender') and 2.x ('sender', 'count').
    y = (
        mes['sender'].value_counts(sort=True)
        .rename_axis('sender id')
        .reset_index(name='number of messages sent')
    )
    print(y.head(5))

In [61]:
def highly_connected(mes, n):
    '''Print, per sender department, the percentage of its senders with at least n messages.

    mes: DataFrame with 'sender' and 'sender_dpt' columns.
    n: minimum number of messages for a sender to count as strongly influent.

    The percentage is relative to the number of distinct senders of the same
    department and is rounded to two decimals. Departments without any sender
    that reaches n are not listed. Returns None.
    '''
    # Number of messages per (department, sender).
    sent = mes.groupby(['sender_dpt', 'sender']).size()
    senders_per_dpt = sent.groupby(level='sender_dpt').size()
    strong_per_dpt = sent[sent >= n].groupby(level='sender_dpt').size()
    # Divide department by department (aligned on the department label), not
    # by row position: the two tables differ in length when a department has
    # no strong sender.
    share = 100 * strong_per_dpt / senders_per_dpt.reindex(strong_per_dpt.index)
    per_df = (
        share.round(decimals=2)
        .rename('% of strongly influent employees')
        .reset_index()
    )
    print(per_df)

In [1]:
def plot_interactions_and_network(df,info1,info2, number_messages, seed):
    '''Draw a bubble chart of (info1, info2) counts next to the graph of the frequent pairs.

    df: DataFrame containing the columns info1 and info2.
    info1, info2: column names; info1 is the y-axis / edge source and info2 the
        x-axis / edge target.
    number_messages: a pair becomes a graph edge only when it occurs strictly
        more often than this.
    seed: passed to nx.random_layout so that the node placement is repeatable.
    '''
    sns.set_style('white')
    edges = df[[info1,info2]]
    data = edges.groupby([info1,info2], as_index=False).value_counts()
    fig, ax = plt.subplots(1,2, figsize=(15,6))

    # Left panel: marker size and colour both encode the pair count.
    sns.scatterplot(ax = ax[0], x=info2, y=info1, data = data, size = 'count', hue = 'count', palette="RdBu", sizes=(1,600))
    ax[0].legend(bbox_to_anchor=(1.16,1), loc='upper right', borderaxespad = 0)

    # Right panel. Keep the row order produced by groupby(...).value_counts():
    # edges are inserted in that order, which fixes the node order used by the layout.
    pair_counts = edges.groupby(info1).value_counts()
    strong_connections = pair_counts[pair_counts > number_messages]
    edge_list = list(zip(strong_connections.index.get_level_values(0),
                         strong_connections.index.get_level_values(1)))
    G = nx.DiGraph()
    G.add_edges_from(edge_list)

    pos = nx.random_layout(G, seed = seed)
    nx.draw_networkx_nodes(G, pos, ax=ax[1])
    nx.draw_networkx_labels(G, pos, ax=ax[1])
    # An edge whose opposite direction also exists is drawn curved so that the
    # two arrows do not lie on top of each other.
    curved_edges = [(u, v) for u, v in G.edges() if G.has_edge(v, u)]
    straight_edges = [(u, v) for u, v in G.edges() if not G.has_edge(v, u)]
    nx.draw_networkx_edges(G, pos, ax=ax[1], edgelist=straight_edges)
    nx.draw_networkx_edges(G, pos, ax=ax[1], edgelist=curved_edges, connectionstyle='arc3, rad = 0.1')
    plt.show()