In [1]:
import pandas as pd
import numpy as np

# Load the per-word IDF weights. The file is read without a header row: its
# first column becomes the index (the word) and the remaining column is
# named 'IDF'.
# NOTE(review): the displayed sample contains 'siteÃ­s', which looks like
# UTF-8 text decoded as ISO-8859-1 -- confirm the file's real encoding.
idf_vector = pd.read_csv(
    "/Users/kae/CS341/data/training_idf_vector.csv",
    names=['IDF'],
    header=None,
    index_col=0,
    encoding="ISO-8859-1",
)
idf_vector

Unnamed: 0,IDF
3,7.313554
cranberries,7.313554
siteÃ­s,7.313554
maintainable,7.313554
fermented,7.313554
raspberry,7.313554
pomegranate,7.313554
juice,7.313554
chinesespeaking,7.313554
loop,7.313554


In [2]:
# Unpack the IDF frame into plain Python containers so that later cells can
# do fast membership tests and weight lookups.

# One entry per row of idf_vector; each entry is that row's values as a
# NumPy array (so the weight itself is element [0] of the entry).
idf_values = [row for row in idf_vector.values]

# The index labels (the words), in the same row order as idf_values.
idf_words = [word for word in idf_vector.index.values]

# Set of known words for membership tests, and word -> row-values lookup.
idf_set = set(idf_words)
idf_map = {word: value for word, value in zip(idf_words, idf_values)}


In [3]:
import pandas as pd

# Load the labelled companies and allocate a square (company x company)
# float matrix. Every cell starts at -1; a later cell overwrites the pairs it
# scores and resets the remaining negative entries to 0.
company_list = pd.read_csv(
    '/Users/kae/CS341/data/category_training_labeled_fixed.csv',
    encoding="ISO-8859-1",
)
n_companies = company_list.shape[0]
company_graph = np.full((n_companies, n_companies), -1.0)
company_graph

array([[-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       ..., 
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.],
       [-1., -1., -1., ..., -1., -1., -1.]])

In [4]:
import string
import nltk
# get_words needs two NLTK resources: the 'punkt' models used by
# word_tokenize and the 'stopwords' corpus read via nltk.corpus.stopwords.
# Fetch both so the notebook also runs on a machine without cached NLTK data.
nltk.download('punkt')
nltk.download('stopwords')

#Gets the words out of the labeled descriptions
def get_words(df):
    """Return the distinct non-stop-word tokens found in a Series of text.

    Parameters
    ----------
    df : pandas.Series
        Text values to tokenise. Despite the name, the `.str` accessor is
        used, so a Series (e.g. one row slice of a frame) is expected.

    Returns
    -------
    set of str
        Lower-cased tokens with punctuation stripped and English stop words
        (from the NLTK stopwords corpus) removed.
    """
    # Raw string: \w and \s are regex character classes, not string escapes.
    punctuation = r'[^\w\s]'
    # regex=True must be explicit: from pandas 2.0 on, Series.str.replace
    # treats the pattern as a literal string by default, which would leave
    # all punctuation in place.
    txt = (
        df.str.lower()
        .str.replace(punctuation, ' ', regex=True)
        .str.cat(sep=' ')
    )
    stopwords = set(nltk.corpus.stopwords.words('english'))
    words = nltk.tokenize.word_tokenize(txt)
    return set(words) - stopwords

[nltk_data] Downloading package punkt to /Users/kae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Tokenise each company's text once, up front. Columns [5, 7) of every row are
# passed to get_words -- presumably the description columns; verify against
# the CSV layout.
start_index, end_index = 5, 7
company_words_list = []
for i in range(len(company_list)):
    company_words = get_words(company_list.iloc[i, start_index:end_index])
    company_words_list.append(company_words)

In [6]:
def get_companies(target_word, company_words_list):
    """Return the indices of the companies whose word set contains a word.

    Parameters
    ----------
    target_word : str
        Word to look for.
    company_words_list : sequence of set
        One collection of words per company; the position in the sequence is
        the company's index.

    Returns
    -------
    list of int
        Indices of every entry that contains `target_word`. The list is built
        from a set, so no particular ordering is promised.
    """
    matching = {
        index
        for index, words in enumerate(company_words_list)
        if target_word in words
    }
    return list(matching)

In [7]:
def get_similarity(company_index_1, company_index_2, company_words_list, idf_set, idf_map):
    """IDF-weighted Jaccard similarity between two companies' word sets.

    The score is (sum of IDF weights of the shared words) divided by
    (sum of IDF weights of all words used by either company). Words that are
    not in `idf_set` contribute to neither sum.

    Parameters
    ----------
    company_index_1, company_index_2 : int
        Positions of the two companies in `company_words_list`.
    company_words_list : sequence of set
        One set of words per company.
    idf_set : set
        Words that have an IDF weight.
    idf_map : mapping
        word -> indexable whose element [0] is the word's IDF weight.

    Returns
    -------
    number
        The similarity, or 0 when there is nothing to compare: both word sets
        are empty, or none of their words has an IDF weight.
    """
    company_1 = company_words_list[company_index_1]
    company_2 = company_words_list[company_index_2]
    intersection = company_1 & company_2

    intersection_score = 0.0
    union_score = 0.0
    for word in company_1 | company_2:
        if word not in idf_set:
            continue
        word_score = idf_map[word][0]
        union_score += word_score
        if word in intersection:
            intersection_score += word_score

    # Guard the division: a non-empty union whose words are all unknown to the
    # IDF table leaves the denominator at zero and used to raise
    # ZeroDivisionError.
    if union_score == 0:
        return 0
    return intersection_score / union_score

def fakeget_similarity(company_index_1, company_index_2, company_list, idf_vector):
    """Compute the IDF-weighted similarity of two companies from raw inputs.

    Unlike get_similarity, this variant re-tokenises both companies' text with
    get_words and rebuilds the set of known words from `idf_vector` on every
    call, so it is considerably more expensive per pair.

    Parameters
    ----------
    company_index_1, company_index_2 : int
        Row positions of the two companies in `company_list`.
    company_list : pandas.DataFrame
        Company table; columns [5, 7) of a row are passed to get_words.
    idf_vector : pandas.DataFrame
        Indexed by word; the first column holds the IDF weight.

    Returns
    -------
    number
        Shared-word weight divided by total-word weight, or 0 when no word of
        either company has an IDF weight.
    """
    company_1 = get_words(company_list.iloc[company_index_1, 5:7])
    company_2 = get_words(company_list.iloc[company_index_2, 5:7])
    intersection = company_1 & company_2
    idf_set = set(idf_vector.index.values)

    intersection_score = 0.0
    union_score = 0.0
    for word in company_1 | company_2:
        if word not in idf_set:
            continue
        # DataFrame.ix was removed in pandas 1.0. A list-based .loc always
        # yields a frame, so .iloc[0, 0] is the first matching row's first
        # column even if a word appears more than once in the index.
        word_score = idf_vector.loc[[word]].iloc[0, 0]
        union_score += word_score
        if word in intersection:
            intersection_score += word_score

    # Avoid dividing by zero when none of the words has an IDF weight.
    if union_score == 0:
        return 0
    return intersection_score / union_score

In [8]:
# Score every unordered pair of companies once and mirror each weight that
# reaches the cutoff, so the matrix stays symmetric.
n_updated_elements = 0
cutoff = 0.1
n_companies = len(company_list)
for i in range(n_companies):
    for k in range(i + 1, n_companies):
        edge_weight = get_similarity(i, k, company_words_list, idf_set, idf_map)
        if edge_weight >= cutoff:
            company_graph[i, k] = company_graph[k, i] = edge_weight

# Zero the diagonal, then raise every remaining negative entry to 0.
np.fill_diagonal(company_graph, 0)
np.maximum(company_graph, 0, out=company_graph)

In [37]:
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib import pylab
# Build an undirected, weighted graph with one node per company domain.
G = nx.Graph()
domains = company_list['domain'].values
G.add_nodes_from(domains)

# Edges must use the same node identifiers as the nodes added above (the
# domain strings). Adding them by integer position would silently create a
# second, unlabeled set of nodes and leave every domain node isolated.
# NOTE(review): companies sharing a domain collapse into one node -- confirm
# that domains are unique.
# company_graph is filled symmetrically, so scanning the strict upper triangle
# visits each pair once and never produces a self-loop.
rows, cols = np.nonzero(np.triu(company_graph, k=1))
for i, k in zip(rows, cols):
    G.add_edge(domains[i], domains[k], weight=company_graph[i, k])


def save_graph(graph, file_name, figsize=(200, 200), dpi=80):
    """Draw `graph` with a spring layout and save the picture to `file_name`.

    Parameters
    ----------
    graph : networkx graph
        Graph to draw; nodes are labelled with their identifiers.
    file_name : str
        Output path handed to matplotlib's savefig.
    figsize : tuple of (float, float), optional
        Figure width and height in inches.
    dpi : int, optional
        Figure resolution.

    The figure is always closed before returning, even if drawing or saving
    raises.
    """
    # Keep a handle on the figure created here instead of re-selecting
    # "figure 1", which is a different figure whenever others are still open.
    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        plt.axis('off')
        pos = nx.spring_layout(graph, k=0.9, iterations=5)
        nx.draw_networkx_nodes(graph, pos, node_color='g', node_size=30, linewidths=0)
        nx.draw_networkx_edges(graph, pos, edge_color='b')
        # font_size is the label-size argument; the previous 'label_size'
        # keyword is not part of draw_networkx_labels.
        nx.draw_networkx_labels(graph, pos, font_size=30)

        # Fit the axes to the real extent of the layout. Coordinates are not
        # guaranteed to be non-negative, so a fixed lower bound of 0 can clip
        # part of the drawing. An empty graph has no positions to fit.
        if pos:
            xs = [xy[0] for xy in pos.values()]
            ys = [xy[1] for xy in pos.values()]
            plt.xlim(min(xs), max(xs))
            plt.ylim(min(ys), max(ys))

        plt.savefig(file_name, bbox_inches="tight")
    finally:
        plt.close(fig)


# Write the company graph to disk. Other formats (.svg, .png or .ps) can be
# used instead of .pdf.
save_graph(graph=G, file_name="3K_graph.pdf")

In [36]:
# Inspect the 'domain' column -- the values used as node names when the graph
# is built.
company_list['domain'].values

array(['conferencecloud.co', 'terminus.com', 'galileoprocessing.com', ...,
       'kupuhawaii.org', 'whalepath.com', 'satellitesolutionsworldwide.com'], dtype=object)