In [188]:
import pandas as pd 
import logging
from gensim.models import TfidfModel
from nltk.corpus import stopwords
from nltk import download
from gensim.similarities import SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
import gensim.downloader as api
import pickle 
from gensim.corpora import Dictionary
from tqdm import tqdm

In [None]:

# ['Title', 'Type', 'Sector','Key words', 'Problem/Opportunity', 
#           'Description', 'Added Value','Impact']

columns = ["Key words","Title","Description"]


path = "./dependencies/"


In [176]:
# Initialize logging.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Import and download stopwords from NLTK.
download('stopwords')  # Download stopwords list.
stop_words = stopwords.words('english')
portuguese = stopwords.words('portuguese')
stop_words.extend(portuguese)


def file_path(column, variable_name, path=path):
    return path+"".join(column.split())+"_"+variable_name+".pickle"


df = pd.read_excel('./Example of the original database (1).xlsx')
df = df.iloc[:, :11]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hhich\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [177]:
df.head()

Unnamed: 0,First Name,Last Name,Employee Name,Title,Type,Sector,Key words,Problem/Opportunity,Description,Added Value,Impact
0,Preston,Crawford,Preston Crawford,Brand Websites & Web Pages,Product/Service,Marketing Digital,"Website, Business, commerce",grow your business and increase leads,A confident online presence is essential and c...,"Business growth, confidance of the clients, be...","Profit growth, customer's loyalty"
1,Violet,Phillips,Violet Phillips,Microsites & Topic Hubs,Product/Service,Marketing Digital,"Microsite, Business, contents",focuses on branded content or a single topic ...,A microsite is a website separate from your ma...,"Being more professional, details, innovative",specific contents and well organized business
2,Frederick,Walker,Frederick Walker,Blogs & Blog Posts,Product/Service,Marketing Digital,"Blog, contents, search engine",connect with your customers and answer their p...,The main purpose of a blog is to connect with ...,blog with high-quality and relevant blog posts...,source of content that can be repurposed into ...
3,Wilson,Henderson,Wilson Henderson,Videos,Product/Service,Marketing Digital,"Video, contents, audiance","explainer, onboarding, promotional, social, a...",Another common digital marketing idea is to in...,a versatile and shareable tool to reach their ...,being efficient and convenient
4,Thomas,Bailey,Thomas Bailey,Ebooks,Product/Service,Marketing Digital,"Ebook, Digital, audiance",They can be downloadable gifts to prospects an...,EBooks are a great way to strengthen your bran...,strengthen your brand and reach a broad audience,feels like a personal object of value you’re ...


# creating the models

In [12]:
model = api.load('word2vec-google-news-300')

2022-12-29 23:41:31,456 : INFO : loading projection weights from C:\Users\hhich/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2022-12-29 23:42:57,799 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from C:\\Users\\hhich/gensim-data\\word2vec-google-news-300\\word2vec-google-news-300.gz', 'binary': True, 'encoding': 'utf8', 'datetime': '2022-12-29T23:42:57.798112', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'load_word2vec_format'}


In [179]:
def preprocess(sentence,stop_words=stop_words):
    sentence = str(sentence)
    return [w for w in sentence.lower().split() if w not in stop_words]

def save_variable(column,variable_name,variable): 
    p = file_path(column,variable_name,path = path)
    with open(p,"wb") as f :
        pickle.dump(variable,f)
        
        
def createtheAI(column,model=model):
    sentences = df[column].values
    processed_sentences = []
    for sentence in sentences : 
        processed_sentences.append(preprocess(sentence))
    # Define dictionary and create bag of words
    dictionary = Dictionary(processed_sentences)
    bow = [dictionary.doc2bow(sentence) for sentence in processed_sentences]
    # Creating the Term Frequency - Inverse Document Frequency
    tfidf = TfidfModel(bow)
    tfidf_sentences = [tfidf[sentence] for sentence in bow]
    # Term Indexing and Similarity Matrix
    termsim_index = WordEmbeddingSimilarityIndex(model)
    termsim_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, tfidf)
    # Saving the envirenmental variables
    save_variable(column, "termsim_matrix", termsim_matrix)
    save_variable(column,"tfidf",tfidf)
    save_variable(column,"dictionary",dictionary)

> creating the Ai here

In [62]:
# Creating and saving the AI here
# The columns list contains a list of the columns that we are going to use
# Check the first cell to find more info about the columns variable
for i in columns:
    # Calling the function to create the AI here
    # All variables are saved in the dependencies folder
    createtheAI(i)

2022-12-30 00:29:36,465 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-12-30 00:29:36,469 : INFO : built Dictionary(14 unique tokens: ['business,', 'commerce', 'website,', 'contents', 'microsite,']...) from 400 documents (total 411 corpus positions)
2022-12-30 00:29:36,470 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(14 unique tokens: ['business,', 'commerce', 'website,', 'contents', 'microsite,']...) from 400 documents (total 411 corpus positions)", 'datetime': '2022-12-30T00:29:36.470750', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22621-SP0', 'event': 'created'}
2022-12-30 00:29:36,471 : INFO : collecting document frequencies
2022-12-30 00:29:36,472 : INFO : PROGRESS: processing document #0
2022-12-30 00:29:36,474 : INFO : TfidfModel lifecycle event {'msg': 'calculated IDF weights for 400 documents and 14 features (411 matrix non-zeros)', 'datetime': '2022-12-30T0

# similarity between 2 ideas

In [80]:
# loads variables 
def load_variables(column):
    l = ["termsim_matrix","tfidf","dictionary"]
    paths = []
    for variable_name in l :
        paths.append(file_path(column, variable_name))
    with open(paths[0],"rb") as f :
        termsim_matrix = pickle.load(f)
    with open(paths[1],"rb") as f :
        tfidf = pickle.load(f)
    with open(paths[2],"rb") as f :
        dictionary = pickle.load(f)
        
    return termsim_matrix, tfidf, dictionary

# preprocessing the input


def prepare_input(s, dictionary, tfidf):
    precessed_input = preprocess(s)
    bow_input = dictionary.doc2bow(precessed_input)
    tfidf_input = tfidf[bow_input]
    return tfidf_input


def calculate_similarity(s1, s2, column):
    termsim_matrix, tfidf, dictionary = load_variables("Title")
    in1 = prepare_input(s1, dictionary, tfidf)
    in2 = prepare_input(s2, dictionary, tfidf)
    similarity = termsim_matrix.inner_product(
        in1, in2, normalized=(True, True))
    return similarity


def similarity_between_two_rows(idx1, idx2, available_columns=columns):
    sim = 0
    for column in available_columns:
        s1 = df.loc[idx1, column]
        s2 = df.loc[idx2, column]
        sim += calculate_similarity(s1, s2, column)
    sim = sim/len(available_columns)
    return sim


In [None]:
n = len(df)
d = {}
for i in tqdm(range(0,n-1)):
    for j in range(i+1,n):
        s = similarity_between_two_rows(i,j,available_columns=columns)
        d[f"{i},{j}"] = s
with open(path+'d.pickle','wb') as f : 
    pickle.dump(d,f)

In [184]:
# loads the dictionary that contains the similarity coefficients
def load_sim_dictionary():
    with open(path+'d.pickle','rb') as f : 
        d = pickle.load(f)
    return d

# Outputs keys for the similar ideas
def similar_ideas(thresh=0.1):
    d = load_sim_dictionary()
    v = list(d.values())
    k = list(d.keys())
    l = []
    for i in range(len(v)): 
        if v[i] > thresh : 
            l.append(k[i])
    return l 


# Outputs the names of the users that have similar ideas
def users_with_sim_ideas(thresh):
    l = similar_ideas(thresh)
    names = []
    for i in l :
        tmp = [] 
        indexes = i.split(',')
        emp1 = df.loc[int(indexes[0]),'Employee Name']
        emp2 = df.loc[int(indexes[1]),'Employee Name']
        tmp = [emp1,emp2]
        names.append(tmp)
    return names     

> using the AI here

In [161]:
d = load_sim_dictionary()

In [185]:
# using the previewsly created AI
# this is a small example of how to calculate similarity
column = 'Title'
s1 = 'Social Media Marketing'
s2 = 'Launch Social Media Audience Lookalike Ads',
calculate_similarity(s1,s2,column)

0.43527937

In [186]:
k = list(d.keys())
v = list(d.values())
m = max(v)
idx = v.index(m)
max_combination = k[idx]
print(f"""
maximum similarity coeff is {m:.4} 
which can be found when comaparing the lines { max_combination } of the dataset""")



maximum similarity coeff is 0.5531 
which can be found when comaparing the lines 207,215 of the dataset


In [187]:
users_with_sim_ideas(0.4)

[['Rosie Bailey', 'Paul Allen'],
 ['Luke Williams', 'Steven Walker'],
 ['Gianna Scott', 'Thomas Stewart'],
 ['Catherine Grant', 'Eric Perry'],
 ['Richard Cameron', 'Paul Allen'],
 ['Richard Morris', 'Alen Johnston'],
 ['David Edwards', 'Melissa Nelson'],
 ['Chester Myers', 'Ted Adams'],
 ['Miranda Stewart', 'Kirsten Kelly'],
 ['Kellan West', 'Charlie Richardson'],
 ['Violet Alexander', 'Frederick Lloyd'],
 ['Violet Alexander', 'Amy Wright'],
 ['Lucy Owens', 'Rosie Clark'],
 ['Miley Nelson', 'Kelsey Higgins'],
 ['Bruce Anderson', 'Miranda Warren'],
 ['Bruce Anderson', 'Charlie Thompson'],
 ['Amelia Fowler', 'Edgar Stewart'],
 ['Sam Carter', 'Caroline Martin'],
 ['James Sullivan', 'Adelaide Adams'],
 ['Miranda Warren', 'Charlie Thompson'],
 ['Kirsten Ellis', 'Lenny Mason']]