# Code 2: When input is the Company Website along with Description
    
In this case, the description for the input company website may or may not already be present in the dataset.

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from textblob import Word

# Function to clean the description of companies
def description_cleaning(df):
    """Replace the ``Description`` column of *df* with a cleaned ``Filter Description``.

    Cleaning steps, in order: punctuation/digits -> spaces, isolated single
    letters removed, whitespace collapsed, lower-casing, English stop words
    removed, and every remaining word lemmatized with TextBlob.

    Args:
        df: DataFrame with a text column named ``Description``. Missing
            values are treated as empty descriptions.

    Returns:
        The same DataFrame object (mutated in place): ``Filter Description``
        is added and ``Description`` is dropped.
    """
    # A set gives O(1) membership tests for the per-token stop-word check
    stop = set(stopwords.words('english'))

    # Removing punctuations and all digits from description
    filterString = string.punctuation + '“”|”' + string.digits
    to_space = str.maketrans(filterString, ' ' * len(filterString))
    cleaned = df['Description'].fillna('').astype(str).apply(lambda text: text.translate(to_space))

    # Removing all single characters (raw strings avoid invalid-escape warnings)
    cleaned = cleaned.replace(r'\s+[a-zA-Z]\s+', ' ', regex=True)

    # Removing single characters in beginning; the caret must stay unescaped
    # so it anchors at the start of the text instead of matching a literal '^'
    cleaned = cleaned.replace(r'^[a-zA-Z]\s+', ' ', regex=True)

    # Removing multiple spaces
    cleaned = cleaned.replace(r'\s+', ' ', regex=True)

    # Converting text to lowercase
    cleaned = cleaned.str.lower()

    def drop_stop_words_and_lemmatize(text):
        # split() without an argument also discards the empty tokens that
        # leading/trailing spaces would otherwise produce
        kept = (token for token in text.split() if token not in stop)
        # Lemmatize whole words (not individual characters) and keep them
        # separated by single spaces
        return ' '.join(Word(token).lemmatize() for token in kept)

    # Removing stop words and lemmatizing all words in the description
    df['Filter Description'] = cleaned.apply(drop_stop_words_and_lemmatize)

    # Drop the unfiltered description column
    df.drop("Description", axis=1, inplace=True)

    return df

# Function to find similarity between two description using cosine similarity
def find_similarity(X, Y):
    """Return the cosine similarity of two texts using binary token vectors.

    Each text is tokenized and reduced to its set of distinct tokens; the
    score is ``|X ∩ Y| / sqrt(|X| * |Y|)``, which equals the cosine of the
    two 0/1 presence vectors over the combined vocabulary.

    Args:
        X: First text.
        Y: Second text.

    Returns:
        A float in ``[0, 1]``. ``0.0`` is returned when either text yields
        no tokens, instead of dividing by zero.
    """
    # Distinct tokens of each text (duplicates carry no weight in this model)
    X_set = set(word_tokenize(X))
    Y_set = set(word_tokenize(Y))

    # An empty text has a zero-length vector: define its similarity as 0
    if not X_set or not Y_set:
        return 0.0

    # Dot product of two presence vectors is the number of shared tokens
    shared = len(X_set & Y_set)

    # Each vector's squared norm is simply its number of distinct tokens
    return shared / float((len(X_set) * len(Y_set)) ** 0.5)

# Function to find similar company to the input company website
def find_similar_company(website, description, csv_path="SimilarCompaniesRecommendation.csv"):
    """Rank the companies in the dataset by similarity to the given description.

    Args:
        website: Website of the input company. Rows whose ``Website`` equals
            it (ignoring case and surrounding spaces) are left out of the
            result.
        description: Free-text description of the input company.
        csv_path: CSV file providing ``Website`` and ``Description`` columns.
            Defaults to the file the notebook originally hard-coded.

    Returns:
        DataFrame with columns ``Website`` and ``Rank`` (1 = most similar),
        indexed 0..n-1.
    """
    # Reading the csv file containing description and website of the company
    data = pd.read_csv(csv_path)

    # Function call to clean the description
    data = description_cleaning(data)

    # Clean the *parameter* (not a notebook global) through the same pipeline
    des1 = description_cleaning(
        pd.DataFrame([description], columns=["Description"])
    )["Filter Description"].values[0]

    # Similarity of the input description to every description in the dataset
    scores = [find_similarity(des1, des2) for des2 in data["Filter Description"]]
    df = pd.DataFrame({
        "Website": data["Website"].to_numpy(),
        "Similarity Value": scores,
    })

    # Sorting the company based on similarity value of description
    df = df.sort_values(by=["Similarity Value"], ascending=False)

    # Eliminate the input company itself: either its website matches, or the
    # similarity is exactly one (identical cleaned description)
    same_site = (
        df["Website"].astype(str).str.strip().str.lower()
        == str(website).strip().lower()
    )
    # .copy() so the column assignments below act on an independent frame
    df = df[(df["Similarity Value"] != 1) & ~same_site].copy()

    # Giving rank to the company
    df["Rank"] = range(1, len(df) + 1)

    # Dropping the similarity value column
    df = df.drop(columns="Similarity Value")

    return df.reset_index(drop=True)

# Results

The lower the rank, the higher the similarity: Rank 1 is the company most similar to the input company, and similarity decreases as the rank increases.

In [2]:
# Query company: its website (input 1) and its description (input 2)
CompanyWebsite = "Gitlab.com"
Description = (
    "From project planning and source code management to CI/CD and monitoring, GitLab is a complete DevOps platform, delivered as a single application. Only GitLab enables Concurrent DevOps to make the software lifecycle 200% faster."
)

# Header line first, then the ranked table of similar companies
print("Similar Company for Company:", CompanyWebsite, "are listed below\n")
similar_companies = find_similar_company(CompanyWebsite, Description)
print(similar_companies)

Similar Company for Company: Gitlab.com are listed below

           Website  Rank
0    Cloudbees.com     1
1  Squarespace.com     2
2          Wix.com     3


In [3]:
# Query company: its website (input 1) and its description (input 2)
CompanyWebsite = "Cloudbees.com"
Description = (
    "Reduce risk, optimize software delivery and accelerate innovation with CloudBees - the industry-leading DevOps technology platform. Build Stuff That Matters."
)

# Header line first, then the ranked table of similar companies
print("Similar Company for Company:", CompanyWebsite, "are listed below\n")
similar_companies = find_similar_company(CompanyWebsite, Description)
print(similar_companies)

Similar Company for Company: Cloudbees.com are listed below

           Website  Rank
0       Gitlab.com     1
1  Squarespace.com     2
2          Wix.com     3


In [4]:
# Query company: its website (input 1) and its description (input 2)
CompanyWebsite = "Squarespace.com"
Description = (
    "Squarespace is the all-in-one solution for anyone looking to create a beautiful website. Domains, eCommerce, hosting, galleries, analytics, and 24/7 support all included."
)

# Header line first, then the ranked table of similar companies
print("Similar Company for Company:", CompanyWebsite, "are listed below\n")
similar_companies = find_similar_company(CompanyWebsite, Description)
print(similar_companies)

Similar Company for Company: Squarespace.com are listed below

         Website  Rank
0        Wix.com     1
1     Gitlab.com     2
2  Cloudbees.com     3


In [5]:
# Query company: its website (input 1) and its description (input 2)
CompanyWebsite = "Wix.com"
Description = (
    "Create a free website with Wix.com. Choose a stunning template and customize anything with the Wix website builder—no coding skills needed. Create yours today!"
)

# Header line first, then the ranked table of similar companies
print("Similar Company for Company:", CompanyWebsite, "are listed below\n")
similar_companies = find_similar_company(CompanyWebsite, Description)
print(similar_companies)

Similar Company for Company: Wix.com are listed below

           Website  Rank
0  Squarespace.com     1
1       Gitlab.com     2
2    Cloudbees.com     3
