# Import

In [0]:
import string
from itertools import chain
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from pyspark.ml.feature import *
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sparknlp
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

# Start a Spark session through Spark NLP's helper.
spark = sparknlp.start()
# Fetch the NLTK resources used by cleanup_text (stop-word list, punkt tokenizer
# models, WordNet for the lemmatizer); the cell output logs each download.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# Module-level singletons read by cleanup_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

from pyspark.sql.types import *
# NOTE(review): star import - rebinds names shared with Python builtins (e.g. `sum`)
# to the Spark column functions for the rest of the notebook.
from pyspark.sql.functions import *
import pyspark
from pyspark.sql import SparkSession

# Rebinds `spark`; presumably getOrCreate() hands back the session started above
# rather than creating a second one - TODO confirm.
spark = SparkSession.builder.getOrCreate()



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Helper Functions

In [0]:
# HELPER FUNCTIONS
def cleanup_text(text):
    """Normalize free text before tokenization / embedding.

    The text is tokenized with NLTK, lower-cased, stripped of English stop
    words and punctuation-only tokens, lemmatized as verbs and re-joined
    with single spaces.

    Args:
        text: Raw text, or ``None`` (a Spark UDF receives ``None`` for null cells).

    Returns:
        str: The cleaned text; ``""`` when ``text`` is ``None`` or empty.
    """
    # Null cells reach a UDF as None and word_tokenize(None) raises, so short-circuit.
    if not text:
        return ""

    punctuation_chars = set(string.punctuation)
    cleaned_tokens = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in stop_words:
            continue
        # Test character by character: `token in string.punctuation` is a substring
        # check, which lets multi-character punctuation tokens ("...", "``") through.
        if all(char in punctuation_chars for char in token):
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(token, pos='v'))
    return " ".join(cleaned_tokens)

from pyspark.sql.functions import col, size
def add_features(companies):
    """Append count columns computed with Spark's ``size`` function.

    Adds ``number_of_similar_companies`` (size of ``similar``) and
    ``number_of_locations`` (size of ``formatted_locations``).

    Args:
        companies: DataFrame containing the ``similar`` and ``formatted_locations`` columns.

    Returns:
        The DataFrame with both count columns added.
    """
    # new column name -> source column whose size is measured
    derived_columns = {
        'number_of_similar_companies': 'similar',
        'number_of_locations': 'formatted_locations',
    }
    for new_name, source_name in derived_columns.items():
        companies = companies.withColumn(new_name, size(col(source_name)))
    return companies

def fill_missing_values(companies, features):
    """Replace nulls in the listed columns with a sentinel value.

    Columns named in ``numeric_columns`` below receive ``-1``; every other
    column receives the string ``'none'``.

    Args:
        companies: DataFrame to patch.
        features: Iterable of column names whose nulls should be filled.

    Returns:
        The DataFrame with nulls replaced in the requested columns.
    """
    numeric_columns = ('number_of_similar_companies', 'employees_in_linkedin', 'followers', 'stability')
    for column in features:
        placeholder = -1 if column in numeric_columns else 'none'
        current = companies[column]
        companies = companies.withColumn(column, when(current.isNull(), placeholder).otherwise(current))
    return companies

def feature_to_oneHotEncoder(companies, feature):
    """One-hot encode a categorical column.

    The column is first indexed with a StringIndexer (``{feature}_index``)
    and the index is then one-hot encoded into ``{feature}_feature``. Both
    stages are fit on, and applied to, ``companies``.

    Args:
        companies: DataFrame holding the categorical column.
        feature: Name of the categorical column.

    Returns:
        The DataFrame with the index and one-hot columns added.
    """
    index_col = f"{feature}_index"
    stages = [
        StringIndexer(inputCol=feature, outputCol=index_col),
        OneHotEncoder(inputCol=index_col, outputCol=f"{feature}_feature"),
    ]
    fitted_pipeline = Pipeline(stages=stages).fit(companies)
    return fitted_pipeline.transform(companies)

def category_to_index(companies, feature, dict_index):
    """Translate a categorical column through a fixed category -> index dict.

    The result is written to ``label`` when ``feature`` is
    ``'popularity_category'`` and to ``{feature}_feature`` otherwise.

    Args:
        companies: DataFrame holding the categorical column.
        feature: Name of the column to translate.
        dict_index: Mapping from category value to its index.

    Returns:
        The DataFrame with the translated column added.
    """
    # create_map takes a flat [key1, value1, key2, value2, ...] list of literals
    flat_literals = []
    for category, index in dict_index.items():
        flat_literals.append(lit(category))
        flat_literals.append(lit(index))
    lookup = create_map(flat_literals)

    if feature == 'popularity_category':
        target_col = "label"
    else:
        target_col = f"{feature}_feature"
    return companies.withColumn(target_col, lookup[col(feature)])

def create_word2vec(companies, text_column, vector_size=10, min_count=1):
    """Fit a Word2Vec model on a text column and add its embedding column.

    Steps: strip punctuation with a regex, run ``cleanup_text`` as a UDF,
    tokenize with Spark's ``Tokenizer``, then fit and apply ``Word2Vec``.

    Columns added: ``cleaned_{text_column}``, ``{text_column}_tokens`` and
    ``{text_column}_feature``.

    Args:
        companies: DataFrame holding the text column.
        text_column: Name of the text column to embed.
        vector_size: Dimension of the Word2Vec vectors (default 10).
        min_count: Minimum token frequency kept in the vocabulary (default 1).

    Returns:
        The DataFrame with the cleaned-text, token and embedding columns added.
    """
    # Import locally: the previous code called `F.regexp_replace`, but no `F`
    # alias is imported in this notebook, which raises NameError at call time.
    from pyspark.sql.functions import col, regexp_replace, udf

    cleaned_col = f'cleaned_{text_column}'
    tokens_col = f"{text_column}_tokens"

    cleanup_text_udf = udf(cleanup_text)
    punctuation_pattern = r'[^\w\s]'
    companies = companies.withColumn(cleaned_col, regexp_replace(col(text_column), punctuation_pattern, ""))
    companies = companies.withColumn(cleaned_col, cleanup_text_udf(col(cleaned_col)))

    # tokenization
    tokenizer = Tokenizer(inputCol=cleaned_col, outputCol=tokens_col)
    companies = tokenizer.transform(companies)

    # learn a Word2Vec model from the tokenized words
    word2vec = Word2Vec(vectorSize=vector_size, minCount=min_count, inputCol=tokens_col, outputCol=f"{text_column}_feature")
    word2vec_model = word2vec.fit(companies)

    # transform the tokenized words into word vectors
    return word2vec_model.transform(companies)

def convert_to_label(companies, label):
    """Index a label column with a StringIndexer.

    Args:
        companies: DataFrame holding the label column.
        label: Name of the column to index.

    Returns:
        The DataFrame with the indexer output added as ``label``.
    """
    label_indexer_model = StringIndexer(inputCol=label, outputCol="label").fit(companies)
    return label_indexer_model.transform(companies)

from pyspark.sql.functions import lit, create_map
def index_to_category(companies, feature, output_col, dict_index):
    """Translate an index column back to category names.

    Args:
        companies: DataFrame holding the index column.
        feature: Name of the column containing index values.
        output_col: Name of the column that receives the category names.
        dict_index: The forward category -> index mapping; it is inverted here.

    Returns:
        The DataFrame with ``output_col`` added.
    """
    # Invert the mapping first so each index appears exactly once as a key
    category_by_index = {index: category for category, index in dict_index.items()}

    # create_map takes a flat [key1, value1, key2, value2, ...] list of literals
    flat_literals = []
    for index, category in category_by_index.items():
        flat_literals.append(lit(index))
        flat_literals.append(lit(category))
    reverse_lookup = create_map(flat_literals)

    return companies.withColumn(output_col, reverse_lookup[col(feature)])

from pyspark.sql.functions import col, when
def show_null_precentage(companies, fetures):
    """Display the percentage of null values for each requested column.

    One ``{column}_null_percentage`` column is produced per input column and
    the single-row result is rendered with the DataFrame's ``display()``.

    Args:
        companies: DataFrame to inspect.
        fetures: Iterable of column names to report on.

    Returns:
        None. The result is only displayed.
    """
    # Bind Spark's aggregate explicitly: a bare `sum` only works here while a
    # star import of pyspark.sql.functions has replaced the Python builtin.
    from pyspark.sql.functions import col, sum as spark_sum

    # Materialize once: the names are iterated twice, which would exhaust a generator.
    columns = list(fetures)

    null_counts = companies.select([spark_sum(col(column).isNull().cast("int")).alias(column) for column in columns])
    total_rows = companies.count()
    null_percentage = null_counts.select([(col(column) / total_rows * 100).alias(column + "_null_percentage") for column in columns])
    null_percentage.display()

def embedding_text_features(train_data, test_data, text_features):
    """Add a BERT sentence-embedding vector column for each text column.

    For every name in ``text_features`` a DocumentAssembler ->
    BertSentenceEmbeddings ("sent_small_bert_L2_128") pipeline is fit on
    ``train_data`` and applied to ``test_data``. Two columns are added to
    ``test_data`` per feature:

    * ``{feature}_test``    - the ``sentence_embeddings`` annotations
    * ``{feature}_feature`` - the flattened embeddings as an ML dense vector

    Args:
        train_data: DataFrame the pipeline is fit on.
        test_data: DataFrame that is transformed; must contain an ``id`` column.
        text_features: Iterable of text column names to embed.

    Returns:
        ``test_data`` with the embedding columns added (unchanged when
        ``text_features`` is empty).
    """
    text_features = list(text_features)
    if not text_features:
        # Nothing to embed: skip loading the pretrained model altogether.
        return test_data

    # Define a function to convert array to DenseVector
    def array_to_vector(arr):
        return Vectors.dense(arr)

    # The UDF and the pretrained model do not depend on the feature, so they are
    # built once here instead of once per loop iteration.
    array_to_vector_udf = udf(array_to_vector, VectorUDT())

    embeddings = BertSentenceEmbeddings.pretrained("sent_small_bert_L2_128") \
        .setInputCols(["document"]) \
        .setOutputCol("sentence_embeddings") \
        .setCaseSensitive(True) \
        .setMaxSentenceLength(128)

    for feature in text_features:
        # Create a DocumentAssembler for the current feature
        document = DocumentAssembler() \
            .setInputCol(feature) \
            .setOutputCol("document")

        pipeline = Pipeline(stages=[document, embeddings])
        transformed_df = pipeline.fit(train_data).transform(test_data)

        # Join the embeddings back onto the input rows using the "id" column.
        # NOTE(review): assumes "id" is unique per row; duplicate ids would
        # multiply rows in this join - confirm against the data.
        test_data = test_data.join(transformed_df.select("id", "sentence_embeddings"), on="id", how="left")
        test_data = test_data.withColumnRenamed("sentence_embeddings", f"{feature}_test")

        # Collect the per-annotation embedding arrays and flatten them into one array,
        # then convert that array into an ML vector
        test_data = test_data.withColumn(f"{feature}_feature", flatten(expr(f"transform({feature}_test.embeddings, x -> x)")))
        test_data = test_data.withColumn(f"{feature}_feature", array_to_vector_udf(col(f"{feature}_feature")))

    return test_data

def create_city_state_mapping(companies, min_city_count=5, min_state_count=2):
    """Derive city/state columns from ``headquarters`` and build index lookups.

    ``headquarters`` is split on ``","``: the first part becomes ``city`` and
    the second ``state`` (no trimming is applied). A null ``headquarters`` or
    a missing second part yields ``"other"``. Cities seen fewer than
    ``min_city_count`` times and states seen fewer than ``min_state_count``
    times are replaced with ``"other"`` in ``filtered_city`` /
    ``filtered_state``.

    Args:
        companies: DataFrame containing a ``headquarters`` column.
        min_city_count: Minimum number of rows for a city to keep its own value (default 5).
        min_state_count: Minimum number of rows for a state to keep its own value (default 2).

    Returns:
        tuple: ``(companies, city_set, state_set, cities, states)`` where
        ``companies`` has the ``city``, ``state``, ``filtered_city`` and
        ``filtered_state`` columns, ``city_set`` / ``state_set`` are the values
        that are KEPT (everything else becomes "other"), and ``cities`` /
        ``states`` are UDFs mapping a filtered value to its index. The UDFs use
        ``udf``'s default return type, so callers cast the result to int.
    """
    companies = companies.withColumn("city", when(companies["headquarters"].isNull(), "other").otherwise(split("headquarters", ",")[0]))
    companies = companies.withColumn("state", when(companies["headquarters"].isNull(), "other").otherwise(split("headquarters", ",")[1]))
    # A headquarters value without a comma has no second part
    companies = companies.withColumn("state", when(companies["state"].isNull(), "other").otherwise(companies["state"]))

    # Count occurrences of each city and state
    city_counts = companies.groupBy("city").count()
    state_counts = companies.groupBy("state").count()

    # Keep only the values that are frequent enough
    city_filtered = city_counts.filter(city_counts["count"] >= min_city_count)
    state_filtered = state_counts.filter(state_counts["count"] >= min_state_count)

    # Sets of cities and states that keep their own value
    city_set = {row[0] for row in city_filtered.select("city").collect()}
    state_set = {row[0] for row in state_filtered.select("state").collect()}

    # Replace cities and states not in the set with "other"
    companies = companies.withColumn("filtered_city", when(companies["city"].isin(city_set), companies["city"]).otherwise(lit("other"))) \
                         .withColumn("filtered_state", when(companies["state"].isin(state_set), companies["state"]).otherwise(lit("other")))

    # distinct() returns rows in no guaranteed order; sort so the value -> index
    # assignment is reproducible between runs.
    unique_cities = sorted(row[0] for row in companies.select("filtered_city").distinct().collect())
    city_dict = {city: index for index, city in enumerate(unique_cities)}

    unique_states = sorted(row[0] for row in companies.select("filtered_state").distinct().collect())
    states_dict = {state: index for index, state in enumerate(unique_states)}

    cities = udf(lambda x: city_dict[x])
    states = udf(lambda x: states_dict[x])

    return companies, city_set, state_set, cities, states

def create_city_state_indices(companies, city_set, state_set, cities, states, is_test=False):
    """Add integer ``city_index`` / ``state_index`` columns.

    Args:
        companies: DataFrame to index. When ``is_test`` is False it must already
            contain ``filtered_city`` and ``filtered_state``.
        city_set: Cities that keep their own value; others become "other".
        state_set: States that keep their own value; others become "other".
        cities: UDF mapping a filtered city to its index.
        states: UDF mapping a filtered state to its index.
        is_test: When True, the ``city``, ``state``, ``filtered_city`` and
            ``filtered_state`` columns are first derived from ``headquarters``.

    Returns:
        The DataFrame with ``city_index`` and ``state_index`` cast to int.
    """
    if is_test:
        hq_parts = split("headquarters", ",")
        companies = companies.withColumn("city", when(companies["headquarters"].isNull(), "other").otherwise(hq_parts[0]))
        companies = companies.withColumn("state", when(companies["headquarters"].isNull(), "other").otherwise(hq_parts[1]))
        # No second part after the comma -> fall back to "other"
        companies = companies.withColumn("state", when(companies["state"].isNull(), "other").otherwise(companies["state"]))

        # Anything outside the known sets collapses to "other"
        kept_city = when(companies["city"].isin(city_set), companies["city"]).otherwise(lit("other"))
        kept_state = when(companies["state"].isin(state_set), companies["state"]).otherwise(lit("other"))
        companies = companies.withColumn("filtered_city", kept_city)
        companies = companies.withColumn("filtered_state", kept_state)

    # Look up each filtered value's index and store it as an int
    index_specs = (
        ("city_index", "filtered_city", cities),
        ("state_index", "filtered_state", states),
    )
    for index_col, source_col, to_index in index_specs:
        companies = companies.withColumn(index_col, to_index(col(source_col)).cast('int'))
    return companies

def create_city_state_one_hot(old_companies, new_companies):
    """One-hot encode ``city_index`` and ``state_index``.

    Each encoder is fit on ``old_companies`` (with any existing output column
    dropped first) and applied to ``new_companies``. ``dropLast=False`` is
    passed to both encoders.

    Args:
        old_companies: DataFrame the encoders are fit on.
        new_companies: DataFrame that receives ``city_feature`` / ``state_feature``.

    Returns:
        ``new_companies`` with both one-hot columns added.
    """
    column_pairs = (('city_index', 'city_feature'), ('state_index', 'state_feature'))
    for index_col, feature_col in column_pairs:
        encoder = OneHotEncoder(inputCol=index_col, outputCol=feature_col, dropLast=False)
        encoder_model = encoder.fit(old_companies.drop(feature_col))
        new_companies = encoder_model.transform(new_companies)
    return new_companies

def create_one_hot_from_dict(companies, feature_name, index_dict):
    """One-hot encode a categorical column using a fixed value -> index dict.

    The column is translated through ``index_dict`` into
    ``{feature_name}_index`` (cast to int) and then one-hot encoded into
    ``{feature_name}_feature`` with ``dropLast=False``. The encoder is fit on
    ``companies`` itself.

    Args:
        companies: DataFrame holding the categorical column.
        feature_name: Name of the categorical column.
        index_dict: Mapping from category value to index; the lookup is a plain
            dict index, so a value missing from it raises inside the UDF.

    Returns:
        The DataFrame with the index and one-hot columns added.
    """
    index_col = f"{feature_name}_index"
    lookup_udf = udf(lambda value: index_dict[value])
    companies = companies.withColumn(index_col, lookup_udf(col(feature_name)).cast('int'))

    one_hot = OneHotEncoder(inputCol=index_col, outputCol=f"{feature_name}_feature", dropLast=False)
    return one_hot.fit(companies).transform(companies)


In [0]:
# Category -> index for `company_size`, passed to category_to_index in the
# preprocessing cell. Indices increase with the size bucket; 'none' is the
# placeholder fill_missing_values writes for null values.
company_size_dict = {'none': 0,
                     '1 employee': 1,
                     '2-10 employees': 2,
                     '11-50 employees': 3,
                     '51-200 employees': 4,
                     '201-500 employees': 5,
                     '501-1,000 employees': 6,
                     '1,001-5,000 employees': 7,
                     '5,001-10,000 employees': 8,
                     '10,001+ employees': 9
                     }

# Meta-industry name -> index, passed to create_one_hot_from_dict. The keys are
# the values that appear on the right-hand side of meta_industries_12.
meta_industries_dict = {'Manufacturing': 0,
            'Financial and Investment': 1,
            'Services': 2,
            'Miscellaneous': 3,
            'Healthcare and Medical': 4,
            'Technology': 5,
            'Retail and Consumer Goods': 6,
            'Education and Training': 7,
            'Government and Public Policy': 8,
            'Transportation and Logistics': 9,
            'Real Estate and Construction': 10,
            'Media and Entertainment': 11
            } 

# Organization type -> index, passed to create_one_hot_from_dict; 'none' is the
# placeholder fill_missing_values writes for null values.
organization_type_dict = {'Privately Held': 0,
                          'Self-Owned': 1,
                          'Nonprofit': 2,
                          'Partnership': 3,
                          'Self-Employed': 4,
                          'Educational': 5,
                          'Public Company': 6,
                          'Government Agency': 7,
                          'none': 8
                        }

# Popularity category -> numeric class label, passed to category_to_index.
label_dict = {"Unpopular": 0,
              "Neutral Popularity": 1,
              "Very Popular": 2
              }


# Maps each `industries` value to one of the 12 meta-industries keyed in
# meta_industries_dict. The notebook looks values up with a plain dict index
# inside a UDF, so an industry missing from this table raises KeyError there.
meta_industries_12 = {
    'Furniture and Home Furnishings Manufacturing': 'Manufacturing',
    'Investment Banking': 'Financial and Investment',
    'Architecture and Planning': 'Services',
    'Wholesale': 'Services',
    'Travel Arrangements': 'Services',
    'Ranching': 'Miscellaneous',
    'Hospitals and Health Care': 'Healthcare and Medical',
    'Book and Periodical Publishing': 'Services',
    'Printing Services': 'Services',
    'Professional Training and Coaching': 'Services',
    'Computers and Electronics Manufacturing': 'Manufacturing',
    'Shipbuilding': 'Manufacturing',
    'Public Policy Offices': 'Government and Public Policy',
    'Software Development': 'Technology',
    'Outsourcing and Offshoring Consulting': 'Services',
    'Retail Groceries': 'Retail and Consumer Goods',
    'Education Administration Programs': 'Education and Training',
    'Plastics Manufacturing': 'Manufacturing',
    'Renewable Energy Semiconductor Manufacturing': 'Manufacturing',
    'Computer Networking Products': 'Technology',
    'Events Services': 'Services',
    'Information Services': 'Services',
    'Food and Beverage Services': 'Services',
    'Semiconductor Manufacturing': 'Manufacturing',
    'Business Consulting and Services': 'Services',
    'Insurance': 'Services',
    'Financial Services': 'Services',
    'Wireless Services': 'Services',
    'Computer Hardware Manufacturing': 'Technology',
    'Public Safety': 'Services',
    'Maritime Transportation': 'Transportation and Logistics',
    'Tobacco Manufacturing': 'Manufacturing',
    'Writing and Editing': 'Services',
    'Veterinary Services': 'Services',
    'Staffing and Recruiting': 'Services',
    'Accounting': 'Services',
    'International Affairs': 'Government and Public Policy',
    'Spectator Sports': 'Miscellaneous',
    'Glass, Ceramics and Concrete Manufacturing': 'Manufacturing',
    'Chemical Manufacturing': 'Manufacturing',
    'Mining': 'Miscellaneous',
    'E-Learning Providers': 'Technology',
    'Security and Investigations': 'Services',
    'Translation and Localization': 'Services',
    'Automation Machinery Manufacturing': 'Technology',
    'Computer and Network Security': 'Technology',
    'Political Organizations': 'Government and Public Policy',
    'Environmental Services': 'Government and Public Policy',
    'Oil and Gas': 'Miscellaneous',
    'Real Estate': 'Real Estate and Construction',
    'Think Tanks': 'Government and Public Policy',
    'Executive Offices': 'Miscellaneous',
    'Law Practice': 'Services',
    'Nanotechnology Research': 'Miscellaneous',
    'International Trade and Development': 'Government and Public Policy',
    'Personal Care Product Manufacturing': 'Manufacturing',
    'Philanthropic Fundraising Services': 'Services',
    'Entertainment Providers': 'Media and Entertainment',
    'Market Research': 'Media and Entertainment',
    'Movies, Videos, and Sound': 'Media and Entertainment',
    'Sporting Goods Manufacturing': 'Manufacturing',
    'Graphic Design': 'Services',
    'Technology, Information and Internet': 'Technology',
    'IT Services and IT Consulting': 'Technology',
    'Retail Office Equipment': 'Retail and Consumer Goods',
    'Wholesale Import and Export': 'Services',
    'Capital Markets': 'Financial and Investment',
    'Law Enforcement': 'Services',
    'Freight and Package Transportation': 'Transportation and Logistics',
    'Industrial Machinery Manufacturing': 'Manufacturing',
    'Non-profit Organizations': 'Miscellaneous',
    'Retail Art Supplies': 'Retail and Consumer Goods',
    'Animation and Post-production': 'Media and Entertainment',
    'Transportation, Logistics, Supply Chain and Storage': 'Transportation and Logistics',
    'Aviation and Aerospace Component Manufacturing': 'Transportation and Logistics',
    'Fundraising': 'Financial and Investment',
    'Railroad Equipment Manufacturing': 'Transportation and Logistics',
    'Construction': 'Real Estate and Construction',
    'Investment Management': 'Financial and Investment',
    'Utilities': 'Miscellaneous',
    'Retail Luxury Goods and Jewelry': 'Retail and Consumer Goods',
    'Warehousing and Storage': 'Transportation and Logistics',
    'Media Production': 'Media and Entertainment',
    'Gambling Facilities and Casinos': 'Media and Entertainment',
    'Defense and Space Manufacturing': 'Manufacturing',
    'Facilities Services': 'Services',
    'Government Relations Services': 'Government and Public Policy',
    'Advertising Services': 'Media and Entertainment',
    'Paper and Forest Product Manufacturing': 'Manufacturing',
    'Packaging and Containers Manufacturing': 'Manufacturing',
    'Telecommunications': 'Technology',
    'Medical Equipment Manufacturing': 'Healthcare and Medical',
    'Beverage Manufacturing': 'Manufacturing',
    'Restaurants': 'Retail and Consumer Goods',
    'Leasing Non-residential Real Estate': 'Real Estate and Construction',
    'Newspaper Publishing': 'Media and Entertainment',
    'Armed Forces': 'Miscellaneous',
    'Appliances, Electrical, and Electronics Manufacturing': 'Manufacturing',
    'Hospitality': 'Services',
    'Pharmaceutical Manufacturing': 'Healthcare and Medical',
    'Research Services': 'Services',
    'Retail Apparel and Fashion': 'Retail and Consumer Goods',
    'Photography': 'Media and Entertainment',
    'Wellness and Fitness Services': 'Services',
    'Truck Transportation': 'Transportation and Logistics',
    'Consumer Services': 'Services',
    'Wholesale Building Materials': 'Services',
    'Human Resources Services': 'Services',
    'Airlines and Aviation': 'Transportation and Logistics',
    'Machinery Manufacturing': 'Manufacturing',
    'Individual and Family Services': 'Services',
    'Motor Vehicle Manufacturing': 'Manufacturing',
    'Performing Arts': 'Media and Entertainment',
    'Museums, Historical Sites, and Zoos': 'Media and Entertainment',
    'Broadcast Media Production and Distribution': 'Media and Entertainment',
    'Banking': 'Financial and Investment',
    'Recreational Facilities': 'Miscellaneous',
    'Government Administration': 'Government and Public Policy',
    'Public Relations and Communications Services': 'Media and Entertainment',
    'Fisheries': 'Miscellaneous',
    'Medical Practices': 'Healthcare and Medical',
    'Religious Institutions': 'Miscellaneous',
    'Online Audio and Video Media': 'Media and Entertainment',
    'Artists and Writers': 'Miscellaneous',
    'Biotechnology Research': 'Healthcare and Medical',
    'Legal Services': 'Services',
    'Retail': 'Retail and Consumer Goods',
    'Civil Engineering': 'Services',
    'Libraries': 'Miscellaneous',
    'Alternative Dispute Resolution': 'Miscellaneous',
    # NOTE(review): the generic 'Manufacturing' industry is mapped to 'Miscellaneous',
    # not to the 'Manufacturing' meta-industry - confirm this is intended.
    'Manufacturing': 'Miscellaneous',
    'Design Services': 'Services',
    'Dairy Product Manufacturing': 'Manufacturing',
    'Higher Education': 'Education and Training',
    'Civic and Social Organizations': 'Miscellaneous',
    'Textile Manufacturing': 'Manufacturing',
    'Venture Capital and Private Equity Principals': 'Financial and Investment',
    'Mental Health Care': 'Healthcare and Medical',
    'Musicians': 'Media and Entertainment',
    'Farming': 'Miscellaneous',
    'Computer Games': 'Media and Entertainment',
    'Strategic Management Services': 'Services',
    'Food and Beverage Manufacturing': 'Manufacturing',
    'Primary and Secondary Education': 'Education and Training',
    'Alternative Medicine': 'Healthcare and Medical',
    'Legislative Offices': 'Services',
    'Administration of Justice': 'Services',
    'Mobile Gaming Apps': 'Media and Entertainment'
}

# Load the old & new company trend data and merge it with the LinkedIn companies data

In [0]:
# LinkedIn company profiles
companies = spark.read.parquet('/linkedin/companies')

# Pre-processed trend tables for old and new companies (read with pandas)
old_companies_trend_df = pd.read_csv("/dbfs/FileStore/shared_uploads/omriitzhaki@campus.technion.ac.il/final_old_companies_trend_preprocess")
new_companies_trend_df = pd.read_csv("/dbfs/FileStore/shared_uploads/omriitzhaki@campus.technion.ac.il/final_new_companies_trend_preprocess")

# Old companies: inner-join the profiles with the trend table on `name`,
# drop rows whose search name contains "&", keep founding years 1900 <= founded < 2020
old_companies = (
    companies.join(spark.createDataFrame(old_companies_trend_df), 'name', 'inner')
    .filter(~col("search_name").contains("&"))
    .filter(col('founded') >= '1900')
    .filter(col('founded') < '2020')
)

# New companies: same steps, keeping founding years 2020 <= founded < 2025
new_companies = (
    companies.join(spark.createDataFrame(new_companies_trend_df), 'name', 'inner')
    .filter(~col("search_name").contains("&"))
    .filter(col('founded') >= '2020')
    .filter(col('founded') < '2025')
)

# UDF translating an industry name to its meta-industry via meta_industries_12
meta_industry = udf(lambda industry: meta_industries_12[industry])

# Rows without an industry are dropped before the lookup
old_companies = old_companies.filter(old_companies.industries.isNotNull()) \
                             .withColumn('meta_industry', meta_industry(col('industries')))

new_companies = new_companies.filter(new_companies.industries.isNotNull()) \
                             .withColumn('meta_industry', meta_industry(col('industries')))

# Feature Preprocessing

In [0]:
# Cast `stability` to float before null-filling and feature conversion
old_companies = old_companies.withColumn('stability', col('stability').cast('float'))

# NOTE(review): `numeric_features` and `label` are not read anywhere in this cell -
# presumably they are consumed by later cells; confirm.
numeric_features = ['stability', 'number_of_similar_companies']
label = ['popularity']

# add number_of_similar_companies (add_features also adds number_of_locations)
old_companies = add_features(old_companies)

# fill missing values: fill_missing_values writes -1 into 'stability' and
# 'number_of_similar_companies' and 'none' into the other listed columns
old_companies = fill_missing_values(old_companies, ['company_size', 'organization_type', 'stability', 'number_of_similar_companies', 'specialties'])

# stability & number_of_similar_companies features
# (both are cast to int here, so the float `stability` loses any fractional part)
for feature in ['stability', 'number_of_similar_companies']:
    old_companies = old_companies.withColumn(f'{feature}_feature', col(feature).cast("int"))

# company_size feature: index taken from company_size_dict
old_companies = category_to_index(old_companies, 'company_size', company_size_dict)

# organization_type feature: one-hot vector
old_companies = create_one_hot_from_dict(old_companies, 'organization_type', organization_type_dict)

# meta_industry feature: one-hot vector
old_companies = create_one_hot_from_dict(old_companies, 'meta_industry', meta_industries_dict)

# specialties feature: sentence embeddings; the same DataFrame is passed as both
# the train and the test argument
old_companies = embedding_text_features(old_companies, old_companies, ['specialties'])

# popularity label: bucket `popularity` at its approximate 30% and 70% quantiles
# (relative error 0.01) - at or below the lower cut is "Unpopular", between the
# cuts is "Neutral Popularity", everything else is "Very Popular"
quantiles = [0.3, 0.7]
categories = ["Unpopular", "Neutral Popularity", "Very Popular"]
popularity_quantiles = old_companies.approxQuantile('popularity', quantiles, 0.01)
old_companies = old_companies.withColumn("popularity_category", 
                                         when(old_companies["popularity"] <= popularity_quantiles[0], lit(categories[0]))
                                         .when((old_companies["popularity"] > popularity_quantiles[0]) & (old_companies["popularity"] <= popularity_quantiles[1]), lit(categories[1]))
                                         .otherwise(lit(categories[2])))

# transform label to indexes (category_to_index writes the `label` column for this feature)
old_companies = category_to_index(old_companies, 'popularity_category', label_dict)

# create feature df after pre process: every column whose name contains 'feature'.
# The comprehension variable is named `col`; it is scoped to the comprehension and
# does not rebind the imported `col` function.
features_columns = [col for col in old_companies.columns if 'feature' in col]
features_df = old_companies.select(features_columns + ['label', 'popularity', 'popularity_category', 'id'])

# Assemble all selected feature columns (numeric, one-hot and embedding) into a single vector
assembler = VectorAssembler(inputCols=features_columns, outputCol="features")
features_df = assembler.transform(features_df)

# Preview the first 5 assembled rows
features_df.select('id', 'features', 'popularity_category', 'label').limit(5).display()

sent_small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB
[ | ][OK!]


id,features,popularity_category,label
first-texas-homes,"Map(vectorType -> dense, length -> 152, values -> List(6.0, 10.0, 5.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.2942283153533936, 0.36948883533477783, 0.4741453230381012, -1.779219627380371, -0.16012081503868103, -0.42470505833625793, -0.5228009223937988, -0.09325098246335983, -1.2865962982177734, -1.3030455112457275, 0.8163958787918091, 0.3656260371208191, 0.1086505651473999, -0.392599880695343, 0.8034310340881348, -0.5854833722114563, -1.7363544702529907, 0.7302411794662476, -1.7386289834976196, 1.4275147914886475, 0.37002477049827576, -0.4076819121837616, 0.738832950592041, 0.5336618423461914, 2.2741668224334717, -0.8179621696472168, 0.7329664826393127, -0.7968835234642029, 0.5774044990539551, -2.2542810440063477, -1.0446972846984863, -1.4977399110794067, -1.3070158958435059, 1.3714115619659424, -0.31685546040534973, -0.19211356341838837, 0.058205995708703995, 0.7681118845939636, -1.2787091732025146, -1.697519302368164, -0.45648184418678284, -0.26102563738822937, 0.24745506048202515, -0.9577698707580566, -0.0667509064078331, -1.1633553504943848, -0.2673433721065521, -0.49115926027297974, 0.5277093052864075, 0.09757150709629059, -0.6518255472183228, 0.7394610643386841, 0.6812499761581421, -0.29619669914245605, -1.3591455221176147, -0.2271805852651596, 1.1283392906188965, 0.24180689454078674, 0.13638624548912048, 1.7329740524291992, -0.4629392921924591, -0.4326169490814209, -0.6684926748275757, -0.02385726571083069, -1.4461091756820679, -1.3901067972183228, 0.17803633213043213, 1.1983410120010376, 1.047681450843811, 0.7665719389915466, -0.23092703521251678, -0.30814558267593384, -0.710768461227417, 0.5776767134666443, -1.6258842945098877, 1.0848305225372314, 0.6421483159065247, -0.013245359063148499, 2.7783584594726562, -0.3264310359954834, -0.6485590934753418, 0.01564553566277027, -0.20050643384456635, -0.33974412083625793, 0.13228704035282135, -0.992781937122345, 
-1.9872535467147827, 0.4659959673881531, 1.3718551397323608, -1.688912272453308, 1.0149595737457275, 1.3278608322143555, 0.09444236755371094, 1.7074025869369507, 0.42837104201316833, 1.155930519104004, 0.887734591960907, -0.3426534831523895, -0.6012884974479675, 1.3989413976669312, -1.0866906642913818, 0.5543171763420105, 0.30536088347435, 0.3028407096862793, 1.0616294145584106, -1.0390998125076294, -2.4529764652252197, 1.0342470407485962, -0.7142607569694519, -0.9097476005554199, 1.1186363697052002, -0.006872878409922123, -0.3037011921405792, 0.611248254776001, 1.3175405263900757, 0.7859256863594055, -0.12060131877660751, 0.7981509566307068, 0.3981960117816925, 0.41766536235809326, 0.6601080298423767, 0.08885040134191513, 1.4190430641174316, -1.5245566368103027, -1.2806589603424072, -0.8480520844459534, -1.4083153009414673, 1.6311087608337402))",Neutral Popularity,1
the-revolution-hotel,"Map(vectorType -> dense, length -> 152, values -> List(17.0, 10.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.6838653683662415, -0.027924669906497, -0.22845515608787537, -1.8258554935455322, 1.3247026205062866, -0.0275909211486578, 0.08124829083681107, 1.4760819673538208, -1.2548776865005493, -0.5722838640213013, 1.4547723531723022, 0.35236525535583496, 1.299685001373291, -0.4803466200828552, 1.4261395931243896, -0.5757228136062622, 0.3611500859260559, -0.9588012099266052, -0.7471068501472473, 1.7423712015151978, 0.03180694580078125, 1.3310972452163696, 1.5403541326522827, 0.9598807692527771, 1.1077752113342285, -0.5975169539451599, 0.5360548496246338, 0.37211617827415466, 0.09181081503629684, -0.3965241014957428, -0.8349302411079407, -1.803664207458496, -0.8927216529846191, -0.23821167647838593, 0.3768949806690216, -0.03050830587744713, -1.1166585683822632, -0.9257135391235352, -2.385453462600708, -1.6240590810775757, 0.3678542673587799, 0.7329257130622864, 1.3856385946273804, -0.9422370791435242, 0.9021533727645874, -1.164490818977356, -1.3591712713241577, -0.30920323729515076, -1.1834717988967896, -0.18788987398147583, 0.44174307584762573, -0.0246057640761137, 1.2977670431137085, -0.09301803261041641, -0.3938470780849457, -0.023578491061925888, 1.3558632135391235, -0.7119159698486328, 1.7207733392715454, 0.11575998365879059, 0.5779602527618408, -0.28762152791023254, -1.3833038806915283, -0.9048236012458801, -1.8754069805145264, -0.9211238026618958, -0.7229480743408203, -0.05869964882731438, -0.9103507995605469, -0.42490851879119873, -1.0534396171569824, 0.6736226081848145, -0.681755781173706, -0.3189649283885956, -1.0673677921295166, -0.15845945477485657, 0.8337740302085876, -0.7251366376876831, 1.9796903133392334, 0.9489631652832031, 0.7512348890304565, 0.376654714345932, 0.10709802061319351, -0.6243743896484375, 0.1614345759153366, -0.10488037765026093, 
0.20808656513690948, 0.36680570244789124, 1.2889000177383423, -2.0143070220947266, 1.0812687873840332, 0.8558918237686157, -0.10656814277172089, 0.9830877780914307, 0.30458682775497437, 0.30069267749786377, -0.248157799243927, 0.1512862890958786, -0.025310160592198372, 0.9727376699447632, 0.09025105834007263, 0.6609427332878113, -0.6286057233810425, -1.5871062278747559, -0.2799057364463806, -0.9293556213378906, -0.6765169501304626, 0.5802470445632935, 0.4514695405960083, -1.1114752292633057, 0.1335330307483673, 0.24122168123722076, 1.9445675611495972, 0.9112147092819214, 0.034502044320106506, -0.2670033574104309, 1.1015764474868774, 0.2335783839225769, -0.17383670806884766, 0.019526217132806778, 0.5696913003921509, -0.1825764924287796, 1.393412470817566, -1.3137884140014648, -1.3129268884658813, -1.8933112621307373, -2.358649969100952, -0.17243404686450958))",Very Popular,2
clipping-way,"Map(vectorType -> dense, length -> 152, values -> List(18.0, 10.0, 3.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0657870769500732, 0.672604501247406, 0.6059471368789673, -0.9316869378089905, -0.2633700370788574, -0.4295305609703064, -0.418547123670578, -0.09572586417198181, -1.4115405082702637, -1.368814468383789, 0.32545095682144165, 0.4408274292945862, 0.3867591917514801, -0.7164695858955383, 0.8642091155052185, -0.33866071701049805, -1.614215612411499, 0.7828115224838257, -2.1249444484710693, 1.2158881425857544, 0.44631385803222656, -0.41395294666290283, 0.8107001185417175, 0.530597448348999, 2.661979913711548, -0.9024122953414917, 0.7903849482536316, -1.2569682598114014, 0.4290461838245392, -2.594094753265381, -1.1226595640182495, -1.4036976099014282, -1.300371527671814, 1.294285774230957, -0.21756798028945923, -0.1461608111858368, -0.616771936416626, 0.8871367573738098, -1.123238444328308, -1.2555490732192993, -0.5453299283981323, 0.11845555901527405, -0.634224534034729, -0.7920389771461487, 0.17201241850852966, -1.0167958736419678, -0.2715860605239868, -0.3758549392223358, 0.7253536581993103, 0.3205451965332031, -0.36118027567863464, 0.7826902270317078, 0.7603801488876343, -0.3021414577960968, -1.6063638925552368, -0.2183382511138916, 0.9238175749778748, 0.1644507646560669, 0.4741608500480652, 1.665771245956421, -0.27933448553085327, -0.3022648096084595, -0.9210357069969177, -0.33188676834106445, -1.643805742263794, -1.3754054307937622, 0.05560414493083954, 1.1702953577041626, 0.4203316271305084, 0.8598961234092712, -0.4275819957256317, -0.33768293261528015, -0.7223255038261414, 0.4748933017253876, -1.8251731395721436, 1.2694200277328491, 0.7457278966903687, 0.1662028729915619, 2.748657703399658, -0.9705743789672852, -0.5998702645301819, -0.35668301582336426, -0.2447892129421234, -0.34246543049812317, 0.43243861198425293, -0.9167554974555969, -1.6784576177597046, 
0.5808602571487427, 0.881598949432373, -1.406461238861084, 0.9842459559440613, 1.5092356204986572, 0.46518465876579285, 1.4576480388641357, 0.009851722978055477, 1.3618454933166504, 0.6486940383911133, -0.07132087647914886, -0.4798842966556549, 1.2288076877593994, -1.0235306024551392, 0.6354138851165771, 0.28625059127807617, 0.11589691787958145, 0.7740718722343445, -0.8953534364700317, -2.5839834213256836, 1.012598991394043, -0.7547540664672852, -0.3143003582954407, 0.9825146198272705, 0.12392309308052063, -0.18910729885101318, 0.5664137005805969, 1.2534703016281128, 0.5330533981323242, -0.3655954599380493, 0.9258537292480469, 0.22783595323562622, 0.6565004587173462, 0.8965646624565125, 0.14933013916015625, 1.3480528593063354, -1.4573689699172974, -1.623880386352539, -0.5607595443725586, -1.2912776470184326, 1.8433393239974976))",Neutral Popularity,1
act-lawrence,"Map(vectorType -> dense, length -> 152, values -> List(19.0, 10.0, 2.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.8635786175727844, 1.1982035636901855, -0.9998961687088013, -0.9493464231491089, 0.4741162955760956, -0.31228628754615784, 0.0062552387826144695, 0.20264559984207153, -1.309493899345398, -0.6527342200279236, 1.7901674509048462, 0.9021588563919067, -0.4149102568626404, -0.6957939863204956, 1.2338428497314453, -1.0701971054077148, -0.8396919369697571, -0.6861247420310974, -1.0763828754425049, 1.257245659828186, 0.25033408403396606, -0.26638394594192505, 1.3149397373199463, 1.1945486068725586, 2.0569448471069336, 0.252069354057312, -0.09515583515167236, 0.6777013540267944, -0.571191132068634, 0.35763487219810486, -0.4586675465106964, -1.8206501007080078, 0.353362500667572, -0.011501764878630638, 0.21163783967494965, 0.12320483475923538, -1.0087064504623413, -0.018639296293258667, -2.93876314163208, -0.4357471466064453, 0.27968624234199524, 1.6103523969650269, 0.37931907176971436, -1.0444689989089966, 0.3524743616580963, -1.013667106628418, -0.512079119682312, 0.06950073689222336, -0.6066263914108276, 0.88189297914505, 0.7685914039611816, -0.17155857384204865, -0.08092600852251053, 0.041089896112680435, -0.9923624396324158, 0.009756268002092838, 0.18172651529312134, -0.5583712458610535, 1.14175546169281, 1.8530266284942627, 0.23544536530971527, -0.3493734896183014, -1.729556918144226, -1.7449113130569458, -1.832020878791809, 0.5249091386795044, 0.7103278040885925, 0.5252689123153687, -1.883777141571045, -0.9823024272918701, -0.15683092176914215, 0.7757148146629333, 0.42502081394195557, 0.03879222646355629, -2.302964687347412, -0.3572205901145935, 0.2390107661485672, -0.7210400104522705, 1.516559362411499, 0.6213709712028503, 0.07744327187538147, -0.17606621980667114, 0.3561296761035919, -0.442885160446167, 0.3248755633831024, -0.22948482632637024, -9.571943082846701E-4, 
-0.27288615703582764, 0.7681565284729004, -2.1063525676727295, 1.4944027662277222, 0.8447633385658264, -0.06619691848754883, 0.4834473431110382, 0.044983264058828354, 0.9015354514122009, -1.1292390823364258, -0.9760763645172119, 0.5329681038856506, 0.5776549577713013, 0.32764261960983276, -0.19394749402999878, -0.5594646334648132, -0.5181366205215454, -0.6005436182022095, -0.23364956676959991, -1.5621873140335083, 0.42433449625968933, 0.48999112844467163, -0.7960500121116638, 0.28828656673431396, 1.2103707790374756, 0.3664824664592743, -0.17554114758968353, 0.7427474856376648, 0.26063790917396545, 1.2843430042266846, 0.7874351739883423, -0.4471711814403534, -0.5068799257278442, 0.87501060962677, 0.7788600325584412, 0.9792263507843018, -0.13941510021686554, -1.4350508451461792, -1.8700424432754517, -1.756768822669983, 1.703673005104065))",Neutral Popularity,1
hobo-the-original,"Map(vectorType -> dense, length -> 152, values -> List(6.0, 10.0, 4.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.300488829612732, 0.10026980936527252, -0.968920886516571, -3.0085232257843018, -0.7005201578140259, 1.0689924955368042, -0.03569268435239792, 0.16040246188640594, -1.4722248315811157, 1.5412938594818115, 0.6054484844207764, -0.5149403214454651, -1.4463502168655396, 0.8001251220703125, 0.8295744061470032, -1.5926387310028076, 0.5132145285606384, 0.5517414212226868, -1.5244301557540894, 0.5979289412498474, 0.1872366964817047, 0.10335689783096313, 1.5700751543045044, 0.9333103895187378, 1.8035707473754883, -0.705447793006897, 0.352856308221817, 0.7786776423454285, -0.23363927006721497, 0.5383838415145874, -1.6852996349334717, -1.8603142499923706, -0.49313363432884216, 0.022417906671762466, 0.7985081672668457, -1.1280465126037598, 0.8617715239524841, -0.6573410630226135, -0.6381500363349915, -1.077297329902649, 1.3298605680465698, -0.9138033390045166, 1.6578999757766724, -0.5650395750999451, -0.37730738520622253, -1.0990816354751587, 0.26160556077957153, -0.21433387696743011, 0.23739883303642273, 0.24058957397937775, -0.672736644744873, 2.7564096450805664, -0.0144498897716403, -0.17757703363895416, 0.12795540690422058, -1.0138577222824097, 0.5958417654037476, 0.5061618089675903, -0.745327889919281, 1.906014323234558, 0.5996077656745911, -1.623801589012146, -1.0908712148666382, 0.3688149154186249, -1.4313615560531616, -0.07143040746450424, 1.0824676752090454, 0.7531430721282959, 1.0919528007507324, -0.0821055918931961, -1.6280126571655273, 0.44762739539146423, -0.14222835004329681, -0.9853484034538269, -0.9987121820449829, -0.21761246025562286, 0.5754649043083191, -0.26493147015571594, 1.0886319875717163, 1.35800302028656, 0.5150272250175476, 0.07716232538223267, 0.24399420619010925, 0.7806764841079712, 0.17618639767169952, 0.32517245411872864, -1.0484615564346313, 
-0.11412293463945389, 1.7542204856872559, -1.3634830713272095, 2.2367494106292725, -0.33722591400146484, -1.5559594631195068, 1.2742419242858887, 1.2072477340698242, 0.5491828918457031, 0.023287484422326088, -1.1418310403823853, 0.18111251294612885, -0.05583041533827782, -0.12354210764169693, -0.7820083498954773, 0.7935611009597778, -1.2764122486114502, 0.5944556593894958, -1.7054972648620605, -0.5283280611038208, 0.08083140105009079, 0.11858275532722473, -1.6956641674041748, 0.24403521418571472, 1.478808879852295, 1.2420419454574585, -1.046162486076355, 0.24672633409500122, 0.5926902294158936, -1.3192726373672485, -0.36213013529777527, 0.4006057381629944, -0.01347655151039362, -0.37957248091697693, 1.1476341485977173, 1.445464849472046, -2.170302391052246, -1.4877523183822632, -1.594046950340271, -1.2534388303756714, 0.49539855122566223))",Neutral Popularity,1


# Train the model

In [0]:
from pyspark.ml.classification import *
from pyspark.mllib.evaluation import MulticlassMetrics

# Keep only the assembled feature vector and the target label for modelling.
data = features_df.select("features", "label")

# 80/20 train/test split with a fixed seed.
train_data, test_data = data.randomSplit([0.8, 0.2], seed=87)

model_name = 'Logistic Regression'
model = LogisticRegression(featuresCol="features", labelCol="label")
print(f"Training {model_name}...")
trained_model = model.fit(train_data)

# Score the held-out rows; the evaluators below read the "prediction" column.
predictions = trained_model.transform(test_data)

# Accuracy on the test split
accuracy_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
)
accuracy = accuracy_evaluator.evaluate(predictions)
print(f"Accuracy for {model_name}: {accuracy}")

# Weighted F-measure on the test split
f1_evaluator = MulticlassClassificationEvaluator(
    metricName='weightedFMeasure',
    labelCol="label",
    predictionCol='prediction',
)
f1 = f1_evaluator.evaluate(predictions)
print(f"F1 for {model_name}: {f1}")

# Confusion matrix: MulticlassMetrics is fed an RDD of (prediction, label) float pairs.
predictionAndLabels = (
    predictions
    .select("prediction", "label")
    .rdd
    .map(lambda row: (float(row[0]), float(row[1])))
)
metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()
print(f"Confusion matrix for {model_name}:\n{confusion_matrix}")

Training Logistic Regression...
Accuracy for Logistic Regression: 0.49886415265788275
F1 for Logistic Regression: 0.49406166064867985
Confusion matrix for Logistic Regression:
[[381. 180. 115.]
 [201. 465. 157.]
 [145. 305. 252.]]


# New_companies feature preprocessing

In [0]:
# Load the predicted-stability CSV and rename its key column so the join
# condition below can tell it apart from new_companies' own "id" column.
predicted_stability_df = (
    spark.read
    .csv("/dbfs/FileStore/shared_uploads/omriitzhaki@campus.technion.ac.il/predicted_stability_df", header=True)
    .withColumnRenamed("id", "new_company_id")
)

# Inner join: only companies present in both frames are kept.
new_companies = new_companies.join(
    predicted_stability_df,
    on=new_companies['id'] == predicted_stability_df['new_company_id'],
    how='inner',
)

In [0]:
# Derived count columns (e.g. number_of_similar_companies) come from add_features.
new_companies = add_features(new_companies)

# Fill missing values in the columns that are turned into features below.
columns_to_fill = ['company_size', 'organization_type', 'predicted_stability', 'number_of_similar_companies']
new_companies = fill_missing_values(new_companies, columns_to_fill)

# Numeric features: cast each column to int and store it as "<name>_feature".
for numeric_column in ('predicted_stability', 'number_of_similar_companies'):
    new_companies = new_companies.withColumn(
        f'{numeric_column}_feature',
        col(numeric_column).cast("int"),
    )

# company_size -> index, using the company_size_dict mapping.
new_companies = category_to_index(new_companies, 'company_size', company_size_dict)

# organization_type and meta_industry -> one-hot columns from their dictionaries.
new_companies = create_one_hot_from_dict(new_companies, 'organization_type', organization_type_dict)
new_companies = create_one_hot_from_dict(new_companies, 'meta_industry', meta_industries_dict)

# specialties -> text embedding features.
# NOTE(review): old_companies is passed first, presumably so the embedding stage
# matches the one used for training data; confirm in embedding_text_features.
new_companies = embedding_text_features(old_companies, new_companies, ['specialties'])

# Assemble every column whose name contains 'feature' into one "features" vector.
features_columns = [name for name in new_companies.columns if 'feature' in name]
assembler = VectorAssembler(outputCol="features", inputCols=features_columns)
new_companies = assembler.transform(new_companies)

new_companies.select('id', 'features').limit(5).display()

sent_small_bert_L2_128 download started this may take some time.
Approximate size to download 16.1 MB
[ | ][OK!]


id,features
28-research-inc,"Map(vectorType -> dense, length -> 152, values -> List(7.0, 10.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.064393162727356, 0.7143968343734741, 0.5241509079933167, -0.6406130194664001, -0.4820651710033417, -0.31723058223724365, -0.3947852551937103, -0.3411380648612976, -1.3921515941619873, -1.5019888877868652, 0.2138400673866272, 0.9520403742790222, 0.5377432703971863, -0.5264187455177307, 0.9428620934486389, -0.2981928884983063, -1.5826939344406128, 0.9418190717697144, -2.1611194610595703, 1.4397075176239014, 0.35751813650131226, -0.5008982419967651, 0.7560628652572632, 0.9848535656929016, 2.29754638671875, -0.9613281488418579, 0.6152336597442627, -1.2238579988479614, 0.2039899230003357, -2.390124559402466, -1.2434908151626587, -1.7995260953903198, -1.3016014099121094, 1.5023844242095947, -0.2026672065258026, -0.45338794589042664, -0.5167436003684998, 0.962124764919281, -1.303858757019043, -1.257920503616333, -0.4217424690723419, 0.17379410564899445, -0.8672763705253601, -0.9983976483345032, 0.07962798327207565, -1.113377332687378, -0.49949967861175537, -0.337821364402771, 0.12731625139713287, 0.3744203448295593, -0.32887905836105347, 0.7664983868598938, 0.8250643014907837, -0.4037265479564667, -1.6031090021133423, -0.039046160876750946, 0.9913279414176941, 0.28838279843330383, 0.5828094482421875, 1.5232658386230469, -0.4500977694988251, -0.3883506953716278, -0.7256516218185425, -0.023224465548992157, -1.3720935583114624, -1.3057318925857544, 0.2295370101928711, 1.0049102306365967, 0.16994470357894897, 0.9507079124450684, -0.4002135396003723, -0.4196605682373047, -0.5182173848152161, 0.4381556808948517, -1.7948323488235474, 1.0571529865264893, 0.4924606382846832, 0.017973685637116432, 2.4169158935546875, -0.9335283637046814, -0.4382433295249939, -0.515625, -0.20875521004199982, -0.22134999930858612, 0.7794089317321777, -0.9638808965682983, -1.8447508811950684, 
0.7349623441696167, 0.6650119423866272, -1.1374192237854004, 1.399522066116333, 1.3147553205490112, 0.11778252571821213, 1.6046675443649292, 0.14102590084075928, 1.1515684127807617, 0.7832249402999878, -0.15692463517189026, -0.5074137449264526, 1.4322956800460815, -0.6361590027809143, 0.9738180637359619, 0.29875659942626953, 0.09705051779747009, 1.2342935800552368, -1.0303230285644531, -2.724984645843506, 1.3756096363067627, -0.7404832243919373, -0.18294018507003784, 0.8243940472602844, 0.05158580467104912, -0.08523862063884735, 0.5111402869224548, 0.7751591205596924, 0.46444055438041687, -0.23535938560962677, 0.9081387519836426, 0.35983845591545105, 0.42357540130615234, 0.9399452805519104, 0.3212769627571106, 1.2495023012161255, -1.3127309083938599, -1.6020816564559937, -0.5511237382888794, -1.5033149719238281, 1.3086556196212769))"
3bulldogs,"Map(vectorType -> dense, length -> 152, values -> List(6.0, 7.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0880430936813354, 0.9748651385307312, -1.6383212804794312, -2.8524060249328613, -0.7699288129806519, 0.34203633666038513, -0.11622174829244614, 0.33084622025489807, -1.3546890020370483, 1.4995778799057007, 1.3680124282836914, -0.24632102251052856, -2.2074813842773438, -0.045035574585199356, 1.1248810291290283, -1.528263807296753, -0.4127964377403259, 0.14344491064548492, -2.2425012588500977, 1.319778561592102, 0.03117082081735134, -0.22839972376823425, 2.3721423149108887, 1.7022351026535034, 0.8859752416610718, 0.09486593306064606, 0.5028294324874878, 0.39916419982910156, 0.19746647775173187, 0.7801249027252197, -1.1860527992248535, -1.2983670234680176, -0.5965096950531006, -0.03278372064232826, 1.1925708055496216, -0.7110944986343384, 0.6308732032775879, -1.0647984743118286, -1.4240450859069824, -1.4944943189620972, 1.2783164978027344, -0.5234233736991882, 1.5809653997421265, -0.5956752300262451, -0.0438213013112545, -1.3817330598831177, -0.35168176889419556, -0.7288061380386353, 0.13635402917861938, -0.11179183423519135, -0.0346454493701458, 1.7446058988571167, 0.052011583000421524, -0.7585667371749878, 0.9512828588485718, -1.549593448638916, 0.4671468734741211, 1.0285524129867554, -0.9658661484718323, 1.7479695081710815, 1.144322395324707, -1.1514984369277954, -0.7399274706840515, 0.37894874811172485, -0.8895686864852905, -0.036426741629838943, 0.3495360314846039, 0.4954082667827606, 1.3361049890518188, 0.05465412512421608, -0.24866391718387604, 0.5219517350196838, -0.4209551513195038, -0.49579620361328125, -0.6320165395736694, 0.12240076810121536, 0.030097099021077156, 0.048711247742176056, 0.9925776124000549, 1.464788794517517, -0.1913171261548996, 0.3343353569507599, -0.3499641418457031, 0.07155995070934296, 0.07163169980049133, 0.4213002026081085, -0.9910388588905334, 
-0.9477798342704773, 2.0224716663360596, -1.9665484428405762, 1.7128878831863403, 0.5475678443908691, -1.9565513134002686, 1.3964498043060303, 1.1701126098632812, 0.9551424980163574, 0.5753591060638428, -0.993553102016449, -0.06384933739900589, 0.32841140031814575, -0.6641756296157837, -0.22112040221691132, 0.2555719017982483, -1.1907258033752441, 0.7551881074905396, -1.784416913986206, -1.5583851337432861, 0.7174734473228455, 0.32014843821525574, -1.767082691192627, -0.4264117479324341, 1.2929893732070923, 0.15330059826374054, -0.1489235758781433, 0.6886001825332642, 1.360609769821167, -0.6331226825714111, -0.4651247262954712, 0.4654417932033539, 0.1115146204829216, -0.3872726261615753, 1.0906171798706055, 1.153778076171875, -1.3542052507400513, -1.4917635917663574, -1.517329454421997, -1.3166322708129883, -0.42787879705429077))"
80-20ai,"Map(vectorType -> dense, length -> 152, values -> List(9.0, 10.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, -1.4328221082687378, 0.3101200461387634, 0.31897324323654175, -2.17964506149292, -0.17997559905052185, -0.31348100304603577, -0.4698832333087921, 0.03402703255414963, -1.1901644468307495, -1.0344538688659668, 0.9596722722053528, 0.45014098286628723, -0.15635567903518677, -0.31248947978019714, 0.9423739314079285, -0.7765796184539795, -1.710033893585205, 0.6983188986778259, -1.7383286952972412, 1.427777647972107, 0.386130690574646, -0.4265947937965393, 0.8729337453842163, 0.7269687056541443, 2.1100785732269287, -0.834663987159729, 0.7380581498146057, -0.6350969076156616, 0.7232962250709534, -1.974746823310852, -1.0256693363189697, -1.6305142641067505, -1.2537604570388794, 1.1928833723068237, -0.17346304655075073, -0.27466869354248047, 0.23127791285514832, 0.6191603541374207, -1.3887873888015747, -1.7524644136428833, -0.33602672815322876, -0.5348588824272156, 0.6765314340591431, -1.0507757663726807, -0.16278935968875885, -1.0714504718780518, -0.2795238792896271, -0.5197085738182068, 0.40939000248908997, 0.12402006983757019, -0.7204042077064514, 0.8436415791511536, 0.6447697877883911, -0.302852600812912, -1.1444517374038696, -0.3705819845199585, 1.0765159130096436, 0.2931816577911377, -0.153423473238945, 1.6620631217956543, -0.3033711612224579, -0.5808911919593811, -0.7351488471031189, -0.03908117115497589, -1.3694039583206177, -1.345217227935791, 0.2625097930431366, 1.208558201789856, 1.1393873691558838, 0.6113081574440002, -0.1560322344303131, -0.22713878750801086, -0.8103300333023071, 0.39239805936813354, -1.4184571504592896, 0.9514626264572144, 0.4411861002445221, 0.008668443188071251, 2.645331859588623, -0.018636684864759445, -0.5484481453895569, -0.033642470836639404, -0.20022433996200562, -0.29512524604797363, 0.0660066157579422, -1.0284864902496338, -2.0845961570739746, 
0.4835417866706848, 1.686158537864685, -1.7922292947769165, 1.1608014106750488, 1.22713041305542, -0.14649330079555511, 1.807393193244934, 0.5620013475418091, 1.1685009002685547, 1.0057300329208374, -0.4188541769981384, -0.6056436896324158, 1.4038898944854736, -1.0899361371994019, 0.48662295937538147, 0.2840862572193146, 0.26085755228996277, 1.2365357875823975, -1.1487665176391602, -2.305699586868286, 1.0204801559448242, -0.610020101070404, -1.1058413982391357, 1.0506080389022827, 0.13847120106220245, -0.2666890621185303, 0.3651489019393921, 1.3634488582611084, 1.0114030838012695, 0.059521596878767014, 0.6735498905181885, 0.49055737257003784, 0.33717456459999084, 0.4537421464920044, 0.20582127571105957, 1.380193829536438, -1.604827880859375, -1.2939932346343994, -0.9802761673927307, -1.4439786672592163, 1.5208220481872559))"
a1vendingcompany,"Map(vectorType -> dense, length -> 152, values -> List(10.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0880430936813354, 0.9748651385307312, -1.6383212804794312, -2.8524060249328613, -0.7699288129806519, 0.34203633666038513, -0.11622174829244614, 0.33084622025489807, -1.3546890020370483, 1.4995778799057007, 1.3680124282836914, -0.24632102251052856, -2.2074813842773438, -0.045035574585199356, 1.1248810291290283, -1.528263807296753, -0.4127964377403259, 0.14344491064548492, -2.2425012588500977, 1.319778561592102, 0.03117082081735134, -0.22839972376823425, 2.3721423149108887, 1.7022351026535034, 0.8859752416610718, 0.09486593306064606, 0.5028294324874878, 0.39916419982910156, 0.19746647775173187, 0.7801249027252197, -1.1860527992248535, -1.2983670234680176, -0.5965096950531006, -0.03278372064232826, 1.1925708055496216, -0.7110944986343384, 0.6308732032775879, -1.0647984743118286, -1.4240450859069824, -1.4944943189620972, 1.2783164978027344, -0.5234233736991882, 1.5809653997421265, -0.5956752300262451, -0.0438213013112545, -1.3817330598831177, -0.35168176889419556, -0.7288061380386353, 0.13635402917861938, -0.11179183423519135, -0.0346454493701458, 1.7446058988571167, 0.052011583000421524, -0.7585667371749878, 0.9512828588485718, -1.549593448638916, 0.4671468734741211, 1.0285524129867554, -0.9658661484718323, 1.7479695081710815, 1.144322395324707, -1.1514984369277954, -0.7399274706840515, 0.37894874811172485, -0.8895686864852905, -0.036426741629838943, 0.3495360314846039, 0.4954082667827606, 1.3361049890518188, 0.05465412512421608, -0.24866391718387604, 0.5219517350196838, -0.4209551513195038, -0.49579620361328125, -0.6320165395736694, 0.12240076810121536, 0.030097099021077156, 0.048711247742176056, 0.9925776124000549, 1.464788794517517, -0.1913171261548996, 0.3343353569507599, -0.3499641418457031, 0.07155995070934296, 0.07163169980049133, 0.4213002026081085, 
-0.9910388588905334, -0.9477798342704773, 2.0224716663360596, -1.9665484428405762, 1.7128878831863403, 0.5475678443908691, -1.9565513134002686, 1.3964498043060303, 1.1701126098632812, 0.9551424980163574, 0.5753591060638428, -0.993553102016449, -0.06384933739900589, 0.32841140031814575, -0.6641756296157837, -0.22112040221691132, 0.2555719017982483, -1.1907258033752441, 0.7551881074905396, -1.784416913986206, -1.5583851337432861, 0.7174734473228455, 0.32014843821525574, -1.767082691192627, -0.4264117479324341, 1.2929893732070923, 0.15330059826374054, -0.1489235758781433, 0.6886001825332642, 1.360609769821167, -0.6331226825714111, -0.4651247262954712, 0.4654417932033539, 0.1115146204829216, -0.3872726261615753, 1.0906171798706055, 1.153778076171875, -1.3542052507400513, -1.4917635917663574, -1.517329454421997, -1.3166322708129883, -0.42787879705429077))"
acquireinvest,"Map(vectorType -> dense, length -> 152, values -> List(7.0, 10.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -1.0760215520858765, 0.810135543346405, 0.43888071179389954, -0.5114666223526001, -0.46472838521003723, -0.3699313998222351, -0.23785631358623505, -0.17564129829406738, -1.34842050075531, -1.41431725025177, 0.4110310673713684, 0.951924741268158, 0.6780173778533936, -0.566907525062561, 0.790629506111145, -0.7145159244537354, -1.4638843536376953, 0.6798458695411682, -1.9752562046051025, 1.4099549055099487, 0.7404004335403442, -0.2715577483177185, 0.8783643841743469, 0.7769179940223694, 2.4676642417907715, -0.8461028337478638, 0.6456700563430786, -1.1115233898162842, 0.25514212250709534, -2.37601637840271, -1.1417033672332764, -2.1213505268096924, -1.0524858236312866, 1.2969635725021362, -0.32875922322273254, -0.22183293104171753, -0.7935081124305725, 0.8745093941688538, -1.4448142051696777, -1.249571681022644, -0.4392249882221222, 0.49381589889526367, -0.7035412192344666, -0.9530357718467712, 0.22420842945575714, -0.9485825896263123, -0.5232062935829163, -0.5515403151512146, -0.04287499934434891, 0.39219799637794495, -0.28163906931877136, 0.8325587511062622, 0.7426681518554688, -0.3007405996322632, -1.538993000984192, -0.17401312291622162, 1.0098586082458496, -0.13511410355567932, 0.924457848072052, 1.6885247230529785, -0.12174022197723389, -0.1797284483909607, -1.0804375410079956, -0.651276171207428, -1.2452454566955566, -1.131127953529358, 0.08642978966236115, 1.2325785160064697, -0.5281150341033936, 0.7984940409660339, -0.2802892029285431, 0.1326921284198761, -0.6166444420814514, 0.34875813126564026, -1.6534137725830078, 1.039779543876648, 0.5904862284660339, 0.027106724679470062, 2.7132349014282227, -0.7065028548240662, -0.4461170434951782, -0.5602391958236694, -0.30229976773262024, -0.4775552451610565, 0.7786594033241272, -0.9817125201225281, -1.5700746774673462, 
0.8301824331283569, 0.6277998089790344, -1.1789536476135254, 1.1453242301940918, 1.065293312072754, 0.31241974234580994, 1.1406102180480957, 0.014753096736967564, 1.2653071880340576, 0.7346631288528442, -0.2232830971479416, -0.10918287187814713, 1.293509602546692, -0.6024278402328491, 0.8934301137924194, 0.06127866730093956, -0.09593893587589264, 0.9788932204246521, -0.9699020385742188, -2.4596312046051025, 0.9185928106307983, -0.6895471811294556, -0.3189510107040405, 0.6499835848808289, 0.22785495221614838, 0.022041747346520424, 0.7550154328346252, 0.9978602528572083, 0.401589035987854, 6.313830963335931E-4, 0.8109200596809387, 0.15828633308410645, 0.31235426664352417, 1.2486960887908936, 0.3848695158958435, 1.2441867589950562, -1.2387882471084595, -1.8527177572250366, -0.759900689125061, -1.7372918128967285, 1.399350643157959))"


# New_companies Predict Popularity 

In [0]:
# Run the fitted classifier on the new companies, then map the numeric
# 'prediction' column to a 'predicted_popularity' category via label_dict.
preds = index_to_category(
    trained_model.transform(new_companies),
    'prediction',
    'predicted_popularity',
    label_dict,
)

# Preview the first ten predictions.
preview = preds.select('id', 'predicted_popularity', 'prediction')
preview.limit(10).display()

id,predicted_popularity,prediction
redwood-services-llc,Neutral Popularity,1.0
bunnystudio,Very Popular,2.0
splitappstore,Very Popular,2.0
test-banks,Very Popular,2.0
workshoplocal,Neutral Popularity,1.0
eucalyptushealthcare,Very Popular,2.0
shopclairvaux,Very Popular,2.0
theculturexyz,Very Popular,2.0
in-the-worx,Neutral Popularity,1.0
vita-health,Neutral Popularity,1.0


In [0]:
# Persist the id -> predicted_popularity mapping as CSV with a header row.
# mode("overwrite") replaces the output of a previous run; with Spark's default
# save mode ("errorifexists") re-running this cell fails once the path exists.
preds.select('id', 'predicted_popularity').write.mode("overwrite").csv("/dbfs/FileStore/shared_uploads/omriitzhaki@campus.technion.ac.il/predicted_popularity_df", header=True)