In [12]:
import pandas as pd
import numpy as np
data = pd.read_csv('preprocessed_data.csv')
data

Unnamed: 0,idea,deal,pitcher_ask_amount,ask_valuation,deal_amount,deal_valuation
0,Frozen Momos,1,50.0,1000.00,75.0,468.75
1,Renting e-bike for mobility in private spaces,1,40.0,266.67,40.0,80.00
2,Detachable Sleeves,1,25.0,250.00,25.0,83.33
3,Healthy Potato Chips,1,70.0,7000.00,70.0,2545.45
4,Disposable Urine Bag,1,75.0,1875.00,75.0,1250.00
...,...,...,...,...,...,...
60,Insoles,1,40.0,400.00,40.0,160.00
61,Sportswear,1,40.0,2000.00,60.0,600.00
62,VR,1,35.0,3500.00,10.0,100.00
63,Bacon Jams,1,40.0,400.00,40.0,200.00


Going ahead with NLP, using BERT

In [13]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from sklearn.decomposition import PCA

In [14]:
# load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = TFBertModel.from_pretrained(model_name)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
# define a function to encode a single text string using BERT
def encode_text(text):
    # tokenize the text and convert to input features
    input_ids = tokenizer.encode_plus(
        text, 
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        max_length=128
    )['input_ids']
    
    # convert to a tensor and get the output features from the BERT model
    input_ids = tf.convert_to_tensor([input_ids])
    outputs = model(input_ids)
    features = outputs.last_hidden_state.numpy()[0][0] # get the final hidden state of the [CLS] token
    
    return features

In [16]:
# encode the text using the BERT model
data['bert_features'] = data['idea'].apply(encode_text)

In [17]:
data

Unnamed: 0,idea,deal,pitcher_ask_amount,ask_valuation,deal_amount,deal_valuation,bert_features
0,Frozen Momos,1,50.0,1000.00,75.0,468.75,"[-0.40300342, 0.4658886, 0.5529136, 0.5306726,..."
1,Renting e-bike for mobility in private spaces,1,40.0,266.67,40.0,80.00,"[-0.54163396, 0.27552167, 0.43653956, 0.616361..."
2,Detachable Sleeves,1,25.0,250.00,25.0,83.33,"[-0.41590324, 0.41704273, 0.5447339, 0.4631031..."
3,Healthy Potato Chips,1,70.0,7000.00,70.0,2545.45,"[-0.45363718, 0.513287, 0.5972315, 0.52046263,..."
4,Disposable Urine Bag,1,75.0,1875.00,75.0,1250.00,"[-0.479359, 0.45520285, 0.5314713, 0.5108121, ..."
...,...,...,...,...,...,...,...
60,Insoles,1,40.0,400.00,40.0,160.00,"[-0.44151, 0.54855096, 0.529613, 0.555401, 0.0..."
61,Sportswear,1,40.0,2000.00,60.0,600.00,"[-0.44579557, 0.4920467, 0.49788994, 0.5775838..."
62,VR,1,35.0,3500.00,10.0,100.00,"[-0.5140173, 0.44801217, 0.5294316, 0.49346206..."
63,Bacon Jams,1,40.0,400.00,40.0,200.00,"[-0.46665272, 0.42570952, 0.5205811, 0.5862598..."


In [18]:
pca = PCA(n_components=2)
data['bert_features_pca'] = None
data['bert_features_pca'] = pca.fit_transform(data['bert_features'].tolist())

print(data)

                                             idea  deal  pitcher_ask_amount  \
0                                    Frozen Momos     1                50.0   
1   Renting e-bike for mobility in private spaces     1                40.0   
2                              Detachable Sleeves     1                25.0   
3                            Healthy Potato Chips     1                70.0   
4                            Disposable Urine Bag     1                75.0   
..                                            ...   ...                 ...   
60                                        Insoles     1                40.0   
61                                     Sportswear     1                40.0   
62                                             VR     1                35.0   
63                                     Bacon Jams     1                40.0   
64                                       Lemonade     1                40.0   

    ask_valuation  deal_amount  deal_valuation  \
0

In [24]:
df = pd.DataFrame(data)


In [30]:
df = df.drop(['bert_features'], axis=1)

In [31]:
df.to_csv('nlp_model.csv')
