In [4]:
from sentence_transformers import SentenceTransformer, models
import numpy as np
import pandas as pd
from os.path import abspath
import os

In [5]:
# Read the transcript table from disk into a DataFrame

transcripts = pd.read_csv(filepath_or_buffer="transcript/transcript.csv")

In [6]:
# Preview the loaded transcript table.
transcripts

Unnamed: 0,Name,Gender,Transcript
0,P_001_Day_2_Hour_16,f,Du musst dann da noch vom Raf.. Raffi… Schak....
1,P_001_Day_2_Hour_18,f,Danke. Ist jetzt bei dir noch immer nichts gek...
2,P_001_Day_2_Hour_9,f,"Hast du geschaut, die Navigation, auf dem N..."
3,P_001_Day_3_Hour_16,f,"Man könnte ja, aber du. Nicht, nein. Die komme..."
4,P_001_Day_3_Hour_17,f,"Das ist höher oben. Mhm ist ja hier, geknecht..."
...,...,...,...
374,Z_012_Day_1_Hour_9,f,"Ja. Das ist schon doof, mit dieser Frisur hier..."
375,Z_013_Day_4_Hour_21,m,Du kannst . Ich muss schauen wegen meinter M...
376,Z_013_Day_3_Hour_21,m,Ich habe deine Socken alle gewendet um einen d...
377,Z_013_Day_2_Hour_21,m,schon gemerkt. Im Örlikon getroffen. Weil S...


In [7]:
def extract_lexical_features_bert(data, model_name='bert-base-german-cased',
                                  max_seq_length=512, model=None):
    """Encode each text in ``data`` into one embedding vector.

    Parameters
    ----------
    data : str or iterable of str
        The texts to embed (e.g. a pandas Series, NumPy array or list).
        A single string is treated as one text.
    model_name : str, optional
        Checkpoint handed to ``SentenceTransformer`` when no ``model`` is
        supplied. Defaults to the German BERT checkpoint used originally.
    max_seq_length : int, optional
        Value assigned to ``max_seq_length`` on the model created here.
        NOTE(review): texts longer than this are presumably truncated by the
        encoder -- confirm this is acceptable for long transcripts.
    model : object, optional
        An already-loaded encoder exposing ``encode(list_of_str)``. When
        given it is used as-is (its ``max_seq_length`` is not modified), which
        avoids reloading the checkpoint on every call.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the encoder output, one row per input text.
    """
    print ('Extracting linguistic features')

    # Hand the encoder a plain list so that any indexing it performs is
    # positional. NOTE(review): a pandas Series with a non-default index would
    # otherwise be looked up by label inside ``encode`` -- confirm against the
    # installed sentence-transformers version.
    if isinstance(data, str):
        sentences = [data]
    else:
        sentences = list(data)

    # Keep the original message; fall back to the item count for inputs that
    # have no ``shape`` attribute (lists, tuples, single strings).
    print ('Data shape' + str(getattr(data, 'shape', (len(sentences),))))

    if model is None:
        # Load the requested checkpoint only when the caller did not pass one.
        model = SentenceTransformer(model_name)
        model.max_seq_length = max_seq_length

    # Get sentence embeddings
    sentence_embeddings = model.encode(sentences)

    features = np.array(sentence_embeddings)
    return features

In [8]:
# Compute one embedding vector per transcript

text = transcripts.loc[:, 'Transcript']
sentence_embeddings = extract_lexical_features_bert(data=text)


Extracting linguistic features
Data shape(379,)


Some weights of the model checkpoint at C:\Users\madha/.cache\torch\sentence_transformers\bert-base-german-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Inspect the dimensions of the embedding matrix.
sentence_embeddings.shape

(379, 768)

In [10]:
# Name the embedding dimensions 'Col 0' ... 'Col N-1' and prepend the metadata columns.
# The frame is built from the numeric matrix and the metadata is concatenated
# column-wise afterwards: stacking the name/gender strings and the floats in a
# single NumPy array (np.c_) casts every embedding value to a string.

feature_columns = ['Col ' + str(i) for i in range(sentence_embeddings.shape[1])]
meta_columns = ['Name','Gender']
columns = meta_columns + feature_columns

# Rows are paired by position, so both inputs must have the same length.
assert len(transcripts) == len(sentence_embeddings), "one embedding per transcript expected"

text_embeddings = pd.concat(
    [
        transcripts[meta_columns].reset_index(drop=True),
        pd.DataFrame(sentence_embeddings, columns=feature_columns),
    ],
    axis=1,
)

# Kept so that cells relying on this name still find it; now a mixed
# (object) array instead of an all-string one.
linguistic_features = text_embeddings.to_numpy()

final_text_embeddings = text_embeddings#.set_index(transcripts['Name'])


In [11]:
# Persist the feature table (without the row index) next to the transcripts
final_text_embeddings.to_csv(
    path_or_buf="transcript/linguistic_features.csv",
    index=False,
)

In [12]:
# Preview the assembled feature table.
final_text_embeddings

Unnamed: 0,Name,Gender,Col 0,Col 1,Col 2,Col 3,Col 4,Col 5,Col 6,Col 7,...,Col 758,Col 759,Col 760,Col 761,Col 762,Col 763,Col 764,Col 765,Col 766,Col 767
0,P_001_Day_2_Hour_16,f,-0.10934576,0.51448447,0.58788335,0.47795188,-0.23530589,0.52904135,-0.043852426,-0.39632523,...,0.32933986,-0.04352364,0.14901556,-0.034356922,0.37425494,0.2776391,-0.37457928,0.17784938,-0.7587422,-0.32693955
1,P_001_Day_2_Hour_18,f,0.0919982,0.15951025,0.11538646,-0.54148126,0.038503084,0.10742389,-0.25317532,-0.11884922,...,0.19106993,-0.40968338,-0.04548048,-0.06473017,0.6153734,0.2636888,-0.5928029,0.06417591,-0.20346835,-0.18393043
2,P_001_Day_2_Hour_9,f,0.16676906,0.19802737,0.23412278,-0.34148905,0.18960989,0.36567077,-0.30855578,-0.15476553,...,-0.2939485,-0.41491106,0.10752336,0.024196172,0.64118785,0.17112447,-0.45588988,0.20262608,-0.07689462,-0.2102066
3,P_001_Day_3_Hour_16,f,0.073364526,0.2771481,0.030253548,-0.22401893,0.093036994,0.28969422,-0.31331837,-0.09366622,...,0.0761941,-0.569562,0.42742562,-0.08653099,0.6035017,0.2525213,-0.64217377,0.20318565,-0.27892232,-0.17743134
4,P_001_Day_3_Hour_17,f,-0.09212922,0.40236706,0.13525411,-0.14321545,0.17308985,0.14524534,-0.29179662,-0.30770937,...,0.23374303,-0.24998431,0.07988383,0.056445077,0.7823024,0.06274462,-0.52613175,-0.16181064,-0.59683293,-0.113389604
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,Z_012_Day_1_Hour_9,f,-0.068548664,0.598269,0.19514273,-0.06294471,0.022161322,0.2207472,-0.15480214,-0.21393627,...,-0.05183615,-0.189059,-0.051379,0.16382639,0.6861117,0.07948355,-0.47620484,0.19794281,-0.5342317,-0.10934121
375,Z_013_Day_4_Hour_21,m,-0.16369197,0.25652868,0.23037142,0.03939024,-0.11214603,0.24176893,-0.27653292,-0.5165737,...,-0.054700494,-0.129562,0.07826996,0.07480139,0.55342937,0.1312222,-0.66246516,-0.12628378,-0.44106007,0.20845033
376,Z_013_Day_3_Hour_21,m,-0.15620093,0.19891244,0.107177906,-0.33841166,-0.2519372,0.17334884,-0.38203114,-0.094052985,...,0.12500043,-0.0698659,-0.25454795,-0.41095716,0.6094153,0.21186256,-0.3433055,-0.19645132,-0.2880487,0.3413718
377,Z_013_Day_2_Hour_21,m,0.023204628,0.4558903,0.2342447,-0.060161863,0.0005846197,-0.02846886,-0.27697232,-0.40715563,...,0.22314075,-0.2204492,-0.11217599,0.14325716,0.56962436,0.34949574,-0.6150134,0.22193314,-0.5226695,-0.14676979
