In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import re
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
import gensim
from sklearn.decomposition import TruncatedSVD
from collections import Counter

In [2]:
# Read the dataset 
df = pd.read_excel('labeled_data_whole.xlsx')

# Convert text to lowercase
df['text'] = df['text'].str.lower()

# Remove special characters and numbers
df['text'] = df['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# Print the preprocessed dataset
df

Unnamed: 0.1,Unnamed: 0,file_name,text,Sales,filtered_text
0,0,SOM_118_00055_FA163EF3DB12-1b43-e3615700-3dca9...,brot urlaub ihre mein name wie kann ich weiter...,0,
1,1,SOM_TPF_00101_FA163E22F5B7-1af3-de819700-4197c...,einen schnen guten tag lauditag du stehst ja m...,0,lauditag stehst
2,2,SOM_141_00085_FA163E622DEB-1b45-68d0e700-3c9a0...,guten tag und herzlich willkommen bei o sie sp...,0,eta kumpel
3,3,SOM_LUB_00112_FA163E22F5B7-1af3-dc815700-45c46...,hallo herzlich willkommen bei der service sie ...,0,bert bislang
4,4,SOM_AMV_00100_FA163E153AE3-1bb7-3f731700-3fe36...,hallo schnen guten tag mein name die firma ott...,0,
...,...,...,...,...,...
1993,395,SOM_LUB_00246_FA163EF3DB12-1b43-e3615700-3be4f...,herzlich willkommen bei dem otto service sie s...,0,pending eingeschränkt eingeschränkt vorteilen
1994,396,SOM_VYD_00630_FA163ED88855-1bef-b9321700-412dc...,herzlich willkommen bei o mein name wie kann i...,0,bahnhofstraße bahnhof bahnhofstraße ttt inhous...
1995,397,SOM_LUB_00379_FA163E622DEB-1b45-6850d700-3e961...,herzlich willkommen bei o sie sprechen mit ihr...,0,rücksetzung aufgehängt rücksetzung frühen gene...
1996,398,SOM_TPF_00515_FA163E56E95C-1b1d-4929d700-3aab3...,herzlich willkommen bei blau ich werde am buck...,1,mso


In [11]:
df['text'][0]

'brot urlaub ihre mein name wie kann ich weiterhelfen \n  ja sind sie bei mir leider verkehrt ich leit direkt an die kollegen weiter nicht auflegen \n  das wre optimal ja dann gebe ich den kollegen das weiter \n  jetzt leite sie weiter ja nicht auflegen bitte tschau'

In [3]:
df['text'][1]

'einen schnen guten tag lauditag du stehst ja mein name wie kann ich ihnen helfen\n ja\n o k geht es um den rufnummer am ende\n sie wollen diese prepaid karte registrieren richtig\n also bei der persnliche kundenkennzahl knnen sie eine stellige nummer hinterlegen was sie wollen\n einfach eine stellige nummer ja\n ja haben sie vielleicht weitere fragen\n o k\n nee nee nee das ist so sie selbst hinterlegen o k dann wenn sie keine weitere fragen haben wnsche ich ihnen das war leicht schnen tag noch\n tschss'

In [3]:
# Load the spaCy German language model
nlp = spacy.load('de_core_news_sm')
# Create an empty list to store the updated text without proper names
updated_texts = []
# Iterate over the text column and remove proper names using spaCy
for text in df['text']:
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.pos_ != 'PROPN':
            tokens.append(token.text)
    updated_texts.append(' '.join(tokens))
# Update the 'text' column with the modified text
df['text'] = updated_texts

# Print the updated dataset
df.head()

Unnamed: 0.1,Unnamed: 0,file_name,text,Sales,filtered_text
0,0,SOM_118_00055_FA163EF3DB12-1b43-e3615700-3dca9...,brot urlaub ihre mein name wie kann ich weiter...,0,
1,1,SOM_TPF_00101_FA163E22F5B7-1af3-de819700-4197c...,einen schnen guten tag lauditag du stehst ja m...,0,lauditag stehst
2,2,SOM_141_00085_FA163E622DEB-1b45-68d0e700-3c9a0...,guten tag und herzlich willkommen bei sie spre...,0,eta kumpel
3,3,SOM_LUB_00112_FA163E22F5B7-1af3-dc815700-45c46...,herzlich willkommen bei der service sie sprech...,0,bert bislang
4,4,SOM_AMV_00100_FA163E153AE3-1bb7-3f731700-3fe36...,hallo schnen guten tag mein name die firma was...,0,


In [4]:
sales_1 = df[df['Sales'] == 1]['text']
sales_0 = df[df['Sales'] == 0]['text']


In [5]:
df_sales_1 = pd.DataFrame({'text': sales_1})
df_sales_0 = pd.DataFrame({'text': sales_0})

In [27]:
total_sales_text = len(df_sales_1)
total_non_sales_text = len(df_sales_0)
total = len(df['text'] )
dist_for_sales = total_sales_text/ total
dist_for_non_sales = total_non_sales_text/ total

print('sales:',dist_for_sales* 100)
print('Non sales:',dist_for_non_sales* 100)

sales: 23.223223223223226
Non sales: 76.77677677677679


In [31]:
# Tokenize and create a dictionary
# Topic for text
texts = [el.split() for el in list(df["text"].to_numpy())]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Perform LDA
num_topics = 4
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Extract topic distributions and labels
topic_distributions = [lda_model.get_document_topics(text) for text in corpus]
topic_labels = ["Verkauf", "Dienstleistung", "Beschwerden", "Vertrag"]

# Calculate percentage distribution
total_documents_text = len(topic_distributions)
topic_counts = Counter([topic for doc_topics in topic_distributions for topic, _ in doc_topics])
topic_percentages = {label: (count / total_documents_text) * 100 for label, count in topic_counts.items()}

# Print the percentage distribution for each topic
for label, percentage in topic_percentages.items():
    topic_label = topic_labels[label]
    print(f"{topic_label.capitalize()} topics: {percentage:.2f}%")


Verkauf topics: 89.39%
Vertrag topics: 99.05%
Dienstleistung topics: 78.73%
Beschwerden topics: 95.65%


In [34]:
# Topic for sales category
# Tokenize and create a dictionary
texts = [el.split() for el in list(df_sales_1["text"].to_numpy())]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Perform LDA
num_topics = 4
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Extract topic distributions and labels
topic_distributions = [lda_model.get_document_topics(df_sales_1) for df_sales_1 in corpus]
topic_labels = ["Verkauf", "Dienstleistung", "Beschwerden", "Vertrag"]

# Calculate percentage distribution
total_documents = len(topic_distributions)
topic_counts = Counter([topic for doc_topics in topic_distributions for topic, _ in doc_topics])
topic_percentages = {label: (count / total_documents_text) * 100 for label, count in topic_counts.items()}

# Print the percentage distribution for each topic
for label, percentage in topic_percentages.items():
    topic_label = topic_labels[label]  # Get the corresponding label
    print(f"{topic_label.capitalize()} topics: {percentage:.2f}%")


Verkauf topics: 16.52%
Dienstleistung topics: 22.57%
Vertrag topics: 23.07%
Beschwerden topics: 22.32%


In [33]:
# Topic for Non sales category
# Tokenize and create a dictionary
# texts = [df_sales_0.split() for df_sales_0 in df_sales_0]
texts = [el.split() for el in list(df_sales_0["text"].to_numpy())]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Perform LDA
num_topics = 4
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Extract topic distributions and labels
topic_distributions = [lda_model.get_document_topics(df_sales_0) for df_sales_0 in corpus]
topic_labels = ["Verkauf", "Dienstleistung", "Beschwerden", "Vertrag"]

# Calculate percentage distribution
total_documents = len(topic_distributions)
topic_counts = Counter([topic for doc_topics in topic_distributions for topic, _ in doc_topics])
topic_percentages = {label: (count / total_documents_text) * 100 for label, count in topic_counts.items()}

# Print the percentage distribution for each topic
for label, percentage in topic_percentages.items():
    topic_label = topic_labels[label]  # Get the corresponding label
    print(f"{topic_label.capitalize()} topics: {percentage:.2f}%")

Verkauf topics: 70.62%
Beschwerden topics: 50.65%
Vertrag topics: 75.58%
Dienstleistung topics: 4.10%


In [14]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [15]:
# Preprocessing steps (you can customize this based on your requirements)
def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    text = text.lower().replace('[^\w\s]', '')
    # Remove stopwords (you may need to download the stopwords list)
    stopwords = set(stopwords.words('german'))
    text = ' '.join(word for word in text.split() if word not in stopwords)
    return text


 <h5> Contextualized Topic Models </h5>