## Description
This notebook takes a pretrained SBERT (Sentence-BERT) model and a predefined pool of mobility keywords. The word vectors (embeddings) of the mobility keywords are compared with the word vectors of the German dataset titles. If the highest cosine similarity between a title and any keyword is at least 0.5 (50%), the dataset is labelled as mobility data.

#### Result
This approach seems to fit our purpose quite well. At first sight the results look correct, even though the keywords were generated by ChatGPT and have not been edited yet. So we will take it from here and build on it to improve it further. An open question is how the word vectors and the similarities can be visualised.

The code was created with the assistance of ChatGPT-4.

In [1]:
# Install libraries (only for Colab):
# !pip install sentence-transformers -q
# !pip install scikit-learn -q

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the model (automatically downloaded if not already available locally).
# The checkpoint name indicates a multilingual paraphrase model; the same
# instance is used to embed both the keyword pool and the dataset titles.
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define mobility words pool (read from file)
def load_mobility_keywords(file_path):
    """Read the mobility keyword pool from a UTF-8 text file.

    Each line of the file holds one keyword. Leading and trailing whitespace
    is removed from every line, and lines that are empty after stripping are
    skipped.

    Args:
        file_path: Path of the keyword file to read.

    Returns:
        A list with the non-empty, stripped lines in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Strip lazily, then keep only lines that still contain text.
        stripped_lines = map(str.strip, handle)
        return [term for term in stripped_lines if term]

# Load the mobility terms from the file (one keyword per line; the path is
# relative to the current working directory)
mobility_keywords_file = 'data/mobility_keywords_pool.txt'
mobility_keywords = load_mobility_keywords(mobility_keywords_file)

# mobility_keywords_pool.txt:
# Mobilität
# Verkehr
# Transport
# Fahrt
# Auto
# Fahrrad
# Öffentlicher Nahverkehr
# Pendeln
# Reisen
# Bus
# Bahn
# Verkehrsmittel
# Fahrzeug
# Mobilitätskonzept
# E-Scooter
# Flugzeug
# Taxi
# Schiff
# Mobilitätsplattform
# Verkehrsdaten
# Verkehrsinfrastruktur

In [3]:
# Convert keywords to vectors: the whole keyword list is encoded in a single
# call, and the result is passed to the dataset analysis further down
keyword_embeddings = model.encode(mobility_keywords)

In [4]:
# Function for similarity checking
def check_similarity(text, keyword_embeddings, model, threshold=0.5):
    """Score a single text against the keyword embeddings.

    The text is embedded with ``model.encode`` and compared with every entry
    of ``keyword_embeddings`` by cosine similarity. Only the best-matching
    keyword decides the outcome.

    Args:
        text: The string to classify.
        keyword_embeddings: Embeddings of the keyword pool, as returned by
            ``model.encode`` for the keyword list.
        model: Object whose ``encode`` method turns a list of strings into
            embeddings.
        threshold: Minimum best-match similarity (inclusive) for a positive
            result.

    Returns:
        A tuple ``(is_match, best_score)``: whether the highest similarity
        reaches ``threshold``, and that highest similarity itself.
    """
    # encode() is given a one-element list so the result is a 2-D batch,
    # which is the input shape cosine_similarity works on.
    query_embedding = model.encode([text])
    best_score = np.max(cosine_similarity(query_embedding, keyword_embeddings))
    is_match = best_score >= threshold
    return is_match, best_score

In [5]:
import pandas as pd
import os

# Function to analyze the dataset
def analyze_dataset(file_path, keyword_embeddings, model, threshold=0.5):
    """Label every entry in the first column of a CSV file as mobility data or not.

    The file is read with ``pandas.read_csv``. Each value of the first column
    is converted to ``str`` and scored with ``check_similarity``. Two columns
    are appended, ``'Analysis Result'`` and ``'Similarity Score'``, and the
    table is written to ``analysed_datasets.csv`` in the input file's folder.

    NOTE(review): ``read_csv`` is called with its defaults, so the first line
    of the file is presumably consumed as the header and not classified.
    Confirm the input really starts with a header row.

    Args:
        file_path: Path of the input file.
        keyword_embeddings: Keyword embeddings forwarded to ``check_similarity``.
        model: Embedding model forwarded to ``check_similarity``.
        threshold: Similarity threshold forwarded to ``check_similarity``.

    Returns:
        The path of the written output file.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the loaded table is empty.
    """
    # Guard clauses: fail early on a missing file or an empty table.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    frame = pd.read_csv(file_path)
    if frame.empty:
        raise ValueError("The file is empty or was not loaded correctly.")

    # One (is_mobility, score) pair per row of the first column.
    verdicts = [
        check_similarity(str(entry), keyword_embeddings, model, threshold)
        for entry in frame.iloc[:, 0]
    ]

    # Split the pairs into the two result columns.
    frame['Analysis Result'] = [
        'Mobility Data' if is_mobility else 'Not Mobility Data'
        for is_mobility, _ in verdicts
    ]
    frame['Similarity Score'] = [score for _, score in verdicts]

    # The output lands next to the input file under a fixed name.
    target_path = os.path.join(os.path.dirname(file_path), 'analysed_datasets.csv')
    frame.to_csv(target_path, index=False)

    print(f"Analysis completed. File saved at: {target_path}")
    return target_path

# paths
# Input file, relative to the current working directory. Despite the .txt
# extension it is parsed with pandas.read_csv inside analyze_dataset.
# NOTE(review): read_csv presumably treats the first line as a header here —
# confirm the file starts with a header row, otherwise the first title is
# not classified.
input_file_path = 'data/random_lines.txt' 

# Analyze the file (modify this line with your model & embeddings)
# No threshold is passed, so the default of 0.5 applies. The returned value is
# the path of the written result file.
output_file_path = analyze_dataset(input_file_path, keyword_embeddings, model)


Analysis completed. File saved at: data\analysed_datasets.csv
