# Current Topics and Projects
## Sentiment Analysis - Dictionary approach 

In [1]:
'''
This file is the 7th to be run.
Here we use a dictionary approach to identify comments that mention known politicians
'''

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from multiprocessing import Pool, cpu_count
import time
from tqdm import tqdm
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import ast

In [2]:
#!pip install fuzzywuzzy 
#!pip install python-Levenshtein

In [3]:
# Load the politicians table that serves as the name dictionary.
# The file is semicolon-separated and UTF-8 encoded.
# NOTE(review): both skiprows and header are 4 -- confirm against the file's
# preamble that the intended line is picked up as the column header.
politicians = pd.read_csv(
    'data/btw21_gewaehlte-fortschreibung_utf8.csv',
    delimiter=';',
    encoding='utf-8',
    skiprows=4,
    header=4,
)

politicians

Unnamed: 0,Wahlart,Wahltag,Titel,Namenszusatz,Nachname,Vornamen,Künstlername,Geschlecht,Geburtsjahr,PLZ,...,VerknGebietsname,VerknGebietLandAbk,VerknGruppenname,VerknListenplatz,VorpGewaehlt,WahltagGewaehlt,BeginnMitgliedschaftDatum,VerlustMitgliedschaftDatum,VerlustMitgliedschaftGrund,ListennachfolgeBekanntmachungsNr
0,BT,26.09.2021,,,Abdi,Sanae,,w,1986,50678,...,Nordrhein-Westfalen,NW,SPD,40.0,,X,26.10.2021,,,
1,BT,26.09.2021,,,Abel,Valentin Christian,,m,1991,74214,...,Schwäbisch Hall – Hohenlohe,BW,FDP,,,X,26.10.2021,,,
2,BT,26.09.2021,,,Abraham,Knut Friedrich Alexander,,m,1966,4916,...,Elbe-Elster – Oberspreewald-Lausitz II,BB,CDU,,,X,26.10.2021,,,
3,BT,26.09.2021,,,Adler,Katja,,w,1974,61440,...,Hochtaunus,HE,FDP,,,X,26.10.2021,,,
4,BT,26.09.2021,,,Aeffner,Stephanie,,w,1976,69214,...,Pforzheim,BW,GRÜNE,,,X,26.10.2021,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
746,BT,26.09.2021,,,Mende,Dirk-Ulrich August Heinrich,,m,1957,29225,...,Celle – Uelzen,NI,SPD,,,,25.01.2023,,,11.0
747,BT,26.09.2021,,,Bernstein,Melanie,,w,1976,23812,...,Plön – Neumünster,SH,CDU,,X,,06.02.2023,,,12.0
748,BT,26.09.2021,,,Föhr,Alexander Paul,,m,1980,69118,...,Heidelberg,BW,CDU,,,,01.03.2023,,,13.0
749,BT,26.09.2021,Dr.,,Rothfuß,Rainer,,m,1971,88131,...,Oberallgäu,BY,AfD,,,,02.03.2023,,,14.0


In [4]:
# Load the YouTube comments. The 'clean_tokens' column is read from the CSV as
# text, so each cell is parsed back into a Python object with ast.literal_eval.
df_com = pd.read_csv('data/comments_final.csv')
df_com['clean_tokens'] = df_com['clean_tokens'].map(ast.literal_eval)

In [6]:
def normalize_text(text):
    """Return *text* lowercased, with colons spaced out and punctuation removed.

    Colons become a single space; every other character that is neither a
    word character nor whitespace (hyphens, dots, ...) is deleted outright,
    so "Kramp-Karrenbauer" becomes "krampkarrenbauer".
    """
    lowered = text.lower().replace(':', ' ')
    return re.sub(r'[^\w\s]', '', lowered)


In [7]:
'''
preprocess politicians data 
'''
# Surnames as they appear in the politicians table.
names = politicians["Nachname"].tolist()

# Lowercase and strip punctuation so the names are comparable with tokens.
normalized_names = list(map(normalize_text, names))

# Umlaut -> digraph and digraph -> umlaut spellings, applied one at a time.
alternatives = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}
alternatives2 = {'ae': 'ä',  'oe': 'ö', 'ue': 'ü'}

# Each normalized surname is followed by one variant per matching
# replacement: first the umlaut -> digraph ones, then digraph -> umlaut.
updated_names = []
for surname in normalized_names:
    lowered = surname.lower()
    updated_names.append(lowered)
    for mapping in (alternatives, alternatives2):
        updated_names.extend(
            lowered.replace(source, target)
            for source, target in mapping.items()
            if source in lowered
        )
# Function to generate alternative names
def generate_alternatives(name):
    """Return the lowercase surname followed by its umlaut spelling variants.

    For every replacement pair (umlaut -> digraph and digraph -> umlaut) whose
    source occurs in the name, one variant with that single replacement
    applied is appended. Replacements are not combined with each other.

    Args:
        name: Surname in its original capitalisation, e.g. "Özdemir".

    Returns:
        A list whose first element is ``name.lower()`` and whose remaining
        elements are the lowercase variants, in the order of the mapping.
    """
    alternatives = {
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'ae': 'ä', 'oe': 'ö', 'ue': 'ü',
    }
    # Lowercase *before* matching: the mapping keys are lowercase, so testing
    # the original-case name silently skipped capitalised umlauts/digraphs
    # ("Özdemir" never produced "oezdemir", "Aeffner" never produced "äffner").
    lowered = name.lower()
    alt_names = [lowered]
    alt_names.extend(
        lowered.replace(char, alt)
        for char, alt in alternatives.items()
        if char in lowered
    )
    return alt_names

# Build an expanded politicians table: every original row is emitted once per
# spelling returned by generate_alternatives(), with "Nachname" replaced by
# that (lowercase) spelling. All other columns are copied unchanged.
new_rows = []
for index, row in politicians.iterrows():
    name = row["Nachname"]
    alternative_names = generate_alternatives(name)
    
    for alt_name in alternative_names:
        # Copy so that each spelling gets its own independent row object.
        new_row = row.copy()
        new_row["Nachname"] = alt_name
        new_rows.append(new_row)

# Create a new DataFrame with lowercase names and alternative names.
# NOTE(review): rows built this way presumably keep the index label of the
# politicians row they were copied from, so labels repeat -- confirm.
new_politicians = pd.DataFrame(new_rows)

# Index label 734 is, per the original note, a duplicate "özdemir" entry:
# show it, then drop every row carrying that label.
# NOTE(review): the label is hard-coded and depends on the exact input file.
print(new_politicians.loc[734])
new_politicians.drop(index=734, inplace=True)

# Surnames that are also common German words; they are removed from the
# lookup names below because they have multiple everyday meanings.
delete = ['alt','baum',"junge","weiss","meister","menge", "braun", "michel",'busen','ernst', 'frei', 'grau', 'grund', "höchst", "hoechst", "jung", "kasper", 'klein', 'kopf', "kraft", 'lieb', 'reichel', 'tausend', "stein", 'vogel', 'zorn', "schön", "schoen"]

print(len(delete))

# Size before filtering (inspection only).
len(updated_names)

# Keep only the names that are not in the ambiguous-word list.
updated_names = [item for item in updated_names if item not in delete]

# Size after filtering (inspection only).
len(updated_names)

# Debug helper for inspecting unresolved names collected by the matching loop:
#print(list(set(error_list)))

# Maps token spellings found in updated_names (hyphens stripped by
# normalize_text, or umlaut written as a digraph) to the "Nachname" spelling
# to look up in new_politicians when the token itself has no row.
error = {
    'krampkarrenbauer': "kramp-karrenbauer",
    'harderkühnel': 'harder-kühnel',
    'widmannmauz': 'widmann-mauz',
    'starkwatzinger': "stark-watzinger",
    'oezdemir': "özdemir",
    'strackzimmermann': 'strack-zimmermann',
    'göringeckardt':  'göring-eckardt',
    'kappertgonther': "kappert-gonther"
}

In [24]:
'''
find politicians and create personal attack variable
'''

# Per-comment results, collected in row order and assigned as columns below.
attack_flags = []
attack_names = []
attack_genders = []
attack_parties = []
# Surnames that are in updated_names but have no row in new_politicians
# (recorded once per distinct token).
error_list = []

# Set membership is O(1); testing against the list was a linear scan per token.
known_names = set(updated_names)
# token -> (full name, gender, party), or None when no politician row exists.
# Caching avoids repeating the same DataFrame scan for every occurrence.
politician_cache = {}

for _, row in tqdm(df_com.iterrows(), total=len(df_com)):
    personal_attack_flag = 0
    attack_name = []
    attack_gender = []
    attack_party = []

    for token in row["clean_tokens"]:
        if token not in known_names:
            continue

        if token not in politician_cache:
            surname = token
            matched_politician = new_politicians.loc[new_politicians['Nachname'] == surname]
            if matched_politician.empty:
                # Retry with the corrected spelling (e.g. re-inserted hyphen).
                # .get() instead of [] so an unmapped token no longer raises
                # KeyError.
                surname = error.get(token, token)
                matched_politician = new_politicians.loc[new_politicians['Nachname'] == surname]

            if matched_politician.empty:
                # Previously this fell through to .iloc[0] and crashed with
                # IndexError; now the name is recorded and the token skipped.
                error_list.append(surname)
                politician_cache[token] = None
            else:
                # As before, the first matching row wins when several
                # politicians share a surname.
                politician_cache[token] = (
                    str(matched_politician['Vornamen'].iloc[0] + " " + matched_politician['Nachname'].iloc[0]),
                    str(matched_politician['Geschlecht'].iloc[0]),
                    str(matched_politician['VerknGruppenname'].iloc[0]),
                )

        details = politician_cache[token]
        if details is None:
            continue

        full_name, gender, party = details
        personal_attack_flag = 1
        attack_name.append(full_name)
        attack_gender.append(gender)
        attack_party.append(party)

    attack_flags.append(personal_attack_flag)
    attack_names.append(attack_name)
    attack_genders.append(attack_gender)
    attack_parties.append(attack_party)

# Assign the collected values to the DataFrame
df_com['personal_attack'] = attack_flags
df_com['personal_attack_name'] = attack_names
df_com['personal_attack_gender'] = attack_genders
df_com['personal_attack_party'] = attack_parties

100%|██████████| 134883/134883 [00:34<00:00, 3856.81it/s]


In [25]:
# Write the annotated comments back to the CSV they were loaded from,
# without the row index.
output_path = 'data/comments_final.csv'
df_com.to_csv(output_path, index=False)