In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import display

## NZZ CS DATA PREP

#### import and merge data

In [2]:
# import nzz cs data

file_paths = [
    "Data/NZZ/nzz_scraped_data_CS_correction.csv"
]

# List to store DataFrames
dfs = []

# Read each CSV file into a DataFrame and append to the list
for file_path in file_paths:
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
nzz_data_CS_merged = pd.concat(dfs, ignore_index=True)

In [3]:
len(nzz_data_CS_merged)

7

In [4]:
nzz_data_CS_merged.head()

Unnamed: 0,article_title,headline,article_content,text_date
0,Familienunternehmen sind erfolgreicher,Familienfirmen schneiden an der Börse besser a...,Familienunternehmen und die Börse – das sind n...,12.01.2019
1,Die Konkurrenz sitzt im Ausland,Der Schweizer Detailhandel ist in zunehmendem ...,Der Schweizer Einzelhandel ist im vergangenen ...,08.01.2019
2,Weitere Hürden für Amazon in der Schweiz,Amazon wird laut einer Studie der Credit Suiss...,(awp/sda) · Seit knapp einem Jahr ist Amazon a...,08.01.2019
3,Wechselkursentwicklung: Optimistische Prognos...,Mit einer zur Stärke neigenden Heimwährung dro...,In einem exportorientierten Land wie der Schwe...,08.01.2019
4,Die Schweizer Banken müssen besser werden,Die Schweizer Banken sind in Geschäftsbereiche...,Die Realität lässt keinen Raum für Zweifel: Di...,03.01.2019


#### remove duplicates

In [5]:
# Check for duplicates in the merged DataFrame, keeping all occurrences
all_occurrences_indices = nzz_data_CS_merged[nzz_data_CS_merged.duplicated(keep=False)].index

# Display the indices of all occurrences of duplicate rows
print("Indices of All Occurrences of Duplicate Rows:")
print(all_occurrences_indices)

Indices of All Occurrences of Duplicate Rows:
Index([], dtype='int64')


In [6]:
duplicate_indices_specific_columns = nzz_data_CS_merged[nzz_data_CS_merged.duplicated(subset=['article_content'], keep=False)].index
print("Indices of Duplicate Rows Based on Specific Columns:")
print(duplicate_indices_specific_columns)

Indices of Duplicate Rows Based on Specific Columns:
Index([], dtype='int64')


In [7]:
# Remove duplicate rows based on specific columns
nzz_data_CS_merged_no_dups = nzz_data_CS_merged.drop_duplicates(subset=['article_content'], keep='first')

In [9]:
print(len(nzz_data_CS_merged_no_dups))
print(len(nzz_data_CS_merged))

7
7


In [39]:
#nzz_data_CS_merged.loc[126, "article_content"]

#### Filter by date

In [10]:
# Create a copy of the DataFrames
nzz_data_CS_merged_no_dups_copy = nzz_data_CS_merged_no_dups.copy()

# Convert 'text_date' column to datetime type, considering timezone information
nzz_data_CS_merged_no_dups_copy['text_date'] = pd.to_datetime(nzz_data_CS_merged_no_dups_copy['text_date'], utc=True, format='%d.%m.%Y', errors='coerce')

# Define the start and end dates of the desired range
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')

# Filter the DataFrame to include only articles within the specified date range
filtered_df_nzz_cs = nzz_data_CS_merged_no_dups_copy[(nzz_data_CS_merged_no_dups_copy['text_date'] >= start_date) & (nzz_data_CS_merged_no_dups_copy['text_date'] <= end_date)]

In [11]:
len(filtered_df_nzz_cs)

7

#### Filter by relevancy

##### look for variants of Credit Suisse

In [12]:
filtered_df_nzz_cs_copy = filtered_df_nzz_cs.copy()

# Create a boolean Series where each element is True if the corresponding article contains 'credit suisse' or 'CREDIT SUISSE'

contains_lower = filtered_df_nzz_cs_copy['article_content'].str.contains('Credit-Suisse', case=True)
contains_upper = filtered_df_nzz_cs_copy['article_content'].str.contains('credit suisse', case=True)

# Combine the two Series using the bitwise OR operator

contains_either = contains_lower | contains_upper

# Check if any articles contain 'credit suisse' or 'CREDIT SUISSE'

any_contains_either = contains_either.any()

print(f"Any articles contain 'credit suisse' or 'CREDIT SUISSE'? {any_contains_either}")

num_contains_either = contains_either.sum()

print(f"Number of articles that contain 'credit suisse' or 'CREDIT SUISSE': {num_contains_either}")

Any articles contain 'credit suisse' or 'CREDIT SUISSE'? False
Number of articles that contain 'credit suisse' or 'CREDIT SUISSE': 0


##### new code

In [13]:
# Create a copy of the filtered DataFrame
filtered_df_nzz_cs_copy = filtered_df_nzz_cs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
def count_credit_suisse(text):
    return len(re.findall(r'credit[-\s]*suisse', text, re.IGNORECASE))
    
filtered_df_nzz_cs_copy['credit_suisse_count'] = filtered_df_nzz_cs_copy['article_content'].apply(count_credit_suisse)

def count_cs(text):
    return len(re.findall(r'(?<![a-zA-Z])CS(?![a-zA-Z])', text))

filtered_df_nzz_cs_copy['cs_count'] = filtered_df_nzz_cs_copy['article_content'].apply(count_cs)

# Filter articles based on multiple conditions
nzz_credit_suisse_articles = filtered_df_nzz_cs_copy[
    (filtered_df_nzz_cs_copy['credit_suisse_count'] >= 2) |
    (filtered_df_nzz_cs_copy['cs_count'] >= 2) |
    ((filtered_df_nzz_cs_copy['credit_suisse_count'] >= 1) & (filtered_df_nzz_cs_copy['cs_count'] >= 1))
]


##### old code

In [103]:
# Create a copy of the filtered DataFrame
#filtered_df_nzz_cs_copy = filtered_df_nzz_cs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
#filtered_df_nzz_cs_copy['credit_suisse_count'] = filtered_df_nzz_cs_copy['article_content'].str.count('Credit Suisse')

#def count_cs(text):
#    return len(re.findall(r'CS(?![a-zA-Z])', text))

#filtered_df_nzz_cs_copy['cs_count'] = filtered_df_nzz_cs_copy['article_content'].apply(count_cs)

# Filter articles based on multiple conditions
#nzz_credit_suisse_articles = filtered_df_nzz_cs_copy[
#    (filtered_df_nzz_cs_copy['credit_suisse_count'] >= 2) |
#    (filtered_df_nzz_cs_copy['cs_count'] >= 2) |
#    ((filtered_df_nzz_cs_copy['credit_suisse_count'] >= 1) & (filtered_df_nzz_cs_copy['cs_count'] >= 1))
#]


In [14]:
len(nzz_credit_suisse_articles)

6

In [104]:
#len(nzz_credit_suisse_articles) #output for old method -> so 3 more articles removed.

1025

In [16]:
# Calculate the word count for each article in the credit_suisse_articles DataFrame
nzz_credit_suisse_articles.loc[:, 'word_count'] = nzz_credit_suisse_articles['article_content'].str.split().apply(len)

In [18]:
# List of article titles to exclude
#exclude_titles = [
    " Briefing am Montagmorgen",
   " Briefing am Montagabend",
    " Briefing am Dienstagmorgen",
    " Briefing am Dienstagabend",
    " Briefing am Mittwochmorgen",
    " Briefing am Mittwochabend",
    " Briefing am Donnerstagmorgen",
    " Briefing am Donnerstagabend",
    " Briefing am Freitagmorgen",
    " Briefing am Freitagabend",
    " Briefing am Wochenende",
    " Briefing"
]

# Filter the DataFrame to exclude articles with specified titles
#nzz_credit_suisse_articles_no_briefing = nzz_credit_suisse_articles[~nzz_credit_suisse_articles['article_title'].isin(exclude_titles)]

IndentationError: unindent does not match any outer indentation level (<string>, line 4)

In [19]:
# Sort the articles by date in ascending order
nzz_credit_suisse_articles_sorted = nzz_credit_suisse_articles.sort_values(by='text_date')

In [20]:
# remove over 5k words
# Filter out articles with a word count higher than 5000
nzz_credit_suisse_articles_sorted = nzz_credit_suisse_articles_sorted[nzz_credit_suisse_articles_sorted['word_count'] <= 5000]

In [21]:
print(len(nzz_credit_suisse_articles_sorted))
len(nzz_credit_suisse_articles)

6


6

In [110]:
print(len(nzz_credit_suisse_articles_sorted))
len(nzz_credit_suisse_articles)

899


1025

In [24]:
# Export the sorted DataFrame to a CSV file
nzz_credit_suisse_articles_sorted.to_csv('Prepped_Data/CS_NZZ_prepped_correction.csv', index=False)

In [23]:
nzz_credit_suisse_articles_sorted.head()

Unnamed: 0,article_title,headline,article_content,text_date,credit_suisse_count,cs_count,word_count
4,Die Schweizer Banken müssen besser werden,Die Schweizer Banken sind in Geschäftsbereiche...,Die Realität lässt keinen Raum für Zweifel: Di...,2019-01-03 00:00:00+00:00,1,5,1160
5,Die USA klagen in der Moçambique-Affäre drei ...,Bisher sind fünf Individuen im Zusammenhang mi...,Am Donnerstag ist in den USA eine Strafklage i...,2019-01-04 00:00:00+00:00,6,6,661
1,Die Konkurrenz sitzt im Ausland,Der Schweizer Detailhandel ist in zunehmendem ...,Der Schweizer Einzelhandel ist im vergangenen ...,2019-01-08 00:00:00+00:00,1,4,587
2,Weitere Hürden für Amazon in der Schweiz,Amazon wird laut einer Studie der Credit Suiss...,(awp/sda) · Seit knapp einem Jahr ist Amazon a...,2019-01-08 00:00:00+00:00,3,1,433
3,Wechselkursentwicklung: Optimistische Prognos...,Mit einer zur Stärke neigenden Heimwährung dro...,In einem exportorientierten Land wie der Schwe...,2019-01-08 00:00:00+00:00,1,2,261


### NZZ UBS DATA

In [25]:
# import nzz cs data

file_paths = [
    "Data/NZZ/nzz_scraped_data_UBS_correction.csv"
]

# List to store DataFrames
dfs = []

# Read each CSV file into a DataFrame and append to the list
for file_path in file_paths:
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
nzz_data_UBS_merged = pd.concat(dfs, ignore_index=True)

In [26]:
len(nzz_data_UBS_merged)

38

In [27]:
nzz_data_UBS_merged.head()

Unnamed: 0,article_title,headline,article_content,text_date
0,Nachruf auf einen Vielgeschmähten: Einst bewe...,Der jahrelange Todeskampf des ehemals wichtigs...,Es ist ein Abgang ohne Fanfaren. Am Freitag ha...,30.06.2023
1,Simulation mit begrenzter Realität – Wall-Str...,Die grössten Finanzinstitute der Wall Street h...,Bankpleiten hin oder her: Glaubt man der ameri...,29.06.2023
2,Abschottung des EU-Binnenmarkts: Der Schweize...,Die Kommission wollte die grenzüberschreitende...,In der Schweizer Bankenwelt geht es gegenwärti...,29.06.2023
3,Briefing am Mittwochabend,Alexandre Fasel ist der neue EU-Chefunterhändl...,Das Wichtigste am Abend: Die Wahl des neuen St...,28.06.2023
4,Trotz Energiewende: Hersteller von Windrädern...,Die Zeit der Zulassungsrekorde bei den Windkra...,Je mehr in Europa wegen des Klimawandels um Wi...,28.06.2023


In [28]:
# Check for duplicates in the merged DataFrame, keeping all occurrences
all_occurrences_indices = nzz_data_UBS_merged[nzz_data_UBS_merged.duplicated(keep=False)].index

# Display the indices of all occurrences of duplicate rows
print("Indices of All Occurrences of Duplicate Rows:")
print(all_occurrences_indices)

Indices of All Occurrences of Duplicate Rows:
Index([], dtype='int64')


In [43]:
duplicate_indices_specific_columns = nzz_data_UBS_merged[nzz_data_UBS_merged.duplicated(subset=['headline'], keep=False)].index
print("Indices of Duplicate Rows Based on Specific Columns:")
print(duplicate_indices_specific_columns)

Indices of Duplicate Rows Based on Specific Columns:
Index([], dtype='int64')


In [30]:
# Remove duplicate rows based on specific columns
nzz_data_UBS_merged_no_dups = nzz_data_UBS_merged.drop_duplicates(subset=['article_content'], keep='first')

In [31]:
print(len(nzz_data_UBS_merged_no_dups))
print(len(nzz_data_UBS_merged))

38
38


In [81]:
#nzz_data_UBS_merged.loc[2360, "article_content"]

In [32]:
# Create a copy of the DataFrame
nzz_data_UBS_merged_no_dups_copy = nzz_data_UBS_merged_no_dups.copy()

# Convert 'text_date' column to datetime type, considering timezone information and specifying the date format
nzz_data_UBS_merged_no_dups_copy['text_date'] = pd.to_datetime(nzz_data_UBS_merged_no_dups_copy['text_date'], utc=True, format='%d.%m.%Y', errors='coerce')

# Define the start and end dates of the desired range
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')

# Filter the DataFrame to include only articles within the specified date range
filtered_df_nzz_ubs = nzz_data_UBS_merged_no_dups_copy[(nzz_data_UBS_merged_no_dups_copy['text_date'] >= start_date) & (nzz_data_UBS_merged_no_dups_copy['text_date'] <= end_date)]


In [33]:
len(filtered_df_nzz_ubs)

38

#### filter by relevancy

##### old code

In [49]:
# Create a copy of the filtered DataFrame
#filtered_df_nzz_ubs_copy = filtered_df_nzz_ubs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
#filtered_df_nzz_ubs_copy['UBS_count'] = filtered_df_nzz_ubs_copy['article_content'].str.count('UBS')

# Filter articles containing 'Credit Suisse' at least twice in their content
#nzz_UBS_articles = filtered_df_nzz_ubs_copy[filtered_df_nzz_ubs_copy['UBS_count'] >= 2]

##### new code

In [34]:
# Create a copy of the filtered DataFrame
filtered_df_nzz_ubs_copy = filtered_df_nzz_ubs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
def count_ubs(text):
    return len(re.findall(r'(?<![a-zA-Z])UBS(?![a-zA-Z])', text))
filtered_df_nzz_ubs_copy['UBS_count'] = filtered_df_nzz_ubs_copy['article_content'].apply(count_ubs)

# Filter articles containing 'Credit Suisse' at least twice in their content
nzz_UBS_articles = filtered_df_nzz_ubs_copy[filtered_df_nzz_ubs_copy['UBS_count'] >= 2]

In [91]:
# Export the sorted DataFrame to a CSV file
#filtered_df_nzz_ubs_copy.to_csv('UBS_NZZ_unfiltered.csv', index=False)

923

In [35]:
len(nzz_UBS_articles)

18

In [37]:
# Calculate the word count for each article in the credit_suisse_articles DataFrame
nzz_UBS_articles.loc[:, 'word_count'] = nzz_UBS_articles['article_content'].str.split().apply(len)

In [66]:
# List of article titles to exclude
#exclude_titles = [
#    " Briefing am Montagmorgen",
    " Briefing am Montagabend",
    " Briefing am Dienstagmorgen",
    " Briefing am Dienstagabend",
    " Briefing am Mittwochmorgen",
    " Briefing am Mittwochabend",
    " Briefing am Donnerstagmorgen",
    " Briefing am Donnerstagabend",
    " Briefing am Freitagmorgen",
    " Briefing am Freitagabend",
    " Briefing am Wochenende",
    " Briefing"
]

# Filter the DataFrame to exclude articles with specified titles
#nzz_UBS_articles_no_briefing = nzz_UBS_articles[~nzz_UBS_articles['article_title'].isin(exclude_titles)]

In [38]:
# Sort the articles by date in ascending order
nzz_UBS_articles_sorted = nzz_UBS_articles.sort_values(by='text_date')

In [39]:
# remove over 5k words
# Filter out articles with a word count higher than 5000
nzz_UBS_articles_sorted = nzz_UBS_articles_sorted[nzz_UBS_articles_sorted['word_count'] <= 5000]

In [40]:
# Export the sorted DataFrame to a CSV file
nzz_UBS_articles_sorted.to_csv('Prepped_Data/UBS_NZZ_prepped_correction.csv', index=False)

In [41]:
len(nzz_UBS_articles_sorted)

18

In [42]:
nzz_UBS_articles_sorted

Unnamed: 0,article_title,headline,article_content,text_date,UBS_count,word_count
35,Plötzlich weg: Zehn Jahre lang hat Bracken Da...,Unter seiner Führung ist Logitech mehr als dop...,"«Ich mache das hier, solange es Sinn ergibt. U...",2023-06-14 00:00:00+00:00,2,652
32,"Die Finma hat den Auftrag, Bankenkrisen zu ve...",Die Fixierung der Finanzmarktaufseher auf regu...,Immer wieder hat die Eidgenössische Finanzmark...,2023-06-15 00:00:00+00:00,6,565
34,"Beisshemmung, miserable Kommunikation, falsch...",Die Aufsichtsbehörde konnte den immer schnelle...,Dreissig Jahre an der Wall Street brühen ab. D...,2023-06-15 00:00:00+00:00,3,1690
27,Nicht nur die PUK – auch die UBS sollte Ursac...,"Es wäre ein starkes Signal, wenn er den UBS-Ve...","Wenn sich Bankenkrisen ereignen, nehmen die Fa...",2023-06-17 00:00:00+00:00,8,659
26,"Gegenwind in den USA, Rückenwind in Europa","Der Chef von Bitcoin Suisse erklärt, was die K...",Es ist eine Kriegserklärung an die Kryptobranc...,2023-06-17 00:00:00+00:00,2,1076
25,Die Rolle von Ulrich Körner wirft Fragen auf:...,Am Montag hat die UBS die CS geschluckt. Frühm...,Die UBS gibt bei der CS den Tarif durch. Am Mo...,2023-06-17 00:00:00+00:00,27,1415
24,Die Rolle von Ulrich Körner wirft Fragen auf:...,Am Montag hat die UBS die CS geschluckt. Frühm...,Am Montag wurde die Übernahme der CS durch die...,2023-06-17 00:00:00+00:00,27,1411
21,Wie geht es weiter mit der Inflation in der S...,Die Schweizerische Nationalbank (SNB) gibt die...,Mit Sicherheit weiss man das zwar nie. Doch un...,2023-06-21 00:00:00+00:00,3,781
20,Privatbanken-Poker: Wer wird die Nummer zwei ...,Die Übernahme der Credit Suisse hat Diskussion...,"Die Tränen, die am Schweizer Finanzplatz wegen...",2023-06-21 00:00:00+00:00,3,1035
19,Nach dem CS-Debakel blüht der Markt für AT1-A...,Die umstrittene Abschreibung nachrangiger Bond...,Die Credit Suisse ist seit vergangener Woche T...,2023-06-22 00:00:00+00:00,10,899


848