In [1]:
import pandas as pd
import numpy as np
import re
from IPython.display import display

In [19]:
# Load the scraped SRF Credit Suisse articles from both scrape exports
file_paths = [
    "Data/SRF/scraped_data_CS_april.csv",
    "Data/SRF/scraped_data_1000_credit_suisse.csv",
]

# One DataFrame per CSV file, in the order listed above
dfs = [pd.read_csv(csv_path) for csv_path in file_paths]

# Stack the frames into one and renumber the rows from 0
df_srf_cs = pd.concat(dfs, ignore_index=True)

In [34]:
# import SRF CS data
#file_path = "Data/SRF/scraped_data_CS_april.csv"
#df_srf_cs = pd.read_csv(file_path)

In [20]:
# Preview the first five rows of the combined frame
df_srf_cs.head(5)

Unnamed: 0,article_title,merged_string,text_date
0,Ex-CS-Chefs dürfen Boni behalten,Die ehemaligen Führungskräfte der untergegange...,2024-04-14T10:35:00+02:00
1,Finma will mehr Kompetenzen – und höhere Anfor...,Die Finanzmarktaufsicht Finma fordert stärkere...,2024-03-20T09:32:00+01:00
2,CS-Krise hatte Einfluss auf die Wahlen und das...,Finanzministerin Karin Keller-Sutter wurde von...,2024-03-19T18:53:00+01:00
3,Credit Suisse und Kantonalbanken: Wer profitie...,Profitieren die Kantonalbanken vom CS-Aus? Gem...,2024-03-19T05:44:00+01:00
4,Das Quiz zur Übernahme der CS durch die UBS,Der 19. März 2023 wurde zum Schicksalstag für ...,2024-03-18T14:53:00+01:00


In [21]:
# Number of rows in the combined frame
df_srf_cs.shape[0]

1959

In [22]:
#df_srf_cs.to_csv('SRF URLS/df_srf_cs_merged.csv', index=False)

799

#### Remove duplicates

In [10]:
# Flag rows that are identical in every column to another row;
# keep=False marks every member of a duplicate group, not just the repeats
is_exact_duplicate = df_srf_cs.duplicated(keep=False)
all_occurrences_indices = df_srf_cs.loc[is_exact_duplicate].index

# Show the index labels of all flagged rows
print("Indices of All Occurrences of Duplicate Rows:", all_occurrences_indices, sep="\n")

Indices of All Occurrences of Duplicate Rows:
Index([ 366,  393,  444,  914, 1152, 1153, 1154, 1155, 1241, 1245, 1269, 1270,
       1316, 1472, 1473, 1563, 1672, 1709, 1837, 1904, 1906, 1907],
      dtype='int64')


In [23]:
# Flag rows whose 'article_title' occurs more than once (all occurrences are marked)
shares_title = df_srf_cs.duplicated(subset=['article_title'], keep=False)
duplicate_indices_specific_columns = df_srf_cs.loc[shares_title].index
print("Indices of Duplicate Rows Based on Specific Columns:", duplicate_indices_specific_columns, sep="\n")

Indices of Duplicate Rows Based on Specific Columns:
Index([  37,   40,   44,   45,   46,   50,   52,   53,   54,   55,
       ...
       1912, 1915, 1919, 1920, 1922, 1929, 1938, 1943, 1951, 1954],
      dtype='int64', length=889)


In [24]:
# Keep only the first row for each 'article_title'; later rows with the same title are dropped
has_earlier_same_title = df_srf_cs.duplicated(subset=['article_title'], keep='first')
df_srf_cs_no_dups = df_srf_cs[~has_earlier_same_title]

In [25]:
# Continue under the original name; after this line both names refer to the same de-duplicated frame
df_srf_cs = df_srf_cs_no_dups

#### Filter by date range

In [26]:
# Parse 'text_date' into timezone-aware datetimes; utc=True converts every
# timestamp (the raw strings carry +01:00/+02:00 offsets) to UTC.
# assign() builds a new frame instead of writing into the de-duplicated slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df_srf_cs = df_srf_cs.assign(text_date=pd.to_datetime(df_srf_cs['text_date'], utc=True))

# Define the start and end dates of the desired range (both days inclusive)
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')

# end_date is 00:00 at the start of 30 June, so comparing with `<= end_date`
# would drop everything published later that day. Use an exclusive upper
# bound one day later so the whole final day is included.
end_exclusive = end_date + pd.Timedelta(days=1)

# Filter the DataFrame to include only articles within the specified date range
in_date_range = (df_srf_cs['text_date'] >= start_date) & (df_srf_cs['text_date'] < end_exclusive)
filtered_df_srf_cs = df_srf_cs[in_date_range]

In [27]:
# Number of articles left after the date filter
filtered_df_srf_cs.shape[0]

544

#### Filter by relevance

In [39]:
filtered_df_srf_cs_copy = filtered_df_srf_cs.copy()

# Sanity check for spelling variants: does any article write the bank's name
# entirely in lower case ('credit suisse') or entirely in upper case ('CREDIT SUISSE')?
# The match is case-sensitive and literal (regex=False); na=False treats
# missing article text as "no match" so the boolean Series stay NaN-free.
article_text = filtered_df_srf_cs_copy['merged_string']
contains_lower = article_text.str.contains('credit suisse', case=True, regex=False, na=False)
contains_upper = article_text.str.contains('CREDIT SUISSE', case=True, regex=False, na=False)

# Combine the two Series using the bitwise OR operator
contains_either = contains_lower | contains_upper

# Check if any articles contain 'credit suisse' or 'CREDIT SUISSE'
any_contains_either = contains_either.any()

print(f"Any articles contain 'credit suisse' or 'CREDIT SUISSE'? {any_contains_either}")

# Summing a boolean Series counts the True entries
num_contains_either = contains_either.sum()

print(f"Number of articles that contain 'credit suisse' or 'CREDIT SUISSE': {num_contains_either}")

Any articles contain 'credit suisse' or 'CREDIT SUISSE'? False
Number of articles that contain 'credit suisse' or 'CREDIT SUISSE': 0


In [40]:
# Work on an independent (deep) copy so the count columns added below do not touch filtered_df_srf_cs
filtered_df_srf_cs_copy = filtered_df_srf_cs.copy(deep=True)

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
def count_credit_suisse(text):
    """Count mentions of 'Credit Suisse' in ``text``, ignoring case.

    The two words may be joined directly or separated by any run of
    hyphens and/or whitespace ('Credit Suisse', 'CREDIT-SUISSE',
    'creditsuisse').

    Parameters
    ----------
    text : str
        Article text. Any non-string value (e.g. NaN or None from an
        empty CSV cell) is treated as containing no mentions.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # Missing article text arrives as float NaN / None; re.findall would raise TypeError on it
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'credit[-\s]*suisse', text, re.IGNORECASE))
    
# Per-article number of 'Credit Suisse' mentions
filtered_df_srf_cs_copy['credit_suisse_count'] = filtered_df_srf_cs_copy['merged_string'].map(count_credit_suisse)

def count_cs(text):
    """Count standalone upper-case 'CS' tokens in ``text``.

    A match must not be directly preceded or followed by an ASCII letter,
    so 'CS', 'CS-Chef' and 'UBS/CS' count while 'CSS' or 'UCS' do not.
    The match is case-sensitive.

    Parameters
    ----------
    text : str
        Article text. Any non-string value (e.g. NaN or None from an
        empty CSV cell) is treated as containing no mentions.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # Missing article text arrives as float NaN / None; re.findall would raise TypeError on it
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'(?<![a-zA-Z])CS(?![a-zA-Z])', text))

# Per-article number of standalone 'CS' mentions
filtered_df_srf_cs_copy['cs_count'] = filtered_df_srf_cs_copy['merged_string'].map(count_cs)

# Keep an article when it has at least two 'Credit Suisse' mentions, or at
# least two 'CS' mentions, or at least one of each
name_hits = filtered_df_srf_cs_copy['credit_suisse_count']
abbrev_hits = filtered_df_srf_cs_copy['cs_count']
is_relevant = name_hits.ge(2) | abbrev_hits.ge(2) | (name_hits.ge(1) & abbrev_hits.ge(1))
credit_suisse_articles = filtered_df_srf_cs_copy[is_relevant]


In [41]:
# Preview the first five relevant articles with their mention counts
credit_suisse_articles.head(5)

Unnamed: 0,article_title,merged_string,text_date,credit_suisse_count,cs_count
37,167 Jahre Credit Suisse: Das Ende einer Tradit...,Am Anfang stand der Industrielle und Politiker...,2023-03-19 19:22:00+00:00,2,5
53,UBS will offenbar mehr als die Hälfte der CS-B...,Bei der von der UBS übernommenen Credit Suisse...,2023-06-27 16:14:00+00:00,1,5
54,Auch andere Banken wollen wie die CS behandelt...,"Der Credit Suisse Geld leihen, ohne dafür Sich...",2023-06-24 15:15:00+00:00,2,1
55,Nationalbank zieht die Lehren aus der UBS/CS-F...,Der schnelle Untergang der Credit Suisse im Mä...,2023-06-22 07:59:00+00:00,3,1
56,UBS- und CS-Angestellte haben nun den gleichen...,Die UBS hat nach der Übernahme der Credit Suis...,2023-06-19 13:08:00+00:00,2,2


In [42]:
# Number of articles that passed the relevance filter
credit_suisse_articles.shape[0]

396

In [44]:
# Calculate the word count (whitespace-separated tokens) for each article.
# credit_suisse_articles is a filtered slice of another frame, so writing a
# column into it via .loc triggers SettingWithCopyWarning; assign() returns a
# new frame with the extra column and makes this cell safe to re-run.
credit_suisse_articles = credit_suisse_articles.assign(
    word_count=credit_suisse_articles['merged_string'].str.split().str.len()
)


In [46]:
# Order the articles chronologically, oldest first
credit_suisse_articles_sorted = credit_suisse_articles.sort_values('text_date', ascending=True)

In [48]:
# Rename the 'merged_string' column to 'article_content'
credit_suisse_articles_sorted = credit_suisse_articles_sorted.rename(
    columns={'merged_string': 'article_content'}
)

In [52]:
# Write the prepared Credit Suisse articles to disk, leaving out the DataFrame index
credit_suisse_articles_sorted.to_csv(path_or_buf='Prepped_Data/CS_SRF_prepped.csv', index=False)

In [51]:
# Flag articles that share an identical publication timestamp (all occurrences are marked)
shares_timestamp = credit_suisse_articles_sorted.duplicated(subset=['text_date'], keep=False)
duplicate_indices_specific_columns = credit_suisse_articles_sorted.loc[shares_timestamp].index
print("Indices of Duplicate Rows Based on Specific Columns:", duplicate_indices_specific_columns, sep="\n")

Indices of Duplicate Rows Based on Specific Columns:
Index([], dtype='int64')


### UBS DATA PREP

In [2]:
# Load the scraped SRF UBS articles
file_path = "Data/SRF/scraped_data_UBS_april.csv"
df_srf_ubs = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows of the UBS frame
df_srf_ubs.head(5)

Unnamed: 0,article_title,merged_string,text_date
0,UBS und Käse: Guerreiro do Divino Amor spielt ...,Guerreiro do Divino Amor – der Name ist portug...,2024-04-19T08:24:00+02:00
1,Die UBS muss sterblich werden,Die Schweiz hat nur noch eine Grossbank. Wenn ...,2024-04-10T18:13:00+02:00
2,Welche Regeln braucht es für die UBS?,Die Politik ist herausgefordert. Nach dem Deba...,2024-04-10T14:31:00+02:00
3,Wie gerecht sind die 14.4 Millionen Franken Lo...,14.4 Millionen Franken Entschädigung hat UBS-C...,2024-03-28T18:44:00+01:00
4,UBS-Chef Ermotti verdient mehr als Vorgänger H...,Seit seiner Rückkehr zur UBS hat CEO Sergio Er...,2024-03-28T07:55:00+01:00


In [4]:
# Number of rows in the UBS frame
df_srf_ubs.shape[0]

986

In [5]:
# Parse 'text_date' into timezone-aware datetimes; utc=True converts every
# timestamp (the raw strings carry +01:00/+02:00 offsets) to UTC.
# assign() returns a new frame, so the cell is safe to re-run.
df_srf_ubs = df_srf_ubs.assign(text_date=pd.to_datetime(df_srf_ubs['text_date'], utc=True))

# Define the start and end dates of the desired range (both days inclusive)
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')

# end_date is 00:00 at the start of 30 June, so comparing with `<= end_date`
# would drop everything published later that day. Use an exclusive upper
# bound one day later so the whole final day is included.
end_exclusive = end_date + pd.Timedelta(days=1)

# Filter the DataFrame to include only articles within the specified date range
in_date_range = (df_srf_ubs['text_date'] >= start_date) & (df_srf_ubs['text_date'] < end_exclusive)
filtered_df_srf_ubs = df_srf_ubs[in_date_range]

In [6]:
# Number of UBS articles left after the date filter
filtered_df_srf_ubs.shape[0]

310

In [7]:
# Work on an independent (deep) copy so the count column added below does not touch filtered_df_srf_ubs
filtered_df_srf_ubs_copy = filtered_df_srf_ubs.copy(deep=True)

# Count the occurrences of the standalone token 'UBS' in each article's content in the copied DataFrame
def count_ubs(text):
    """Count standalone upper-case 'UBS' tokens in ``text``.

    A match must not be directly preceded or followed by an ASCII letter,
    so 'UBS', 'UBS-Chef' and 'Ex-UBS' count while 'SUBS' or 'UBSX' do not.
    The match is case-sensitive.

    Parameters
    ----------
    text : str
        Article text. Any non-string value (e.g. NaN or None from an
        empty CSV cell) is treated as containing no mentions.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # Missing article text arrives as float NaN / None; re.findall would raise TypeError on it
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'(?<![a-zA-Z])UBS(?![a-zA-Z])', text))
# Per-article number of standalone 'UBS' mentions
filtered_df_srf_ubs_copy['UBS_count'] = filtered_df_srf_ubs_copy['merged_string'].map(count_ubs)

# Keep only articles that mention 'UBS' at least twice
has_enough_mentions = filtered_df_srf_ubs_copy['UBS_count'].ge(2)
UBS_articles = filtered_df_srf_ubs_copy[has_enough_mentions]

In [8]:
# Number of UBS articles that passed the relevance filter
UBS_articles.shape[0]

234

In [11]:
# Calculate the word count (whitespace-separated tokens) for each article in UBS_articles.
# UBS_articles is a filtered slice of another frame, so writing a column into
# it via .loc triggers SettingWithCopyWarning (the reason this cell used to be
# "executed twice"). assign() returns a new frame with the extra column, so a
# single run is enough and re-running is harmless.
UBS_articles = UBS_articles.assign(
    word_count=UBS_articles['merged_string'].str.split().str.len()
)

In [12]:
# Order the articles chronologically, oldest first
UBS_articles_sorted = UBS_articles.sort_values('text_date', ascending=True)

In [13]:
# Rename the 'merged_string' column to 'article_content'
UBS_articles_sorted = UBS_articles_sorted.rename(
    columns={'merged_string': 'article_content'}
)

#### Check for duplicates

In [14]:
# Show every row and column of UBS_articles_sorted for a manual inspection.
# option_context applies the "no truncation" display settings only inside the
# with-block and restores the previous settings afterwards, so later cells do
# not inherit unlimited output.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(UBS_articles_sorted)

Unnamed: 0,article_title,article_content,text_date,UBS_count,word_count
226,UBS-Datendieb zu 40 Monaten Haft verurteilt,Das Bundesstrafgericht hat einen Ex-UBS-Mitarb...,2019-01-21 13:23:00+00:00,4,368
225,UBS verdient im Schlussquartal weniger als erw...,Die UBS hat im vierten Quartal 2018 die Erwart...,2019-01-22 04:57:00+00:00,4,372
224,Ein fragwürdiger UBS-Kredit mit gravierenden F...,Als Peter O’Neill 2011 in Papua-Neuguinea an d...,2019-02-08 16:00:00+00:00,9,467
223,"«Die UBS muss sich fragen, ob sie ihre Pflicht...",In Papua-Neuguinea hat ein fragwürdiger Kredit...,2019-02-08 16:03:00+00:00,8,370
483,Der ungewollte Abschied,Eine Viertelstunde vor Mitternacht lässt die U...,2019-02-18 17:08:00+00:00,4,824
222,UBS muss 4.5 Milliarden Euro zahlen,Im Prozess am Pariser Strafgerichtshof gegen d...,2019-02-20 11:52:00+00:00,10,227
221,Ein Dämpfer für die UBS,Das Gericht ist dem Strafantrag der Staatsanwä...,2019-02-20 14:41:00+00:00,9,343
220,«Die UBS hat ein echtes Problem»,Sowohl die UBS-Banker wie auch deren Anwälte h...,2019-02-20 17:50:00+00:00,8,523
219,«Noch ist die Geschichte nicht bewältigt»,Nach der Verurteilung der UBS in Frankreich zu...,2019-02-21 04:49:00+00:00,3,260
218,Hängige Milliardenbusse für UBS weckt neue Zwe...,Im Licht des jüngsten Gerichtsentscheides gege...,2019-03-06 09:26:00+00:00,2,295


In [15]:
# Flag articles whose 'article_content' is identical to another article's (all occurrences are marked)
shares_content = UBS_articles_sorted.duplicated(subset=['article_content'], keep=False)
duplicate_indices_specific_columns = UBS_articles_sorted.loc[shares_content].index
print("Indices of Duplicate Rows Based on Specific Columns:", duplicate_indices_specific_columns, sep="\n")

Indices of Duplicate Rows Based on Specific Columns:
Index([699, 698], dtype='int64')


In [16]:
# Title of the article stored under index label 698
UBS_articles_sorted.at[698, "article_title"]

'Viele Konzerne trotzen der Coronakrise'

In [17]:
# Number of UBS articles before removing content duplicates
UBS_articles_sorted.shape[0]

234

In [76]:
# Keep only the first (earliest, since the frame is date-sorted) row for each distinct 'article_content'
has_earlier_same_content = UBS_articles_sorted.duplicated(subset=['article_content'], keep='first')
UBS_articles_no_dups = UBS_articles_sorted[~has_earlier_same_content]

In [77]:
# Number of UBS articles after removing content duplicates
UBS_articles_no_dups.shape[0]

233

In [78]:
# Write the prepared, de-duplicated UBS articles to disk, leaving out the DataFrame index
UBS_articles_no_dups.to_csv(path_or_buf='Prepped_Data/UBS_SRF_prepped.csv', index=False)