In [2]:
import pandas as pd
import numpy as np
import re
from IPython.display import display

In [3]:
# Load the scraped CNBC articles on Credit Suisse (CS) from CSV.
# The path is relative, so it is resolved against the current working directory.
file_path = "Data/CNBC/CNBC_scraped_data_CS_22_04.csv"
df_cnbc_cs = pd.read_csv(file_path)

In [4]:
df_cnbc_cs.head()

Unnamed: 0,article_title,article_headline,article_content,text_date,article_url
0,UBS CEO says integrating Credit Suisse will be...,Credit Suisse collapsed in March 2023 followin...,In this article The mammoth integration of fai...,2024-04-05,https://www.cnbc.com/2024/04/05/ubs-ceo-says-c...
1,Swiss regulator calls for more powers after Cr...,The 167-year-old Credit Suisse was rescued by ...,Switzerland's financial regulator on Tuesday c...,,https://www.cnbc.com/2023/12/19/swiss-regulato...
2,UBS resumes selling the bonds at the heart of ...,UBS confirmed to CNBC that it is offering Addi...,In this article UBS on Wednesday began selling...,2023-11-08,https://www.cnbc.com/2023/11/08/ubs-resumes-se...
3,Credit Suisse intervention avoided 'financial ...,The Swiss National Bank supplied a massive lif...,Swiss National Bank Chairman Thomas Jordan on ...,2023-11-02,https://www.cnbc.com/2023/11/01/snb-says-credi...
4,UBS ends Credit Suisse's government and centra...,UBS on Friday said that it has ended a 9 billi...,UBS on Friday said that it has ended a 9 billi...,2023-08-11,https://www.cnbc.com/2023/08/11/ubs-ends-credi...


In [5]:
df_cnbc_cs.columns

Index(['article_title', 'article_headline', 'article_content', 'text_date',
       'article_url'],
      dtype='object')

#### fill missing values in text_date column

In [6]:
#define function to extract the date from the corresponding URL
def extract_date_from_url(url):
    """Extract the date embedded in an article URL.

    Looks for the first ``YYYY/MM/DD`` path fragment in ``url`` and returns it
    reformatted as ``YYYY-MM-DD``.

    Parameters
    ----------
    url : str
        Article URL, e.g. ``https://www.cnbc.com/2023/12/19/some-slug.html``.
        Non-string values (``None``, or ``NaN`` from an empty CSV cell) are
        accepted and treated as "no date available".

    Returns
    -------
    str or None
        The date as ``YYYY-MM-DD``, or ``None`` when no date can be found.
    """
    # An empty CSV cell is read as float NaN; re.search would raise TypeError
    # on it, so report "no date" instead of crashing the whole fill step.
    if not isinstance(url, str):
        return None
    # The digit look-arounds stop the match from starting or ending inside a
    # longer run of digits (e.g. a numeric id such as ".../120230/01/15/"),
    # which would otherwise yield a bogus date.
    match = re.search(r'(?<!\d)(\d{4})/(\d{2})/(\d{2})(?!\d)', url)
    if match is None:
        return None
    # groups() is (year, month, day); join them in the desired format
    return '-'.join(match.groups())

extract_date_from_url("https://www.cnbc.com/2023/12/19/swiss-regulator-calls-for-more-powers-after-credit-suisse-collapse.html?&qsearchterm=")

'2023-12-19'

In [7]:
# Fill missing dates: an existing text_date is kept as is; a missing one is
# replaced by the date embedded in that row's article URL.
df_cnbc_cs['text_date'] = [
    extract_date_from_url(url) if pd.isna(date) else date
    for date, url in zip(df_cnbc_cs['text_date'], df_cnbc_cs['article_url'])
]

In [8]:
df_cnbc_cs.head()

Unnamed: 0,article_title,article_headline,article_content,text_date,article_url
0,UBS CEO says integrating Credit Suisse will be...,Credit Suisse collapsed in March 2023 followin...,In this article The mammoth integration of fai...,2024-04-05,https://www.cnbc.com/2024/04/05/ubs-ceo-says-c...
1,Swiss regulator calls for more powers after Cr...,The 167-year-old Credit Suisse was rescued by ...,Switzerland's financial regulator on Tuesday c...,2023-12-19,https://www.cnbc.com/2023/12/19/swiss-regulato...
2,UBS resumes selling the bonds at the heart of ...,UBS confirmed to CNBC that it is offering Addi...,In this article UBS on Wednesday began selling...,2023-11-08,https://www.cnbc.com/2023/11/08/ubs-resumes-se...
3,Credit Suisse intervention avoided 'financial ...,The Swiss National Bank supplied a massive lif...,Swiss National Bank Chairman Thomas Jordan on ...,2023-11-02,https://www.cnbc.com/2023/11/01/snb-says-credi...
4,UBS ends Credit Suisse's government and centra...,UBS on Friday said that it has ended a 9 billi...,UBS on Friday said that it has ended a 9 billi...,2023-08-11,https://www.cnbc.com/2023/08/11/ubs-ends-credi...


In [9]:
len(df_cnbc_cs)

221

In [10]:
df_cnbc_cs.loc[7]

article_title       UBS says it has completed the takeover of stri...
article_headline    UBS has completed the legal takeover of its fo...
article_content     In this article Swiss bank UBS on Monday said ...
text_date                                                  2023-06-12
article_url         https://www.cnbc.com/2023/06/12/ubs-says-it-ha...
Name: 7, dtype: object

#### convert date and filter by date range

In [11]:
# Parse 'text_date' into timezone-aware (UTC) datetimes
df_cnbc_cs['text_date'] = pd.to_datetime(df_cnbc_cs['text_date'], utc=True)

# Bounds of the desired date range (both ends are inclusive in the filter below)
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')
# NOTE(review): end_date is midnight UTC, so a row stamped later on 2023-06-30
# would be dropped -- confirm that text_date never carries a time of day.

# Keep only the articles whose date falls inside the range
filtered_df_cnbc_cs = df_cnbc_cs[df_cnbc_cs['text_date'].between(start_date, end_date)]

In [12]:
len(filtered_df_cnbc_cs)

143

#### filter by relevancy

##### old code

In [29]:
# Create a copy of the filtered DataFrame
#filtered_df_cnbc_cs_copy = filtered_df_cnbc_cs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
#filtered_df_cnbc_cs_copy['credit_suisse_count'] = filtered_df_cnbc_cs_copy['article_content'].str.count('Credit Suisse')

#def count_cs(text):
#    return len(re.findall(r'CS(?![a-zA-Z])', text))

#filtered_df_cnbc_cs_copy['cs_count'] = filtered_df_cnbc_cs_copy['article_content'].apply(count_cs)

# Filter articles based on multiple conditions
#credit_suisse_articles_cnbc = filtered_df_cnbc_cs_copy[
#    (filtered_df_cnbc_cs_copy['credit_suisse_count'] >= 2) |
#    (filtered_df_cnbc_cs_copy['cs_count'] >= 2) |
#    ((filtered_df_cnbc_cs_copy['credit_suisse_count'] >= 1) & (filtered_df_cnbc_cs_copy['cs_count'] >= 1))
#]


##### new code

In [29]:
# Create a copy of the filtered DataFrame so the count columns added below
# are written to an independent object rather than to the filtered selection
filtered_df_cnbc_cs_copy = filtered_df_cnbc_cs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
def count_credit_suisse(text):
    """Count case-insensitive mentions of "Credit Suisse" in ``text``.

    The two words may be separated by any amount of whitespace, including
    none, so "Credit  Suisse" and "CreditSuisse" both count.

    Parameters
    ----------
    text : str
        Article text. Non-string values (e.g. ``NaN`` for an article whose
        content is missing) count as zero mentions.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # pandas represents a missing cell as float NaN, which re.findall rejects
    # with TypeError; treat it as "no mentions" instead.
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'credit\s*suisse', text, re.IGNORECASE))
    
# Store, per article, the number of "Credit Suisse" mentions in its content
filtered_df_cnbc_cs_copy['credit_suisse_count'] = filtered_df_cnbc_cs_copy['article_content'].apply(count_credit_suisse)

def count_cs(text):
    """Count standalone, upper-case occurrences of the abbreviation "CS".

    An occurrence counts only when it is not directly preceded or followed by
    an ASCII letter, so "CS." and "(CS)" match while "CSS" and "UCS" do not.
    The match is case-sensitive.

    Parameters
    ----------
    text : str
        Article text. Non-string values (e.g. ``NaN`` for an article whose
        content is missing) count as zero occurrences.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # pandas represents a missing cell as float NaN, which re.findall rejects
    # with TypeError; treat it as "no occurrences" instead.
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'(?<![a-zA-Z])CS(?![a-zA-Z])', text))

# Store, per article, the number of standalone "CS" occurrences in its content
filtered_df_cnbc_cs_copy['cs_count'] = filtered_df_cnbc_cs_copy['article_content'].apply(count_cs)


# Keep an article when it has two or more "Credit Suisse" mentions, two or
# more "CS" mentions, or at least one of each
credit_suisse_articles_cnbc = filtered_df_cnbc_cs_copy.loc[
    filtered_df_cnbc_cs_copy['credit_suisse_count'].ge(2)
    | filtered_df_cnbc_cs_copy['cs_count'].ge(2)
    | (
        filtered_df_cnbc_cs_copy['credit_suisse_count'].ge(1)
        & filtered_df_cnbc_cs_copy['cs_count'].ge(1)
    )
]


In [30]:
credit_suisse_articles_cnbc.head()

Unnamed: 0,article_title,article_headline,article_content,text_date,article_url,credit_suisse_count,cs_count
7,UBS says it has completed the takeover of stri...,UBS has completed the legal takeover of its fo...,In this article Swiss bank UBS on Monday said ...,2023-06-12 00:00:00+00:00,https://www.cnbc.com/2023/06/12/ubs-says-it-ha...,19,0
8,Credit Suisse logged asset outflows of more th...,Swiss authorities brokered the controversial 3...,In this article Credit Suisse on Monday reveal...,2023-04-24 00:00:00+00:00,https://www.cnbc.com/2023/04/24/credit-suisse-...,12,0
9,UBS expects $17 billion hit from Credit Suisse...,The bank's emergency acquisition of its strick...,UBS estimates a financial hit of around $17 bi...,2023-05-17 00:00:00+00:00,https://www.cnbc.com/2023/05/17/ubs-expects-17...,11,0
10,Swiss central bank promises regulation review ...,The central bank played a key role in brokerin...,The Swiss National Bank on Friday pledged to r...,2023-04-28 00:00:00+00:00,https://www.cnbc.com/2023/04/28/swiss-national...,9,0
11,Switzerland faced a full-scale bank run if Cre...,Allowing the bankruptcy of troubled lender Cre...,In this article Allowing the bankruptcy of tro...,2023-04-05 00:00:00+00:00,https://www.cnbc.com/2023/04/05/switzerland-fa...,18,1


In [31]:
len(credit_suisse_articles_cnbc)

133

In [33]:
# Add a 'word_count' column: number of whitespace-separated tokens per article.
# credit_suisse_articles_cnbc comes from boolean filtering, so writing a new
# column into it with .loc can trigger pandas' SettingWithCopyWarning;
# assign() returns a new DataFrame carrying the extra column instead.
credit_suisse_articles_cnbc = credit_suisse_articles_cnbc.assign(
    word_count=credit_suisse_articles_cnbc['article_content'].str.split().str.len()
)


In [34]:
# Sort the articles by date in ascending order (oldest first)
credit_suisse_articles_cnbc_sorted = credit_suisse_articles_cnbc.sort_values(by='text_date')

# Export the sorted DataFrame to a CSV file; index=False leaves the original
# row labels out of the file.
# NOTE(review): the duplicate check in the next cell runs after this export and
# only reports duplicates, so any it finds are already in the written file.
credit_suisse_articles_cnbc_sorted.to_csv('Prepped_Data/CS_CNBC_prepped.csv', index=False)

In [35]:
# Report (without removing) rows whose article_content occurs more than once;
# keep=False flags every member of a duplicate group, not only the repeats.
is_duplicate_content = credit_suisse_articles_cnbc_sorted.duplicated(subset=['article_content'], keep=False)
duplicate_indices_specific_columns = credit_suisse_articles_cnbc_sorted.loc[is_duplicate_content].index
print("Indices of Duplicate Rows Based on Specific Columns:", duplicate_indices_specific_columns, sep="\n")

Indices of Duplicate Rows Based on Specific Columns:
Index([], dtype='int64')


### UBS Data prep

In [46]:
# Load the scraped CNBC articles on UBS from CSV (the file is the CNBC scrape, not SRF).
# The path is relative, so it is resolved against the current working directory.
file_path = "Data/CNBC/CNBC_scraped_data_UBS_23_04.csv"
df_cnbc_ubs = pd.read_csv(file_path)

In [47]:
df_cnbc_ubs.head()

Unnamed: 0,article_title,article_headline,article_content,text_date,article_url
0,'Lose-lose situation': New Swiss bank laws cou...,"In a 209-page plan published Wednesday, the Sw...",Switzerland's tough new banking regulations cr...,2024-04-11,https://www.cnbc.com/2024/04/11/lose-lose-situ...
1,Swiss banking giant UBS to launch share buybac...,"""Our ambition is for share repurchases to exce...",UBS on Tuesday announced a new share repurchas...,2024-04-02,https://www.cnbc.com/2024/04/02/swiss-banking-...
2,UBS CEO says integrating Credit Suisse will be...,Credit Suisse collapsed in March 2023 followin...,In this article The mammoth integration of fai...,2024-04-05,https://www.cnbc.com/2024/04/05/ubs-ceo-says-c...
3,UBS chief's surprise return to the Swiss banki...,The bank announced in late March that Ermotti ...,UBS CEO Sergio Ermotti earned 14.4 million Swi...,,https://www.cnbc.com/2024/03/28/ubs-chiefs-sur...
4,"UBS beats earnings expectations, announces up ...",The group posted a net loss attributable to sh...,Swiss banking giant UBS on Tuesday narrowly be...,2024-02-06,https://www.cnbc.com/2024/02/06/ubs-beats-earn...


#### fill missing values in text_date column

In [48]:
# Fill missing dates: an existing text_date is kept as is; a missing one is
# replaced by the date embedded in that row's article URL.
df_cnbc_ubs['text_date'] = [
    extract_date_from_url(url) if pd.isna(date) else date
    for date, url in zip(df_cnbc_ubs['text_date'], df_cnbc_ubs['article_url'])
]

In [49]:
df_cnbc_ubs.head()

Unnamed: 0,article_title,article_headline,article_content,text_date,article_url
0,'Lose-lose situation': New Swiss bank laws cou...,"In a 209-page plan published Wednesday, the Sw...",Switzerland's tough new banking regulations cr...,2024-04-11,https://www.cnbc.com/2024/04/11/lose-lose-situ...
1,Swiss banking giant UBS to launch share buybac...,"""Our ambition is for share repurchases to exce...",UBS on Tuesday announced a new share repurchas...,2024-04-02,https://www.cnbc.com/2024/04/02/swiss-banking-...
2,UBS CEO says integrating Credit Suisse will be...,Credit Suisse collapsed in March 2023 followin...,In this article The mammoth integration of fai...,2024-04-05,https://www.cnbc.com/2024/04/05/ubs-ceo-says-c...
3,UBS chief's surprise return to the Swiss banki...,The bank announced in late March that Ermotti ...,UBS CEO Sergio Ermotti earned 14.4 million Swi...,2024-03-28,https://www.cnbc.com/2024/03/28/ubs-chiefs-sur...
4,"UBS beats earnings expectations, announces up ...",The group posted a net loss attributable to sh...,Swiss banking giant UBS on Tuesday narrowly be...,2024-02-06,https://www.cnbc.com/2024/02/06/ubs-beats-earn...


In [50]:
len(df_cnbc_ubs)

217

In [51]:
# Parse 'text_date' into timezone-aware (UTC) datetimes
df_cnbc_ubs['text_date'] = pd.to_datetime(df_cnbc_ubs['text_date'], utc=True)

# Bounds of the desired date range (both ends are inclusive in the filter below)
start_date = pd.Timestamp('2019-01-01', tz='UTC')
end_date = pd.Timestamp('2023-06-30', tz='UTC')
# NOTE(review): end_date is midnight UTC, so a row stamped later on 2023-06-30
# would be dropped -- confirm that text_date never carries a time of day.

# Keep only the articles whose date falls inside the range
filtered_df_cnbc_ubs = df_cnbc_ubs[df_cnbc_ubs['text_date'].between(start_date, end_date)]

In [52]:
len(filtered_df_cnbc_ubs)

127

#### filter by relevancy

##### old code

In [44]:
# Create a copy of the filtered DataFrame
#filtered_df_cnbc_ubs_copy = filtered_df_cnbc_ubs.copy()

# Count the occurrences of the phrases 'Credit Suisse' and 'CS' in each article's content in the copied DataFrame
#filtered_df_cnbc_ubs_copy['UBS_count'] = filtered_df_cnbc_ubs_copy['article_content'].str.count('UBS')

# Filter articles containing 'Credit Suisse' at least twice in their content
#UBS_articles_cnbc = filtered_df_cnbc_ubs_copy[filtered_df_cnbc_ubs_copy['UBS_count'] >= 2]

##### new code

In [53]:
# Create a copy of the filtered DataFrame so the count column added below
# is written to an independent object rather than to the filtered selection
filtered_df_cnbc_ubs_copy = filtered_df_cnbc_ubs.copy()

# Count the standalone occurrences of 'UBS' in each article's content in the copied DataFrame
def count_ubs(text):
    """Count standalone, upper-case occurrences of "UBS" in ``text``.

    An occurrence counts only when it is not directly preceded or followed by
    an ASCII letter, so "UBS." and "(UBS)" match while "SUBS" and "UBSx" do
    not. The match is case-sensitive.

    Parameters
    ----------
    text : str
        Article text. Non-string values (e.g. ``NaN`` for an article whose
        content is missing) count as zero occurrences.

    Returns
    -------
    int
        Number of non-overlapping matches.
    """
    # pandas represents a missing cell as float NaN, which re.findall rejects
    # with TypeError; treat it as "no occurrences" instead.
    if not isinstance(text, str):
        return 0
    return len(re.findall(r'(?<![a-zA-Z])UBS(?![a-zA-Z])', text))
    
# Store, per article, the number of standalone "UBS" occurrences in its content
ubs_article_text = filtered_df_cnbc_ubs_copy['article_content']
filtered_df_cnbc_ubs_copy['UBS_count'] = ubs_article_text.apply(count_ubs)

# Keep only the articles that mention 'UBS' at least twice
UBS_articles_cnbc = filtered_df_cnbc_ubs_copy.loc[filtered_df_cnbc_ubs_copy['UBS_count'].ge(2)]

In [54]:
len(UBS_articles_cnbc)

110

In [56]:
# Add a 'word_count' column to the UBS articles: number of whitespace-separated
# tokens per article. UBS_articles_cnbc comes from boolean filtering, so
# writing a new column into it with .loc can trigger pandas'
# SettingWithCopyWarning; assign() returns a new DataFrame carrying the extra
# column instead.
UBS_articles_cnbc = UBS_articles_cnbc.assign(
    word_count=UBS_articles_cnbc['article_content'].str.split().str.len()
)

In [57]:
# Sort the articles by date in ascending order (oldest first)
UBS_articles_cnbc_sorted = UBS_articles_cnbc.sort_values(by='text_date')

# Export the sorted DataFrame to a CSV file; index=False leaves the original
# row labels out of the file.
# NOTE(review): the duplicate check in the next cell runs after this export and
# only reports duplicates, so any it finds are already in the written file.
UBS_articles_cnbc_sorted.to_csv('Prepped_Data/UBS_CNBC_prepped.csv', index=False)

In [59]:
# Report (without removing) rows whose article_content occurs more than once;
# keep=False flags every member of a duplicate group, not only the repeats.
is_duplicate_content = UBS_articles_cnbc_sorted.duplicated(subset=['article_content'], keep=False)
duplicate_indices_specific_columns = UBS_articles_cnbc_sorted.loc[is_duplicate_content].index
print("Indices of Duplicate Rows Based on Specific Columns:", duplicate_indices_specific_columns, sep="\n")

Indices of Duplicate Rows Based on Specific Columns:
Index([], dtype='int64')
