# Scrape Webull via copy-paste
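This approach is used for the AMC (2020-2022) and ZM (2020) news headlines: the Webull news feed is copied and pasted into a text file by hand and then parsed below. Faster and more efficient!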

In [22]:
import pandas as pd

In [132]:
# start with an empty frame holding the two target columns
df = pd.DataFrame(columns=['date', 'news_headline'])

# load the manually copied text, keeping one list entry per raw line
# (each entry still carries its trailing newline)
with open('2020-zoom-manual-scrape.txt', 'r') as scrape_file:
    lines = list(scrape_file)

In [133]:
# Pair every date line with the headline printed on the line directly above it.
# A line counts as a date line when it contains exactly two '/' characters.
#
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# call. Rebuilding `df` from scratch also makes this cell safe to re-run.
records = []

# zip(lines, lines[1:]) starts at the second line, so a date on the very first
# line is skipped instead of being paired with the *last* line of the file
# (which is what lines[i-1] did for i == 0).
for headline_line, date_line in zip(lines, lines[1:]):
    if date_line.count('/') != 2:
        continue
    records.append({
        # the date is taken by fixed position from the end of the line;
        # NOTE(review): assumes the line ends with a 10-character date followed
        # by 7 trailing characters - other layouts produce a string that the
        # date validation further down turns into NaT
        'date': date_line[-17:-7],
        'news_headline': headline_line.strip(),
    })

df = pd.DataFrame(records, columns=['date', 'news_headline'])

In [134]:
# preview the first five parsed rows
display(df.head(5))

Unnamed: 0,date,news_headline
0,12/31/2020,Tesla topped both the S&P 500 and Nasdaq 100 b...
1,12/31/2020,"Every week, Benzinga conducts a sentiment surv..."
2,12/31/2020,The tech sector outperformed in 2020 as a numb...
3,12/31/2020,Plaintiffs accusing Zoom Video Communications ...
4,12/31/2020,Loup Ventures has published its annual list of...


# Check for invalid dates

In [135]:
# parse the date strings as MM/DD/YYYY; values that do not match become NaT
df['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y', errors='coerce')
display(df.head())

# report every row whose date could not be parsed
mask = df['date'].isna()
if not mask.any():
    print('All rows have a valid date.')
else:
    print('The following rows have an invalid date:')
    display(df[mask])

Unnamed: 0,date,news_headline
0,2020-12-31,Tesla topped both the S&P 500 and Nasdaq 100 b...
1,2020-12-31,"Every week, Benzinga conducts a sentiment surv..."
2,2020-12-31,The tech sector outperformed in 2020 as a numb...
3,2020-12-31,Plaintiffs accusing Zoom Video Communications ...
4,2020-12-31,Loup Ventures has published its annual list of...


The following rows have an invalid date:


Unnamed: 0,date,news_headline
197,NaT,Snowflake IPO: In-Depth Analysis
199,NaT,Bandwidth: Leading The Transition In Cloud-Bas...
202,NaT,AudioCodes: The Sounds Of FCF Generation
234,NaT,Oracle Highlights Integration Between Oracle F...
304,NaT,Don't Buy High And Sell Low: What Today's Inve...
357,NaT,Wall Street Breakfast: High Profile Stock Splits
652,NaT,Last year's Bay Area IPO performances show why...


In [136]:
# keep only the rows whose date parses as MM/DD/YYYY
parsed_dates = pd.to_datetime(df['date'], format='%m/%d/%Y', errors='coerce')
df = df[parsed_dates.notna()]

In [137]:
# confirm that no unparsed (NaT) dates are left after the removal step
mask = df['date'].isna()
if not mask.any():
    print('All rows have a valid date.')
else:
    print('The following rows have an invalid date:')
    display(df[mask])

All rows have a valid date.


# Check for invalid news headlines

## Remove empty headlines

In [138]:
# flag headlines that are strings made up of nothing but whitespace
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and not headline.strip()
)

# list the blank rows, or confirm that there are none
if not mask.any():
    print('There are no empty rows.')
else:
    print('The following rows are empty:')
    display(df[mask])

There are no empty rows.


In [139]:
# keep only rows whose headline is a string with visible text
has_text = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and bool(headline.strip())
)
df = df[has_text]

In [140]:
# verify that no whitespace-only headlines survived the removal step
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and not headline.strip()
)

# list the blank rows, or confirm that there are none
if not mask.any():
    print('There are no empty rows.')
else:
    print('The following rows are empty:')
    display(df[mask])

There are no empty rows.


## Remove headlines with links

In [141]:
# flag headlines that are strings containing 'http', i.e. pasted URLs
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and 'http' in headline
)

# list the rows that contain a link, or confirm that there are none
if not mask.any():
    print('There are no rows with links.')
else:
    print('The following rows have links:')
    display(df[mask])

The following rows have links:


Unnamed: 0,date,news_headline
123,2020-11-13,https://baltimore.cbslocal.com/2020/11/13/covi...
161,2020-10-15,https://9to5google.com/2020/10/15/hangouts-fre...
174,2020-10-14,https://investors.zoom.us/static-files/cc304d8...
212,2020-10-06,https://about.fb.com/news/2020/10/bringing-net...
276,2020-09-09,https://citronresearch.com/wp-content/uploads/...
372,2020-08-24,https://www.wftv.com/news/education/florida-ju...
414,2020-07-23,https://www.cnbc.com/2020/07/23/cdc-guidelines...
417,2020-07-22,https://www.seattletimes.com/seattle-news/educ...
430,2020-07-08,https://techcommunity.microsoft.com/t5/microso...
442,2020-07-01,https://blog.zoom.us/ceo-report-90-days-done-w...


In [142]:
# keep only rows whose headline is a string without 'http' in it
link_free = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and 'http' not in headline
)
df = df[link_free]

In [143]:
# verify that no headline containing 'http' survived the removal step
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and 'http' in headline
)

# list the rows that contain a link, or confirm that there are none
if not mask.any():
    print('There are no rows with link.')
else:
    print('The following rows have links:')
    display(df[mask])

There are no rows with link.


## Remove headlines containing '...'

In [144]:
# flag headlines that are strings containing a literal '...'
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and '...' in headline
)

# list the rows containing '...', or confirm that there are none
if not mask.any():
    print("There are no rows with '...' in the text")
else:
    print("The following rows have '...' in the text:")
    display(df[mask])

The following rows have '...' in the text:


Unnamed: 0,date,news_headline
47,2020-12-09,Things could get &#39;interesting&#39; if some...
65,2020-12-01,Avoid the long side until the dust settles....ZM
95,2020-11-30,"Remember, if you understand markets, this has ..."
105,2020-11-24,Let's decide if investors should consider buyi...
142,2020-10-21,An extended stock with slowing momentum is not...
260,2020-09-17,ZM could resume its advance after some further...
274,2020-09-09,Rev Shark is Signing In to Zoom Video Stock Ag...
366,2020-08-26,Here are three tech stocks with strong growth ...


In [145]:
# keep only rows whose headline is a string without a literal '...'
ellipsis_free = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and '...' not in headline
)
df = df[ellipsis_free]

In [146]:
# verify that no headline containing '...' survived the removal step
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and '...' in headline
)

# list the rows containing '...', or confirm that there are none
if not mask.any():
    print("There are no rows with '...' in the text")
else:
    print("The following rows have '...' in the text:")
    display(df[mask])

There are no rows with '...' in the text


## Remove headlines with 4 words or fewer

In [147]:
# flag headlines that are strings of at most 4 whitespace-separated words
mask = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and len(headline.split()) <= 4
)

# list the short rows, or confirm that there are none
if not mask.any():
    print('No rows have 4 words or less in the News Headline.')
else:
    print('The following rows have 4 words or less in the News Headline:')
    display(df[mask])

The following rows have 4 words or less in the News Headline:


Unnamed: 0,date,news_headline
29,2020-12-18,-Reuters
60,2020-12-02,Is Zoom's Rally Stalling?
97,2020-11-30,By Geoffrey Smith
164,2020-10-15,By Christiana Sciaudone
171,2020-10-14,-Reuters
172,2020-10-14,-Reuters
175,2020-10-14,-Reuters
214,2020-10-06,-Bloomberg
309,2020-09-03,By Christiana Sciaudone
374,2020-08-24,-Reuters


In [148]:
# keep only rows whose headline is a string of more than 4 words
long_enough = df['news_headline'].apply(
    lambda headline: isinstance(headline, str) and len(headline.split()) > 4
)
df = df[long_enough]

In [149]:
# Verify that no headline of 4 words or fewer survived the removal step.
# The threshold is 4 so that it matches both the printed message and the
# "4 words or less" check made before the removal; the previous `<= 3` would
# have let a remaining 4-word headline go unreported.
mask = df['news_headline'].apply(lambda x: isinstance(x, str) and len(x.split()) <= 4)
if mask.any():
    print('The following rows have 4 words or less in the News Headline:')
    display(df[mask])
else:
    print('No rows have 4 words or less in the News Headline.')

No rows have 4 words or less in the News Headline.


# Save cleaned dataframe

In [150]:
# write the cleaned headlines to disk, leaving out the row index column
df.to_csv(path_or_buf="2020_webull_zoom_stock_news.csv", index=False)