In [185]:
import random
import re
from datetime import date
from itertools import chain

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

### Collecting the data

In [189]:
def get_data(day=None, month=None):
    """Collect press-release links for every company listed on one calendar day.

    Fetches the MarketWatch earnings calendar, follows the link of each company
    listed under the requested day, and gathers the anchors found in that
    company's press-release news widget.

    Args:
        day: Day-of-month string used in the calendar element id
            (``month + day + "page"``). Defaults to today, formatted with
            ``%d``, resolved when the function is called.
        month: Month prefix used in the same element id. Defaults to the first
            three letters of today's ``%B`` month name, resolved at call time.

    Returns:
        A list with one entry per company that has a press-release widget; each
        entry is the collection of ``<a>`` tags found in that widget. Returns an
        empty list when the calendar has no section for the requested date.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including a timeout).
    """
    # Resolve the defaults at call time: a `date.today()` default in the
    # signature is evaluated once, when the cell defining the function runs.
    today = date.today()
    day = today.strftime("%d") if day is None else day
    month = today.strftime("%B")[:3] if month is None else month

    # Without a timeout a stalled connection would block the kernel forever.
    request_timeout = 30

    url = 'https://www.marketwatch.com/tools/earningscalendar'
    calendar_soup = BeautifulSoup(requests.get(url, timeout=request_timeout).text, 'lxml')
    day_section = calendar_soup.find("div", {"id": month + day + "page"})
    if day_section is None:
        # No section for this date on the calendar page.
        return []

    full_companies = [
        {'company': company.text, 'url': 'https://www.marketwatch.com' + company['href']}
        for company in day_section.find_all('a')
    ]
    companies_soup = [
        BeautifulSoup(requests.get(company["url"], timeout=request_timeout).text, 'lxml')
        for company in full_companies
    ]

    press_release_widgets = [
        soup.find('mw-scrollable-news', {'data-channel': re.compile('.*pressrelease.*')})
        for soup in companies_soup
    ]

    # Companies without a press-release widget yield None and are skipped.
    # (The former clean-up loop removed items equal to `[]` while iterating the
    # same list; link tags never compare equal to a list, so it did nothing.)
    return [widget.find_all('a') for widget in press_release_widgets if widget is not None]

In [190]:
# Disabled one-off scraping cell. The code below is wrapped in a string literal,
# so the cell only evaluates to that string and none of the statements run.
# If executed, it would call get_data() for days 18-29 (skipping 23, 24, 28),
# flatten the nested link lists, build a DataFrame with 'url', 'header' and an
# empty 'label' column, and write it to CSV with and without the 'url' column.
'''
results= []
for j in range(18,30):
    if j not in(23,24,28):
        print(j)
        results.append(get_data(day=str(j)))
raw_data = list(chain(*list(chain(*results))))
data = np.array([(row['href'],  row.text.strip()) for row in raw_data])
data = pd.DataFrame(data)
data.columns= ['url', 'header']
data['label'] = ['' for el in data.url]
data.to_csv(r'TestTask_Train.csv')
df =data.drop('url', axis=1)
df = df.drop_duplicates()
df.to_csv(r'TestTask__________.csv')
'''

"\nresults= []\nfor j in range(18,30):\n    if j not in(23,24,28):\n        print(j)\n        results.append(get_data(day=str(j)))\nraw_data = list(chain(*list(chain(*results))))\ndata = np.array([(row['href'],  row.text.strip()) for row in raw_data])\ndata = pd.DataFrame(data)\ndata.columns= ['url', 'header']\ndata['label'] = ['' for el in data.url]\ndata.to_csv(r'TestTask_Train.csv')\ndf =data.drop('url', axis=1)\ndf = df.drop_duplicates()\ndf.to_csv(r'TestTask__________.csv')\n"

### Loading and inspecting the labelled data

In [191]:
# Read the labelled headlines and discard the index / helper columns that
# are not used for modelling.
dataset = (
    pd.read_csv('TestTask_imp.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 4', 'Unnamed: 5', 'label.1'])
)
# A missing label is treated as class 0; labels are stored as integers.
dataset['label'] = dataset['label'].fillna(0).astype(int)
dataset

Unnamed: 0,label,header
0,1,Manchester United PLC Reports First Quarter Fi...
1,0,Fed Minutes Due Next Week
2,0,Manchester United plc Announces First Quarter ...
3,0,NuCana Announces FDA Clearance to Commence Pha...
4,0,Elucid - mHealth smart pill bottle to monitor ...
...,...,...
1614,0,"Nuclear Energy Market Insight Inflation, Massi..."
1615,0,E.ON Secures Tax Equity Financing from GE Ener...
1616,0,DGAP-News: E.ON SE: E.ON closes purchase of in...
1617,0,Parallel NOR Flash Market: Emerging Players Se...


### Building and evaluating models

In [None]:
# Text features: lower-cased word 1- to 3-grams with English stop words
# removed, capped at 500 features.
tfidf = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=500,
)
# Shuffled, seeded 10-fold stratified splitter used for every
# cross-validation run below.
skf = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=43,
)

In [170]:
# Logistic-regression model on TF-IDF features, scored with 10-fold F1.
lr = LogisticRegression(C=1.0, random_state=43, solver='lbfgs')
lr_pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the vectorizer, so
    # fitting another pipeline can never refit the one embedded here.
    ('tfidf', clone(tfidf)),
    ('lr', lr)
])
lr_cv_res = cross_val_score(lr_pipe, dataset.header, dataset.label, scoring='f1', cv=skf)
np.mean(lr_cv_res)

0.7760226976356008

In [171]:
# Random-forest model on TF-IDF features, scored with 10-fold F1.
rfc = RandomForestClassifier(n_estimators=13, max_depth=51, random_state=43)
rfc_pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the vectorizer, so
    # fitting another pipeline can never refit the one embedded here.
    ('tfidf', clone(tfidf)),
    ('rfc', rfc)
])
rfc_cv_res = cross_val_score(rfc_pipe, dataset.header, dataset.label, scoring='f1', cv=skf)
np.mean(rfc_cv_res)

0.8458651309615576

In [172]:
# Gradient-boosted trees (XGBoost) on TF-IDF features, scored with 10-fold F1.
xgb = XGBClassifier(max_depth=14, gamma=0.1, min_child_weight=1, random_state=43)
xgb_pipe = Pipeline([
    # clone() gives this pipeline its own unfitted copy of the vectorizer, so
    # fitting another pipeline can never refit the one embedded here.
    ('tfidf', clone(tfidf)),
    ('xgb', xgb)
])
xgb_cv_res = cross_val_score(xgb_pipe, dataset.header, dataset.label, scoring='f1', cv=skf)
np.mean(xgb_cv_res)

0.8466538386929765

In [174]:
# Fit every pipeline on the complete labelled dataset.
lr_pipe.fit(dataset['header'], dataset['label'])
rfc_pipe.fit(dataset['header'], dataset['label'])
xgb_pipe.fit(dataset['header'], dataset['label'])

# Quick sanity check on two hand-written headlines; the cell displays one
# prediction array per model, in the order lr, rfc, xgb.
samples = [
    'random sentence about earnings report',
    'company reports fourth quarter results',
]
tuple(model.predict(samples) for model in (lr_pipe, rfc_pipe, xgb_pipe))

(array([0, 1]), array([0, 1]), array([0, 1]))

### Main function

In [192]:
def get_companys_earnings_releases(pipe, day=None, month=None, index=None):
    """Return the first press release of one company that `pipe` labels as earnings.

    Picks a company from the MarketWatch earnings calendar for the given day,
    downloads its page, classifies the press-release headlines with `pipe`, and
    returns the first headline predicted as class 1.

    Args:
        pipe: Fitted model exposing ``predict(list_of_str)``; a prediction equal
            to 1 marks an earnings release.
        day: Day-of-month string used in the calendar element id
            (``month + day + "page"``). Defaults to today, formatted with
            ``%d``, resolved when the function is called.
        month: Month prefix used in the same element id. Defaults to the first
            three letters of today's ``%B`` month name, resolved at call time.
        index: Position of the company in the day's list. Out-of-range values
            are clamped to the nearest valid position. ``None`` (the default)
            picks a company at random.

    Returns:
        ``{'url': ..., 'name': ...}`` for the first matching press release, or
        ``None`` (after printing a short explanation) when the date has no
        companies, the company has no press releases, or no headline is
        classified as an earnings release.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched
            (including a timeout).
    """
    # Resolve the defaults at call time: a `date.today()` default in the
    # signature is evaluated once, when the cell defining the function runs.
    today = date.today()
    day = today.strftime("%d") if day is None else day
    month = today.strftime("%B")[:3] if month is None else month

    # Without a timeout a stalled connection would block the kernel forever.
    request_timeout = 30

    url = 'https://www.marketwatch.com/tools/earningscalendar'
    calendar_soup = BeautifulSoup(requests.get(url, timeout=request_timeout).text, 'lxml')
    day_section = calendar_soup.find("div", {"id": month + day + "page"})
    companies = day_section.find_all('a') if day_section is not None else []
    if not companies:
        print("No companies available for selected date.")
        return None

    if index is None:
        selected_company = random.choice(companies)
    else:
        # `is None` above (instead of truthiness) lets index=0 select the first
        # company; clamping keeps any integer inside the valid range.
        position = max(-len(companies), min(index, len(companies) - 1))
        selected_company = companies[position]

    company_full_info = {
        'company': selected_company.text,
        'url': 'https://www.marketwatch.com' + selected_company['href'],
    }
    company_soup = BeautifulSoup(
        requests.get(company_full_info["url"], timeout=request_timeout).text, 'lxml'
    )
    releases_section = company_soup.find(
        "div", {"class": "collection__list j-scrollElement", "data-type": "PressReleases"}
    )
    press_releases = releases_section.find_all('a') if releases_section is not None else []
    if not press_releases:
        print("No press releases for selected company: " + str(company_full_info['company']))
        return None

    press_releases_full_info = [
        {'url': row['href'], 'name': row.text.strip()} for row in press_releases
    ]
    # Explicit checks replace the former blanket except-clauses, so a failure
    # inside `pipe.predict` is no longer reported as "no earnings releases".
    earnings_releases = pipe.predict([info['name'] for info in press_releases_full_info])
    for info, label in zip(press_releases_full_info, earnings_releases):
        if label == 1:
            return info

    print("No earnings releases for selected company: " + str(company_full_info['company']))
    return None

### Comparing models on real data

In [181]:
# Run the same lookup (index=2, day='25') once per fitted pipeline and show
# the three results as rows of a single table.
pd.DataFrame([
    get_companys_earnings_releases(model, index=2, day='25')
    for model in (lr_pipe, rfc_pipe, xgb_pipe)
])

Unnamed: 0,url,name
0,https://www.marketwatch.com/press-release/beac...,Beacon Roofing Supply Reports Fourth Quarter a...
1,https://www.marketwatch.com/press-release/beac...,Beacon Roofing Supply Reports Fourth Quarter a...
2,https://www.marketwatch.com/press-release/beac...,Beacon Roofing Supply Reports Fourth Quarter a...


In [57]:
# Default call: today's calendar section and a randomly chosen company,
# classified with the logistic-regression pipeline.
get_companys_earnings_releases(lr_pipe)

{'url': 'https://www.marketwatch.com/press-release/fro---third-quarter-and-nine-months-2019-results-2019-11-27?mod=mw_quote_news',
 'name': 'FRO - Third Quarter and Nine Months 2019 Results'}