In [16]:
from bs4 import BeautifulSoup
import requests 
from datetime import date
import random
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [17]:
# Load the labelled headlines, discard the helper/duplicate columns, and turn
# the label column into integers (rows without a label become 0).
dataset = (
    pd.read_csv('TestTask_imp.csv')
    .drop(columns=['Unnamed: 0', 'Unnamed: 4', 'Unnamed: 5', 'label.1'])
    .assign(label=lambda frame: frame['label'].fillna(0).astype(int))
)
dataset

Unnamed: 0,label,header
0,1,Manchester United PLC Reports First Quarter Fi...
1,0,Fed Minutes Due Next Week
2,0,Manchester United plc Announces First Quarter ...
3,0,NuCana Announces FDA Clearance to Commence Pha...
4,0,Elucid - mHealth smart pill bottle to monitor ...
...,...,...
1614,0,"Nuclear Energy Market Insight Inflation, Massi..."
1615,0,E.ON Secures Tax Equity Financing from GE Ener...
1616,0,DGAP-News: E.ON SE: E.ON closes purchase of in...
1617,0,Parallel NOR Flash Market: Emerging Players Se...


In [18]:
# TF-IDF features (unigrams to trigrams, English stop words removed, capped at
# 500 terms) feeding a logistic-regression classifier.
tfidf = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 3),
    max_features=500,
    lowercase=True,
)
lr = LogisticRegression(C=1.0, solver='lbfgs', random_state=43)
pipe = Pipeline(steps=[('tfidf', tfidf), ('lr', lr)])

# Stratified 10-fold cross-validation with a fixed shuffle seed; the cell
# displays one F1 score per fold as a single-row table.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=43)
fold_f1 = cross_val_score(pipe, dataset.header, dataset.label, cv=skf, scoring='f1')
pd.DataFrame([fold_f1])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.722222,0.777778,0.8,0.777778,0.864865,0.923077,0.864865,0.756757,0.516129,0.756757


In [19]:
# Refit the pipeline on every labelled headline so the later cells can call
# predict / predict_proba on it.
pipe.fit(dataset['header'], dataset['label'])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=500,
                                 min_df=1, ngram_range=(1, 3), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scal

In [20]:
# Spot-check the fitted pipeline on two hand-written headlines. Each output row
# holds one probability per class, in the classifier's class order.
samples = [' report', 'trading report']
pipe.predict_proba(samples)

array([[0.74125854, 0.25874146],
       [0.13490571, 0.86509429]])

In [21]:
def get_companys_earnings_releases(
    pipe,
    day=None,
    month=None,
    index=None
    ):
    """Find an earnings press release for a company on MarketWatch's earnings calendar.

    Downloads the earnings calendar page, picks one company linked under the
    requested day, downloads that company's page, classifies its press-release
    headlines with ``pipe`` and returns the first headline predicted as ``1``.

    Parameters
    ----------
    pipe : object exposing ``predict(list_of_str)``
        Fitted classifier; a prediction equal to ``1`` marks an earnings release.
    day : str, optional
        Day part of the calendar section id (e.g. ``'25'``). Defaults to
        today's zero-padded day of month, resolved when the function is called.
    month : str, optional
        Month part of the calendar section id (e.g. ``'Nov'``). Defaults to the
        first three letters of today's month name, resolved at call time.
    index : int, optional
        Position of the company within the day's list. ``None`` selects a
        random company; ``0`` selects the first one; a value past the end
        selects the last one.

    Returns
    -------
    dict or None
        ``{'url': ..., 'name': ...}`` for the first headline classified as an
        earnings release. ``None`` (after printing a short message) when the
        day has no companies, the company has no press releases, or none of
        the headlines is classified as an earnings release.

    Notes
    -----
    Network errors raised by ``requests`` (including timeouts) and errors
    raised by ``pipe.predict`` are not caught here.
    """
    # Resolve date defaults per call: defaults written in the signature would be
    # computed once at definition time and go stale in a long-running kernel.
    today = date.today()
    if day is None:
        day = today.strftime("%d")
    if month is None:
        month = today.strftime("%B")[:3]

    request_timeout = 30  # seconds; avoids hanging forever on a stalled connection

    url = 'https://www.marketwatch.com/tools/earningscalendar'
    calendar_soup = BeautifulSoup(requests.get(url, timeout=request_timeout).text, 'lxml')
    # find() returns None when the page has no section for this date.
    day_section = calendar_soup.find("div", {"id": month + day + "page"})
    companies = day_section.find_all('a') if day_section is not None else []
    if not companies:
        print("No companies available for selected date.")
        return None

    if index is None:
        selected_company = random.choice(companies)
    else:
        try:
            # Clamp to the last valid position; `index is None` (not truthiness)
            # is checked above so that index=0 selects the first company.
            selected_company = companies[min(index, len(companies) - 1)]
        except IndexError:
            # Only reachable for a negative index beyond the start of the list.
            print("No companies available for selected date.")
            return None

    company_full_info = {
        'company': selected_company.text,
        'url': 'https://www.marketwatch.com' + selected_company['href'],
    }
    company_soup = BeautifulSoup(
        requests.get(company_full_info['url'], timeout=request_timeout).text, 'lxml'
    )
    releases_section = company_soup.find(
        "div",
        {"class": "collection__list j-scrollElement", "data-type": "PressReleases"},
    )
    press_releases = releases_section.find_all('a') if releases_section is not None else []
    if not press_releases:
        print("No press releases for selected company: " + str(company_full_info['company']))
        return None

    press_releases_full_info = [
        {'url': link['href'], 'name': link.text.strip()} for link in press_releases
    ]
    predictions = pipe.predict([release['name'] for release in press_releases_full_info])
    # Return the first headline the classifier labels as an earnings release.
    for release, label in zip(press_releases_full_info, predictions):
        if label == 1:
            return release

    print("No earnings releases for selected company: " + str(company_full_info['company']))
    return None

In [23]:
# Example lookup: the company at position 2 in the list for the 25th. The month
# falls back to the function's default, which is derived from the current date,
# so the result depends on when this is run.
get_companys_earnings_releases(pipe, day='25', index=2)

{'url': 'https://www.marketwatch.com/press-release/beacon-roofing-supply-reports-fourth-quarter-and-fiscal-year-2019-results-2019-11-25?mod=mw_quote_news',
 'name': 'Beacon Roofing Supply Reports Fourth Quarter and Fiscal Year 2019 Results'}