# Exploratory Data Analysis for Financial Data at Nova Financial Solutions
## 1. Import Libraries and utility functions

In [1]:
import sys
import os

notebook_dir = os.getcwd()  # Get current working directory (notebooks folder)
project_dir = os.path.abspath(os.path.join(notebook_dir, '..'))  # Go up one level
scripts_dir = os.path.join(project_dir, 'src')  # Path to scripts
sys.path.append(scripts_dir)


In [2]:
from IPython.display import display
from load_data import load_data
import pandas as pd

In [3]:
import importlib
import eda_data
importlib.reload(eda_data)
from eda_data import (data_overview, descriptive_statistics, text_analysis, time_series_analysis, publisher_analysis)

## 2. Importing Financial News and Stock Price Integration Data

In [4]:
# Load data
df = load_data('../data/raw/raw_analyst_ratings.csv')

## 3. Data Overviewing

In [5]:
overview = data_overview(df)
print(overview)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1407328 entries, 0 to 1407327
Data columns (total 6 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   Unnamed: 0  1407328 non-null  int64 
 1   headline    1407328 non-null  object
 2   url         1407328 non-null  object
 3   publisher   1407328 non-null  object
 4   date        1407328 non-null  object
 5   stock       1407328 non-null  object
dtypes: int64(1), object(5)
memory usage: 64.4+ MB
{'info': None, 'type': Unnamed: 0     int64
headline      object
url           object
publisher     object
date          object
stock         object
dtype: object, 'shape': (1407328, 6), 'isnull': Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64}


In [6]:
#Droping unamed column
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,headline,url,publisher,date,stock
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 10:30:54-04:00,A
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 10:45:20-04:00,A
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 04:30:07-04:00,A
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 12:45:06-04:00,A
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 11:38:59-04:00,A


## 1. Descriptive Statistics

In [7]:
ds = descriptive_statistics(df)

In [8]:
#Headline length description
print("Headline Length",ds['headline_length'])

Headline Length count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64


In [9]:
#Active publishers description
print("active_publishers",ds['active_publishers'])

active_publishers count      1034.000000
mean       1361.052224
std       11501.954263
min           1.000000
25%           3.000000
50%          11.000000
75%          93.500000
max      228373.000000
Name: count, dtype: float64


In [10]:
# Frequency per day
print("date_counts",ds['date_counts'])

date_counts date_trend
2009-02-14      1
2009-04-27      2
2009-04-29      1
2009-05-22      1
2009-05-27      6
             ... 
2020-06-07     25
2020-06-08    765
2020-06-09    803
2020-06-10    807
2020-06-11    544
Name: count, Length: 3955, dtype: int64


In [11]:
# Frequency per hr
print("time_counts", ds['time_counts'])

time_counts hour_trend
0     1351472
1          82
2          48
3          27
4          67
5          14
6          57
7          93
8        1469
9        1829
10       2476
11       5033
12       5527
13       5965
14       7669
15       5701
16       5732
17       2710
18       2075
19       1612
20       3939
21       2800
22        704
23        227
Name: count, dtype: int64


## 2. Text Analysis

In [12]:
# Text analysis
txt_analysis = text_analysis(df)
print(txt_analysis)

{'noun_phrases': [('vs $', 53977), ("benzinga 's", 43860), ('announces $', 23971), ('raises pt', 21124), ('top upgrades', 21108), ('eps $', 15098), ('52-week highs', 14638), ('raises price target', 14562)], 'combined_keywords': [('vs $', 53977), ("benzinga 's", 43860), ('announces $', 23971), ('raises pt', 21124), ('top upgrades', 21108), ('eps $', 15098), ('52-week highs', 14638), ('raises price target', 14562)], 'error_message': []}


## 3. Time Series Analysis

In [16]:
#Time series analysis
daily_freq, hourly_freq, day_freq, month_freq = time_series_analysis(df)
print(month_freq)

month
April        121813
August       124041
December     105123
February     122836
January      121545
July         110764
June         106598
March        121949
May          130340
November     121430
October      124800
September     96089
Name: count, dtype: int64


## 4. Publisher Analysis


In [14]:
#Publisher Analysis
pub_analysis = publisher_analysis(df)
print(pub_analysis)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  email_publishers['domain'] = email_publishers['publisher'].apply(


{'top_publishers': publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64, 'top_domains': domain
benzinga.com              7937
gmail.com                  139
andyswan.com                 5
investdiva.com               2
tothetick.com                2
eosdetroit.io                1
forextraininggroup.com       1
stockmetrix.net              1
Name: count, dtype: int64}
