In [228]:
import requests
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler


In [2]:
api_key = "15f343224408961aeb24d8b8cabde1be"
base_url = "https://financialmodelingprep.com/api/v3/"


In [257]:
#getting symbols that are tradable and also have financial statements

statement_symbols_list = requests.get(f"{base_url}financial-statement-symbol-lists?apikey={api_key}").json()

tradable_symbols_list = requests.get(f"{base_url}available-traded/list?apikey={api_key}")
tradable_symbols_list = tradable_symbols_list.json()
symbols_tradable = [obj['symbol'] for obj in tradable_symbols_list]

symbol_market_cap = pd.read_csv("by_market_cap.csv")
symbol_market_cap['has_statement'] = False
symbol_market_cap['is_tradable'] = False

for i in range(len(symbol_market_cap)):
     if symbol_market_cap['Symbol'][i] in symbols_tradable:
          symbol_market_cap['is_tradable'][i] = True
     if symbol_market_cap['Symbol'][i] in statement_symbolsb_list:
          symbol_market_cap['has_statement'][i] = True

target_symbols = symbol_market_cap[(symbol_market_cap['has_statement'] == True) & (symbol_market_cap['is_tradable']== True)].head(100)

In [260]:
target_symbols['Symbol']

0         AAPL
1         MSFT
2      2222.SR
3         GOOG
4         AMZN
        ...   
96          UL
97         BUD
98        SBUX
99         RTX
100         RY
Name: Symbol, Length: 100, dtype: object

In [7]:
#getting the financial analysis for each symbol
def Statement_analysis_caller(symbol, years = 10):
     response = requests.get(f"{base_url}key-metrics/{symbol}?limit={years}&apikey={api_key}").json()
     return response
#gathering historical data in one data frame
df = pd.DataFrame()
for s in target_symbols['Symbol']:
     rows = Statement_analysis_caller(s, years = 6)
     try:
          if rows:
               for i in range(len(rows)):
                    df = df.append(rows[i], ignore_index=True)
                    
               time.sleep(2)
     except KeyError as e:
          print(f"Error in processing {s}: {e}")
          continue 
# deleting everything outside 2018 to 2022
condition = df['date'].apply(lambda x: ('2022' in x) or ('2021' in x) or ('2020' in x) or ('2019' in x) or ('2018' in x))
df_new = df[condition]
df_new = df_new.reset_index(drop=True)
#deleting columns with a lot of zeros
zero_count = (df_new == 0).sum()
columns_to_remove = zero_count[zero_count > 10].index.tolist()
filtered_data = df_new.drop(columns=columns_to_remove)
filtered_data

In [11]:
#getting historical price data for a symbol
def Get_historical_data (symbol, start_date, end_date):
     response = requests.get(f"{base_url}historical-price-full/{symbol}?from={start_date}&to={end_date}&apikey={api_key}").json()
     response = response['historical']
     return response



In [261]:
filtered_data

Unnamed: 0,symbol,date,calendarYear,period,revenuePerShare,netIncomePerShare,operatingCashFlowPerShare,freeCashFlowPerShare,cashPerShare,bookValuePerShare,...,incomeQuality,grahamNumber,roic,returnOnTangibleAssets,grahamNetNet,workingCapital,tangibleAssetValue,netCurrentAssetValue,investedCapital,roe
0,AAPL,2022-09-24,2022,FY,24.317273,6.154614,7.532763,6.872426,2.978793,3.124822,...,1.223921,20.801964,0.586168,0.282924,-12.679296,-18577000000.0,50672000000.0,-166678000000.0,2.369533,1.969589
1,AAPL,2021-09-25,2021,FY,21.903541,5.669029,6.229346,5.565624,3.750553,3.777557,...,1.098838,21.950838,0.502938,0.269742,-10.978415,9355000000.0,63090000000.0,-153076000000.0,1.976843,1.500713
2,AAPL,2020-09-26,2020,FY,15.820258,3.308587,4.649230,4.228014,5.241031,3.765477,...,1.405201,16.742586,0.319077,0.177256,-7.923629,38321000000.0,65339000000.0,-114836000000.0,1.720810,0.878664
3,AAPL,2019-09-28,2019,FY,14.085283,2.991446,3.756685,3.188508,5.443948,4.898834,...,1.255809,18.158425,0.270668,0.163230,-6.012830,57101000000.0,90488000000.0,-85209000000.0,1.194048,0.610645
4,AAPL,2018-09-29,2018,FY,13.399334,3.003354,3.906565,3.234920,3.344902,5.405593,...,1.300734,19.112441,0.261218,0.162775,-7.746774,14473000000.0,107147000000.0,-127239000000.0,1.068467,0.555601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,RY,2022-10-31,2022,FY,34.748592,11.252061,15.632057,13.850992,171.406914,76.987634,...,1.388119,139.610402,0.055704,0.008318,-1103.383562,107252000000,89815000000,-1542209000000,3.934483,0.146154
494,RY,2021-10-31,2021,FY,34.778842,11.259928,42.857654,41.322912,162.197589,69.271938,...,3.803364,132.476163,0.039239,0.009484,-954.570809,143553000000,83437000000,-1354003000000,3.091692,0.162547
495,RY,2020-10-31,2020,FY,33.080626,8.028569,97.491072,95.644754,209.032140,60.863184,...,12.137711,104.854643,0.062636,0.007107,-861.121977,226169000000,70713000000,-1221507000000,1.044367,0.131912
496,RY,2019-10-31,2019,FY,32.009111,8.963053,9.942298,8.366445,116.481354,58.213146,...,1.108305,108.350214,0.066733,0.009101,-810.847350,79118000000,67715000000,-1158454000000,1.307640,0.153970


In [125]:
#correlation analysis
dimensions = filtered_data.iloc[:,4:]
non_float_columns = ['enterpriseValue', 'evToOperatingCashFlow', 'evToFreeCashFlow', 'earningsYield', 'workingCapital', 'tangibleAssetValue', 'netCurrentAssetValue']

for column in non_float_columns:
    # Convert the column to numeric, coerce non-numeric values to NaN
    dimensions[column] = pd.to_numeric(dimensions[column], errors='coerce')

# Now, you can check for NaN values in these columns
non_float_values = dimensions[non_float_columns][dimensions[non_float_columns].isna().any(axis=1)]

# Display rows with non-float values
non_float_values

Unnamed: 0,enterpriseValue,evToOperatingCashFlow,evToFreeCashFlow,earningsYield,workingCapital,tangibleAssetValue,netCurrentAssetValue


In [126]:
dimensions_corr = dimensions.corr()


# Find pairs with correlation greater than 0.9
highly_correlated_pairs = set()
for i in range(len(dimensions_corr.columns)):
    for j in range(i):
        if abs(dimensions_corr.iloc[i, j]) > 0.9 or abs(dimensions_corr.iloc[i, j]) < -0.9:
            # Add the pair of dimensions to the set
            pair = (dimensions_corr.columns[i], dimensions_corr.columns[j])
            highly_correlated_pairs.add(pair)

# Flatten the set of pairs to get a list of all dimensions involved
all_dimensions = [dimension for pair in highly_correlated_pairs for dimension in pair]

# Count the occurrences of each dimension
dimension_counts = pd.Series(all_dimensions).value_counts()

# Display the counts
print(dimension_counts)


netCurrentAssetValue          11
operatingCashFlowPerShare     11
cashPerShare                  10
revenuePerShare               10
tangibleBookValuePerShare     10
tangibleAssetValue            10
workingCapital                10
shareholdersEquityPerShare    10
grahamNumber                  10
netIncomePerShare             10
bookValuePerShare             10
roe                            2
pbRatio                        2
ptbRatio                       2
freeCashFlowPerShare           1
evToSales                      1
pocfratio                      1
evToOperatingCashFlow          1
grahamNetNet                   1
pfcfRatio                      1
debtToEquity                   1
investedCapital                1
evToFreeCashFlow               1
marketCap                      1
enterpriseValue                1
priceToSalesRatio              1
dtype: int64


In [127]:
# Filter dimensions with counts higher than 9
highly_correlated_dimensions = dimension_counts[dimension_counts >= 10]

# Get the names as an array
highly_correlated_names_array = highly_correlated_dimensions.index.values

# Display the array of names
print(highly_correlated_names_array)

['netCurrentAssetValue' 'operatingCashFlowPerShare' 'cashPerShare'
 'revenuePerShare' 'tangibleBookValuePerShare' 'tangibleAssetValue'
 'workingCapital' 'shareholdersEquityPerShare' 'grahamNumber'
 'netIncomePerShare' 'bookValuePerShare']


In [128]:
# cleaning dimensions
final_dimensions = dimensions.drop(columns=highly_correlated_names_array)
final_dimensions = pd.concat([filtered_data.iloc[:,:3],final_dimensions], axis=1)

In [129]:
final_dimensions

Unnamed: 0,symbol,date,calendarYear,freeCashFlowPerShare,interestDebtPerShare,marketCap,enterpriseValue,peRatio,priceToSalesRatio,pocfratio,...,debtToEquity,debtToAssets,netDebtToEBITDA,interestCoverage,incomeQuality,roic,returnOnTangibleAssets,grahamNetNet,investedCapital,roe
0,AAPL,2022-09-24,2022,6.872426,7.585118,2.439367e+12,2.535790e+12,24.441824,6.186138,19.970097,...,2.369533,0.340375,0.738641,40.749574,1.223921,0.586168,0.282924,-12.679296,2.369533,1.969589
1,AAPL,2021-09-25,2021,5.565624,7.626006,2.453751e+12,2.543530e+12,25.916254,6.707591,23.585141,...,1.976843,0.355323,0.746708,41.190548,1.098838,0.502938,0.269742,-10.978415,1.976843,1.500713
2,AAPL,2020-09-26,2020,4.228014,6.645240,1.948296e+12,2.022716e+12,33.935934,7.097229,24.150233,...,1.720810,0.347145,0.962195,23.072746,1.405201,0.319077,0.177256,-7.923629,1.720810,0.878664
3,AAPL,2019-09-28,2019,3.188508,6.043039,1.010474e+12,1.069677e+12,18.287144,3.883841,14.562039,...,1.194048,0.319178,0.774128,17.877517,1.255809,0.270668,0.163230,-6.012830,1.194048,0.610645
4,AAPL,2018-09-29,2018,3.234920,5.939155,1.118627e+12,1.207197e+12,18.790660,4.211777,14.446197,...,1.068467,0.313030,1.082750,21.882099,1.300734,0.261218,0.162775,-7.746774,1.068467,0.555601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,RY,2022-10-31,2022,13.850992,315.768701,1.767599e+11,4.215279e+11,11.191587,3.623986,8.055780,...,3.934483,0.221767,5.998922,2.113105,1.388119,0.055704,0.008318,-1103.383562,3.934483,0.146154
494,RY,2021-10-31,2021,41.322912,219.884536,1.842057e+11,2.957697e+11,11.485577,3.718548,3.017589,...,3.091692,0.178775,4.806514,2.532113,3.803364,0.039239,0.009484,-954.570809,3.091692,0.162547
495,RY,2020-10-31,2020,95.644754,73.429243,1.318838e+11,-1.615019e+10,11.536373,2.799843,0.950042,...,1.044367,0.055713,-9.454813,1.023918,12.137711,0.062636,0.007107,-861.121977,1.044367,0.131912
496,RY,2019-10-31,2019,8.366445,91.165260,1.520463e+11,9.720628e+10,11.823194,3.310680,10.658695,...,1.307640,0.076433,-3.207018,0.736796,1.108305,0.066733,0.009101,-810.847350,1.307640,0.153970


In [97]:
# closing_price = price_data.loc[:,['symbol','date','close']]

# closing_price['date'] = pd.to_datetime(closing_price['date'])
# final_dimensions['date'] = pd.to_datetime(final_dimensions['date'])

In [95]:
final_dimensions['date']

0     2022-09-24
1     2021-09-25
2     2020-09-26
3     2019-09-28
4     2018-09-29
         ...    
493   2022-10-31
494   2021-10-31
495   2020-10-31
496   2019-10-31
497   2018-10-31
Name: date, Length: 498, dtype: datetime64[ns]

In [177]:
#getting 1 day price data for a symbol
def Get_historical_data (symbol, start_date, end_date):
     response = requests.get(f"{base_url}historical-price-full/{symbol}?from={start_date}&to={end_date}&apikey={api_key}").json()
     if response:
          return response['historical']
     else :
          return False


Timestamp('2022-09-24 00:00:00')

In [136]:
start_date_str = final_dimensions['date'][0]

# Convert the formatted date string to a datetime.date object
start_date = datetime.strptime(start_date_str, "%Y-%m-%d").date()


datetime.date(2018, 6, 1)

In [164]:
x = final_dimensions['date'][1]
x

'2021-09-25'

In [183]:
from datetime import datetime, timedelta

date_str = final_dimensions['date'][1]
date_obj = datetime.strptime(date_str, "%Y-%m-%d")
modified_date = date_obj + timedelta(days=30)
modified_date_str = modified_date.strftime("%Y-%m-%d")

response = Get_historical_data ("AAPL", modified_date_str, modified_date_str)
response[0]['close']

148.64

In [191]:
# prices = pd.DataFrame(columns = ['symbol'date'])
final_dimensions['after'] = 1
final_dimensions['before'] = 1
for r in range(len(final_dimensions)):
     date_obj = datetime.strptime(final_dimensions.loc[r,'date'], "%Y-%m-%d")
     for i in range(30, 2, -1):
          modified_date = date_obj + timedelta(days=i)
          modified_date_str = modified_date.strftime("%Y-%m-%d")
          response = Get_historical_data("AAPL", modified_date_str, modified_date_str)
          if response != False:
               break
     final_dimensions.loc[r,'after'] = response[0]['close']
     for i in range(30, 2, -1):
          modified_date = date_obj - timedelta(days=i)
          modified_date_str = modified_date.strftime("%Y-%m-%d")
          response = Get_historical_data("AAPL", modified_date_str, modified_date_str)
          if response != False:
               break
     final_dimensions.loc[r,'before'] = response[0]['close']

In [192]:
final_dimensions

Unnamed: 0,symbol,date,calendarYear,freeCashFlowPerShare,interestDebtPerShare,marketCap,enterpriseValue,peRatio,priceToSalesRatio,pocfratio,...,netDebtToEBITDA,interestCoverage,incomeQuality,roic,returnOnTangibleAssets,grahamNetNet,investedCapital,roe,after,before
0,AAPL,2022-09-24,2022,6.872426,7.585118,2.439367e+12,2.535790e+12,24.441824,6.186138,19.970097,...,0.738641,40.749574,1.223921,0.586168,0.282924,-12.679296,2.369533,1.969589,149.45,170.03
1,AAPL,2021-09-25,2021,5.565624,7.626006,2.453751e+12,2.543530e+12,25.916254,6.707591,23.585141,...,0.746708,41.190548,1.098838,0.502938,0.269742,-10.978415,1.976843,1.500713,148.64,147.54
2,AAPL,2020-09-26,2020,4.228014,6.645240,1.948296e+12,2.022716e+12,33.935934,7.097229,24.150233,...,0.962195,23.072746,1.405201,0.319077,0.177256,-7.923629,1.720810,0.878664,115.05,125.01
3,AAPL,2019-09-28,2019,3.188508,6.043039,1.010474e+12,1.069677e+12,18.287144,3.883841,14.562039,...,0.774128,17.877517,1.255809,0.270668,0.163230,-6.012830,1.194048,0.610645,62.26,52.25
4,AAPL,2018-09-29,2018,3.234920,5.939155,1.118627e+12,1.207197e+12,18.790660,4.211777,14.446197,...,1.082750,21.882099,1.300734,0.261218,0.162775,-7.746774,1.068467,0.555601,53.06,56.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,RY,2022-10-31,2022,13.850992,315.768701,1.767599e+11,4.215279e+11,11.191587,3.623986,8.055780,...,5.998922,2.113105,1.388119,0.055704,0.008318,-1103.383562,3.934483,0.146154,148.03,142.45
494,RY,2021-10-31,2021,41.322912,219.884536,1.842057e+11,2.957697e+11,11.485577,3.718548,3.017589,...,4.806514,2.532113,3.803364,0.039239,0.009484,-954.570809,3.091692,0.162547,165.30,142.65
495,RY,2020-10-31,2020,95.644754,73.429243,1.318838e+11,-1.615019e+10,11.536373,2.799843,0.950042,...,-9.454813,1.023918,12.137711,0.062636,0.007107,-861.121977,1.044367,0.131912,119.05,116.79
496,RY,2019-10-31,2019,8.366445,91.165260,1.520463e+11,9.720628e+10,11.823194,3.310680,10.658695,...,-3.207018,0.736796,1.108305,0.066733,0.009101,-810.847350,1.307640,0.153970,66.81,56.15


In [194]:
final_dimensions['change'] = final_dimensions['after'] - final_dimensions['before']
final_dimensions['change'] = final_dimensions['change'].apply(lambda x : 1 if x>0 else 0)
final_dimensions

Unnamed: 0,symbol,date,calendarYear,freeCashFlowPerShare,interestDebtPerShare,marketCap,enterpriseValue,peRatio,priceToSalesRatio,pocfratio,...,interestCoverage,incomeQuality,roic,returnOnTangibleAssets,grahamNetNet,investedCapital,roe,after,before,change
0,AAPL,2022-09-24,2022,6.872426,7.585118,2.439367e+12,2.535790e+12,24.441824,6.186138,19.970097,...,40.749574,1.223921,0.586168,0.282924,-12.679296,2.369533,1.969589,149.45,170.03,0
1,AAPL,2021-09-25,2021,5.565624,7.626006,2.453751e+12,2.543530e+12,25.916254,6.707591,23.585141,...,41.190548,1.098838,0.502938,0.269742,-10.978415,1.976843,1.500713,148.64,147.54,1
2,AAPL,2020-09-26,2020,4.228014,6.645240,1.948296e+12,2.022716e+12,33.935934,7.097229,24.150233,...,23.072746,1.405201,0.319077,0.177256,-7.923629,1.720810,0.878664,115.05,125.01,0
3,AAPL,2019-09-28,2019,3.188508,6.043039,1.010474e+12,1.069677e+12,18.287144,3.883841,14.562039,...,17.877517,1.255809,0.270668,0.163230,-6.012830,1.194048,0.610645,62.26,52.25,1
4,AAPL,2018-09-29,2018,3.234920,5.939155,1.118627e+12,1.207197e+12,18.790660,4.211777,14.446197,...,21.882099,1.300734,0.261218,0.162775,-7.746774,1.068467,0.555601,53.06,56.26,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,RY,2022-10-31,2022,13.850992,315.768701,1.767599e+11,4.215279e+11,11.191587,3.623986,8.055780,...,2.113105,1.388119,0.055704,0.008318,-1103.383562,3.934483,0.146154,148.03,142.45,1
494,RY,2021-10-31,2021,41.322912,219.884536,1.842057e+11,2.957697e+11,11.485577,3.718548,3.017589,...,2.532113,3.803364,0.039239,0.009484,-954.570809,3.091692,0.162547,165.30,142.65,1
495,RY,2020-10-31,2020,95.644754,73.429243,1.318838e+11,-1.615019e+10,11.536373,2.799843,0.950042,...,1.023918,12.137711,0.062636,0.007107,-861.121977,1.044367,0.131912,119.05,116.79,1
496,RY,2019-10-31,2019,8.366445,91.165260,1.520463e+11,9.720628e+10,11.823194,3.310680,10.658695,...,0.736796,1.108305,0.066733,0.009101,-810.847350,1.307640,0.153970,66.81,56.15,1


In [198]:
final_dimensions.iloc[:,3:29]
final_dimensions['change']

Unnamed: 0,freeCashFlowPerShare,interestDebtPerShare,marketCap,enterpriseValue,peRatio,priceToSalesRatio,pocfratio,pfcfRatio,pbRatio,ptbRatio,...,debtToEquity,debtToAssets,netDebtToEBITDA,interestCoverage,incomeQuality,roic,returnOnTangibleAssets,grahamNetNet,investedCapital,roe
0,6.872426,7.585118,2.439367e+12,2.535790e+12,24.441824,6.186138,19.970097,21.888924,48.140340,48.140340,...,2.369533,0.340375,0.738641,40.749574,1.223921,0.586168,0.282924,-12.679296,2.369533,1.969589
1,5.565624,7.626006,2.453751e+12,2.543530e+12,25.916254,6.707591,23.585141,26.397759,38.892865,38.892865,...,1.976843,0.355323,0.746708,41.190548,1.098838,0.502938,0.269742,-10.978415,1.976843,1.500713
2,4.228014,6.645240,1.948296e+12,2.022716e+12,33.935934,7.097229,24.150233,26.556204,29.818270,29.818270,...,1.720810,0.347145,0.962195,23.072746,1.405201,0.319077,0.177256,-7.923629,1.720810,0.878664
3,3.188508,6.043039,1.010474e+12,1.069677e+12,18.287144,3.883841,14.562039,17.156928,11.166944,11.166944,...,1.194048,0.319178,0.774128,17.877517,1.255809,0.270668,0.163230,-6.012830,1.194048,0.610645
4,3.234920,5.939155,1.118627e+12,1.207197e+12,18.790660,4.211777,14.446197,17.445561,10.440113,10.440113,...,1.068467,0.313030,1.082750,21.882099,1.300734,0.261218,0.162775,-7.746774,1.068467,0.555601
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,13.850992,315.768701,1.767599e+11,4.215279e+11,11.191587,3.623986,8.055780,9.091654,1.635697,1.635697,...,3.934483,0.221767,5.998922,2.113105,1.388119,0.055704,0.008318,-1103.383562,3.934483,0.146154
494,41.322912,219.884536,1.842057e+11,2.957697e+11,11.485577,3.718548,3.017589,3.129663,1.866943,1.866943,...,3.091692,0.178775,4.806514,2.532113,3.803364,0.039239,0.009484,-954.570809,3.091692,0.162547
495,95.644754,73.429243,1.318838e+11,-1.615019e+10,11.536373,2.799843,0.950042,0.968381,1.521783,1.521783,...,1.044367,0.055713,-9.454813,1.023918,12.137711,0.062636,0.007107,-861.121977,1.044367,0.131912
496,8.366445,91.165260,1.520463e+11,9.720628e+10,11.823194,3.310680,10.658695,12.666301,1.820412,1.820412,...,1.307640,0.076433,-3.207018,0.736796,1.108305,0.066733,0.009101,-810.847350,1.307640,0.153970


In [199]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Assuming final_dimensions is your DataFrame

# Selecting inputs (features) and target variable
X = final_dimensions.iloc[:, 3:29]  # Features
y = final_dimensions['change']       # Target variable

# Splitting the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initializing the logistic regression model
model = LogisticRegression()

# Fitting the model to the training data
model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(classification_report_str)


Accuracy: 0.42
Confusion Matrix:
[[55  8]
 [79  8]]
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.87      0.56        63
           1       0.50      0.09      0.16        87

    accuracy                           0.42       150
   macro avg       0.46      0.48      0.36       150
weighted avg       0.46      0.42      0.32       150



'              precision    recall  f1-score   support\n\n           0       0.41      0.87      0.56        63\n           1       0.50      0.09      0.16        87\n\n    accuracy                           0.42       150\n   macro avg       0.46      0.48      0.36       150\nweighted avg       0.46      0.42      0.32       150\n'

In [215]:
df_feature_results = pd.DataFrame(columns=['features','accuracy'])



In [244]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# feature selection with RFE

df_feature_results = pd.DataFrame(columns=['features', 'accuracy', 'f1_score', 'accuracy_class_0', 'accuracy_class_1'])

for i in range(1, 26):
    X = final_dimensions.iloc[:, 3:29]  # Features
    y = final_dimensions['change']  # Target variable

    model = LogisticRegression(random_state=42, max_iter=1000)  # Increase max_iter
    rfe = RFE(model, n_features_to_select=i)
    rfe.fit(X, y)
    selected_column_names = X.columns[rfe.support_].to_numpy()

    # Logistic regression model
    X = final_dimensions.loc[:, selected_column_names]
    y = final_dimensions['change']

    # Scaling the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy_class_0 = precision_score(y_test, y_pred, pos_label=0)
    accuracy_class_1 = precision_score(y_test, y_pred, pos_label=1)

    df_feature_results = df_feature_results.append({
        'features': selected_column_names,
        'accuracy': accuracy,
        'f1_score': f1,
        'accuracy_class_0': accuracy_class_0,
        'accuracy_class_1': accuracy_class_1
    }, ignore_index=True)

# Print the resulting DataFrame
df_feature_results


Unnamed: 0,features,accuracy,f1_score,accuracy_class_0,accuracy_class_1
0,[marketCap],0.566667,0.723404,0.0,0.574324
1,"[marketCap, enterpriseValue]",0.566667,0.723404,0.0,0.574324
2,"[marketCap, enterpriseValue, evToFreeCashFlow]",0.566667,0.723404,0.0,0.574324
3,"[marketCap, enterpriseValue, pfcfRatio, evToFr...",0.566667,0.723404,0.0,0.574324
4,"[marketCap, enterpriseValue, peRatio, pfcfRati...",0.566667,0.723404,0.0,0.574324
5,"[marketCap, enterpriseValue, peRatio, pfcfRati...",0.566667,0.723404,0.0,0.574324
6,"[marketCap, enterpriseValue, peRatio, pocfrati...",0.566667,0.723404,0.0,0.574324
7,"[marketCap, enterpriseValue, peRatio, pocfrati...",0.566667,0.723404,0.0,0.574324
8,"[marketCap, enterpriseValue, peRatio, pocfrati...",0.573333,0.726496,0.333333,0.578231
9,"[interestDebtPerShare, marketCap, enterpriseVa...",0.573333,0.724138,0.4,0.57931


### These are the best combinations of predictors:


In [248]:
print(df_feature_results.loc[[16,17],'features'])


16    [interestDebtPerShare, marketCap, enterpriseVa...
17    [freeCashFlowPerShare, interestDebtPerShare, m...
Name: features, dtype: object


In [275]:
final_features_1 = df_feature_results.loc[16, 'features']
final_features_2 = df_feature_results.loc[17, 'features']

X = final_dimensions.loc[:, final_features_1]  # Features
y = final_dimensions['change']  # Target variable

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Get the coefficients and intercept
coefficients = model.coef_[0]
intercept = model.intercept_
coefficients_dict = dict(zip(final_features_1, coefficients))
print("first group")
rounded_coefficients_dict = {key: round(value, 3) for key, value in coefficients_dict.items()}
print(rounded_coefficients_dict)
print("Intercept :", np.round(intercept,3))

X = final_dimensions.loc[:, final_features_2]  # Features
y = final_dimensions['change']  # Target variable

# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Get the coefficients and intercept
coefficients = model.coef_[0]
intercept = model.intercept_
coefficients_dict = dict(zip(final_features_1, coefficients))
print("second group")
rounded_coefficients_dict = {key: round(value, 3) for key, value in coefficients_dict.items()}
print(rounded_coefficients_dict)
print("Intercept :", np.round(intercept,3))


first group
{'interestDebtPerShare': -0.247, 'marketCap': 0.102, 'enterpriseValue': 0.051, 'peRatio': 0.121, 'priceToSalesRatio': 0.317, 'pocfratio': 0.36, 'pfcfRatio': 0.244, 'pbRatio': 0.011, 'ptbRatio': 0.011, 'evToSales': -0.089, 'enterpriseValueOverEBITDA': 0.093, 'evToOperatingCashFlow': -0.273, 'evToFreeCashFlow': 0.308, 'netDebtToEBITDA': 0.029, 'interestCoverage': -0.105, 'incomeQuality': 0.101, 'grahamNetNet': -0.044}
Intercept : [0.375]
second group
{'interestDebtPerShare': -0.106, 'marketCap': -0.148, 'enterpriseValue': 0.051, 'peRatio': 0.026, 'priceToSalesRatio': 0.12, 'pocfratio': 0.325, 'pfcfRatio': 0.362, 'pbRatio': 0.241, 'ptbRatio': 0.012, 'evToSales': 0.012, 'enterpriseValueOverEBITDA': -0.096, 'evToOperatingCashFlow': 0.096, 'evToFreeCashFlow': -0.276, 'netDebtToEBITDA': 0.307, 'interestCoverage': 0.025, 'incomeQuality': -0.104, 'grahamNetNet': 0.103}
Intercept : [0.374]


In [270]:
round(coefficients_dict['interestDebtPerShare'], 3)

-0.106

In [240]:
# feature selection with RFE with scalar

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

# feature selection with RFE with scaler
# Load the dataset
X_original = final_dimensions.iloc[:, 3:29]  # Features
y = final_dimensions['change']  # Target variable
# Scaling the features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_original)
df_feature_results = pd.DataFrame(columns=['features', 'accuracy'])
for i in range(1, 26):
    model = LogisticRegression(random_state=42, max_iter=1000)  # Increase max_iter
    rfe = RFE(model, n_features_to_select=i)
    rfe.fit(X_scaled, y)
    selected_column_names = X_original.columns[rfe.support_].to_numpy()
    # Logistic regression model
    X = X_original.loc[:, selected_column_names]
    X_scaled_subset = X_scaled[:, rfe.support_]
    X_train, X_test, y_train, y_test = train_test_split(X_scaled_subset, y, test_size=0.3, random_state=42)
    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    df_feature_results = df_feature_results.append({'features': selected_column_names, 'accuracy': accuracy},
                                                   ignore_index=True)

# Print the resulting DataFrame
print(df_feature_results)


                                             features  accuracy
0                                         [evToSales]  0.580000
1                                [peRatio, evToSales]  0.580000
2                          [peRatio, evToSales, roic]  0.580000
3        [peRatio, evToSales, interestCoverage, roic]  0.580000
4   [peRatio, priceToSalesRatio, evToSales, intere...  0.580000
5   [peRatio, priceToSalesRatio, evToSales, evToFr...  0.580000
6   [peRatio, priceToSalesRatio, evToSales, evToFr...  0.580000
7   [peRatio, priceToSalesRatio, pfcfRatio, evToSa...  0.580000
8   [peRatio, priceToSalesRatio, pfcfRatio, evToSa...  0.580000
9   [peRatio, priceToSalesRatio, pfcfRatio, evToSa...  0.580000
10  [peRatio, priceToSalesRatio, pocfratio, pfcfRa...  0.580000
11  [peRatio, priceToSalesRatio, pocfratio, pfcfRa...  0.580000
12  [peRatio, priceToSalesRatio, pocfratio, pfcfRa...  0.580000
13  [peRatio, priceToSalesRatio, pocfratio, pfcfRa...  0.580000
14  [peRatio, priceToSalesRatio, pocfrat

In [241]:
#Logistic regression whith chi2 and Kbest with scaling
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler
# Load the dataset
X_original = final_dimensions.iloc[:, 3:29]  # Features
y = final_dimensions['change']  # Target variable
# Scaling the features using MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_original)
df_feature_results = pd.DataFrame(columns=['features', 'accuracy'])
for i in range(1, 26):
     selector = SelectKBest(chi2, k=i)
     X_new = selector.fit_transform(X_scaled, y)
     selected_features_mask = selector.get_support()
     selected_column_names = X_original.columns[selected_features_mask].to_numpy()
     # Logistic regression model
     X = X_original.loc[:, selected_column_names]
     X_scaled_subset = X_scaled[:, rfe.support_]
     X_train, X_test, y_train, y_test = train_test_split(X_scaled_subset, y, test_size=0.3, random_state=42)
     model = LogisticRegression(random_state=42, max_iter=1000)
     model.fit(X_train, y_train)
     y_pred = model.predict(X_test)
     accuracy = accuracy_score(y_test, y_pred)
     df_feature_results = df_feature_results.append({'features': selected_column_names, 'accuracy': accuracy},ignore_index=True)
# Print the resulting DataFrame
print(df_feature_results)



                                             features  accuracy
0                                 [priceToSalesRatio]  0.553333
1                      [priceToSalesRatio, evToSales]  0.553333
2    [priceToSalesRatio, evToSales, interestCoverage]  0.553333
3   [interestDebtPerShare, priceToSalesRatio, evTo...  0.553333
4   [interestDebtPerShare, priceToSalesRatio, evTo...  0.553333
5   [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
6   [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
7   [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
8   [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
9   [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
10  [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
11  [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
12  [interestDebtPerShare, peRatio, priceToSalesRa...  0.553333
13  [freeCashFlowPerShare, interestDebtPerShare, p...  0.553333
14  [freeCashFlowPerShare, interestDebtP