# Простейшие алгоритмы классификации

## Подготовка данных.

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score,roc_curve,auc,RocCurveDisplay, mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score
import sketch
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

from sklearn.impute import KNNImputer

In [None]:
def get_price_var_col(year):
    """Return the name of the price-variation column stored with `year`'s data.

    The column is labelled with the following year, e.g. 2014 -> "2015 PRICE VAR [%]".
    """
    next_year = year + 1
    return f"{next_year} PRICE VAR [%]"

In [None]:
data2014 = pd.read_csv(r'C:\Учеба\Диплом\2014_Financial_Data.csv')

In [None]:
data2015 = pd.read_csv(r'C:\Учеба\Диплом\2015_Financial_Data.csv')

In [None]:
data2016 = pd.read_csv(r'C:\Учеба\Диплом\2016_Financial_Data.csv')

In [None]:
data2017 = pd.read_csv(r'C:\Учеба\Диплом\2017_Financial_Data.csv')

In [None]:
data2018 = pd.read_csv(r'C:\Учеба\Диплом\2018_Financial_Data.csv')

In [None]:
data2014.isna().sum().sort_values(ascending=False)

In [None]:
data2014[data2014.columns[data2014.isna().sum()!=0]].isna().transpose().sum().sort_values()

In [None]:
# Number of rows in which every column is missing.
# (The original `data2014[]` had an empty subscript, which is a syntax error.)
data2014.isna().all(axis=1).sum()

In [None]:
(~data2018['Unnamed: 0'].isin(data2014['Unnamed: 0'])).sum()

In [None]:
(~data2014['Unnamed: 0'].isin(data2018['Unnamed: 0'])).sum()

In [None]:
data2014['Unnamed: 0'].sort_values()

In [None]:
sns.scatterplot([np.asarray(list(data2014['Revenue'].values)),np.asarray(list(data2018['Revenue'].values))])

In [None]:
list(data2015['Revenue'].values)

In [None]:
# Assign the filled column back: `fillna(inplace=True)` on a column selection
# works on a temporary object and does nothing under pandas copy-on-write.
data2014['Revenue'] = data2014['Revenue'].fillna(data2014['Revenue'].mean())
data2014.isnull().sum()

In [None]:
data2014.head()

In [None]:
# Take a real copy: plain assignment would only create a second name for the
# same DataFrame, so edits to the "copy" would also change data2014.
data2014copy = data2014.copy()

In [None]:
data2014copy.dtypes.value_counts()

In [None]:
data2014['Free Cash Flow margin']

In [None]:
object_cols=data2014copy.select_dtypes(include=['object']).columns

In [None]:
data2014[object_cols[1]].value_counts()

In [None]:
data2014copy=pd.get_dummies(data2014copy, columns=[object_cols[1]],dtype=int)

In [None]:
object_cols[0]

In [None]:
data2014copy[object_cols[0]].value_counts()

In [None]:
data2014copy.columns[-1]

In [None]:
data2014copy.drop(columns=[object_cols[0]],axis=1,inplace=True)

In [None]:
data2014copy.isnull().sum()

In [None]:
data2014copy.dtypes.value_counts()

In [None]:
float_columns=data2014copy.select_dtypes(include=['float64']).columns

In [None]:
# Fill every float column with its own median in one vectorised step.
# The result is assigned back because an inplace fillna on `frame[col]`
# operates on a temporary and is lost under pandas copy-on-write.
data2014copy[float_columns] = data2014copy[float_columns].fillna(
    data2014copy[float_columns].median()
)

In [None]:
data2014copy.isnull().sum().sum()

In [None]:
list(data2014copy.columns)

In [None]:
sns.boxplot(data2014copy['Revenue Growth'])

In [None]:
list(data2014copy['Revenue Growth'].sort_values(ascending=False).values)

In [None]:
sns.boxplot(data2014copy['Revenue Growth'])

In [None]:
data2014copy['Revenue Growth'].where(data2014copy['Revenue Growth']<40000).hist(bins=30)

### Интерквартильный интервал

In [None]:
q3=data2014copy['Revenue Growth'].quantile(0.75)

In [None]:
q1=data2014copy['Revenue Growth'].quantile(0.25)

In [None]:
irq=q3-q1

In [None]:
mn=data2014copy['Revenue Growth'].mean()

In [None]:
# Tukey fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are treated as outliers.
# This matches replace_outliers_with_median further down in this notebook.
[q1 - 1.5*irq, q3 + 1.5*irq]

### Правило трёх сигм

In [None]:
mn=data2014copy['Revenue Growth'].mean()

In [None]:
std=data2014copy['Revenue Growth'].std()

In [None]:
[mn-3*std,mn+3*std]

In [None]:
# Rows whose 'Revenue Growth' lies outside mean +/- 3 standard deviations.
# (The original `data2014copy[]` had an empty subscript, which is a syntax error.)
data2014copy[(data2014copy['Revenue Growth'] - mn).abs() > 3*std]

### Z-score

In [None]:
from scipy.stats import zscore
zz=data2014copy[['Revenue Growth']].apply(zscore)

In [None]:
zz.sort_values('Revenue Growth',ascending=False)

## Стандартизируем признаки (StandardScaler); после стандартизации значение признака совпадает с его z-score, поэтому при необходимости удаляем выбросы — строки, где оно по модулю больше 3

In [None]:
# Small demonstration of standardisation on a 0/1 vector.
# A fresh scaler is created here because `scaler` is only defined in the next cell.
StandardScaler().fit_transform(np.asarray([0,0,1,1,0]).reshape(-1,1))

In [None]:
scaler = StandardScaler()
df = data2014copy
df[float_columns] = scaler.fit_transform(data2014copy[float_columns])

In [None]:
df['Class']

#### Удаляем outliers

In [None]:
df=df[abs(df)<3].dropna()

### Разделение на обучающую (train) и тестовую (test) выборки

In [None]:
y=df['Class']
X=df.drop(columns=['Class'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

### Регрессия

In [None]:
logreg=LogisticRegression()

In [None]:
logreg.fit(X_train, Y_train)

In [None]:
predict=logreg.predict(X_test)

In [None]:
accuracy_score(Y_test,predict)

In [None]:
confusion_matrix(Y_test, predict)

In [None]:
# ROC AUC measures ranking quality, so it needs the predicted probability of
# class 1; hard 0/1 labels collapse the curve to a single threshold.
roc_auc_score(Y_test, logreg.predict_proba(X_test)[:, 1])

In [None]:
# Build the ROC curve from class-1 probabilities so that it has one point per
# distinct score instead of the single point produced by hard labels.
fpr, tpr, _ = roc_curve(Y_test, logreg.predict_proba(X_test)[:, 1])

In [None]:
roc_auc = auc(fpr,tpr)

In [None]:
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
# Fit a perceptron and evaluate it on the held-out split.
model = Perceptron()
model.fit(X_train, Y_train)
predict = model.predict(X_test)
# Perceptron has no predict_proba; the signed distance to the separating
# hyperplane is a continuous score suitable for ROC analysis.
scores = model.decision_function(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, scores))
fpr, tpr, _ = roc_curve(Y_test, scores)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
# Fit a decision tree and evaluate it on the held-out split.
# random_state makes the run repeatable (same seed as the train/test split).
model = DecisionTreeClassifier(random_state=17)
model.fit(X_train, Y_train)
predict = model.predict(X_test)
# ROC metrics are computed from class-1 probabilities, not from hard labels.
scores = model.predict_proba(X_test)[:, 1]
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, scores))
fpr, tpr, _ = roc_curve(Y_test, scores)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
# Fit a random forest and evaluate it on the held-out split.
# random_state makes the run repeatable (same seed as the train/test split).
model = RandomForestClassifier(random_state=17)
model.fit(X_train, Y_train)
predict = model.predict(X_test)
# ROC metrics are computed from class-1 probabilities, not from hard labels.
scores = model.predict_proba(X_test)[:, 1]
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, scores))
fpr, tpr, _ = roc_curve(Y_test, scores)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
# Fit gradient boosting and evaluate it on the held-out split.
# random_state makes the run repeatable (same seed as the train/test split).
model = GradientBoostingClassifier(random_state=17)
model.fit(X_train, Y_train)
predict = model.predict(X_test)
# ROC metrics are computed from class-1 probabilities, not from hard labels.
scores = model.predict_proba(X_test)[:, 1]
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, scores))
fpr, tpr, _ = roc_curve(Y_test, scores)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
def preprocess_cf(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare one yearly financial table for the `Class` classifiers.

    Steps, in order:
      1. one-hot encode ``Sector``;
      2. drop the ``Unnamed: 0`` identifier and the price-variation column;
      3. fill missing values of every float column with that column's median;
      4. standardise the float columns with ``StandardScaler``;
      5. drop every row that has a value with absolute value >= 3 in any column.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame with the remaining rows (``Class`` is kept).
    """
    prepared = pd.get_dummies(df, columns=['Sector'], dtype=int)
    prepared = prepared.drop(columns=['Unnamed: 0', price_var_col])
    float_columns = prepared.select_dtypes(include=['float64']).columns

    # Assign the result back: an inplace fillna on `frame[col]` acts on a
    # temporary object and is lost under pandas copy-on-write, which would
    # leave the NaNs in place and make the final filter drop those rows.
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )
    prepared[float_columns] = StandardScaler().fit_transform(prepared[float_columns])

    # Keep only rows whose every value is strictly inside (-3, 3).
    inside = (prepared.abs() < 3).all(axis=1)
    return prepared[inside].copy()

In [None]:
data2014_pp = preprocess_cf(data2014,f"{2014+1} PRICE VAR [%]")

In [None]:
data2014_pp

In [None]:
y=data2014_pp['Class']
X=data2014_pp.drop(columns=['Class'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

In [None]:
# Gradient boosting on the frame produced by preprocess_cf.
# random_state makes the run repeatable (same seed as the train/test split).
model = GradientBoostingClassifier(random_state=17)
model.fit(X_train, Y_train)
predict = model.predict(X_test)
# ROC metrics are computed from class-1 probabilities, not from hard labels.
scores = model.predict_proba(X_test)[:, 1]
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, scores))
fpr, tpr, _ = roc_curve(Y_test, scores)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

## Обучение модели на регрессию, другая Y колонка

In [None]:
def preprocess(df: pd.DataFrame, price_var_col) -> pd.DataFrame:
    """Build the feature matrix for the price-variation regression.

    Steps, in order:
      1. one-hot encode ``Sector``;
      2. drop ``Unnamed: 0``, ``Class`` and the price-variation column
         (the caller joins the target back by index afterwards);
      3. fill missing values of every float column with that column's median;
      4. standardise the float columns with ``StandardScaler``;
      5. drop every row that has a value with absolute value >= 3 in any column.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation (target) column to remove.

    Returns:
        A new DataFrame of features that keeps the original row index.
    """
    features = pd.get_dummies(df, columns=['Sector'], dtype=int)
    features = features.drop(columns=['Unnamed: 0', 'Class', price_var_col])
    float_columns = list(features.select_dtypes(include=['float64']).columns)

    # Assign the result back: an inplace fillna on `frame[col]` acts on a
    # temporary object and is lost under pandas copy-on-write.
    features[float_columns] = features[float_columns].fillna(
        features[float_columns].median()
    )
    features[float_columns] = StandardScaler().fit_transform(features[float_columns])

    # Keep only rows whose every value is strictly inside (-3, 3).
    inside = (features.abs() < 3).all(axis=1)
    return features[inside].copy()

In [None]:
price_var_col = f'{2014+1} PRICE VAR [%]'
data2014_pp=preprocess(data2014,price_var_col)

In [None]:
y=data2014[price_var_col]

In [None]:
data2014_ppy = data2014_pp.join(y)

### Выбросы за квантилями 1% и 99%

In [None]:
# NOTE(review): `dff` is not assigned anywhere in the visible part of this
# notebook, so this cell raises NameError when run top to bottom.
# Confirm which frame was meant.
# Per-column 1% and 99% quantiles.
lower_quantile = dff.quantile(0.01)
upper_quantile = dff.quantile(0.99)

# Values outside the per-column quantile band become NaN; dropna then removes
# every row that contains at least one such value.
filtered_df = dff[(dff >= lower_quantile) & (dff <= upper_quantile)].dropna()

In [None]:
filtered_df

In [None]:
df

### Продолжение

In [None]:
Y = data2014_ppy[price_var_col]
X = data2014_ppy.drop(columns=[price_var_col])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,random_state=17)

In [None]:
model=GradientBoostingRegressor(max_depth=6)
model.fit(X_train, Y_train)
predict=model.predict(X_test)

In [None]:
print(mean_squared_error(Y_test,predict))

In [None]:
from sklearn.metrics import r2_score

In [None]:
r2 = r2_score(Y_test,predict)

In [None]:
r2

In [None]:
model=RandomForestRegressor()
model.fit(X_train, Y_train)
predict=model.predict(X_test)

In [None]:
print(mean_squared_error(Y_test,predict))

### Проверим на 2018 году. Если получится сильно хуже, то просто объединим с добавлением колонки Year.

In [None]:
data2018 = pd.read_csv(r'C:\Учеба\Диплом\2018_Financial_Data.csv')
price_var_col = f'{2018+1} PRICE VAR [%]'
data2018_pp=preprocess(data2018,price_var_col)

In [None]:
y=data2018[price_var_col]
data2018_ppy = data2018_pp.join(y)

In [None]:
Y = data2018_ppy[price_var_col]
X = data2018_ppy.drop(columns=[price_var_col])

In [None]:
predict=model.predict(X)
print(mean_squared_error(Y,predict))

### ну получилось неприлично много

## Попытки улучшения

In [None]:
data2014[['Sector', 'Class']].groupby(['Sector']).value_counts()

In [None]:
data2014[['Sector', 'Class']].groupby(['Class']).value_counts()

In [None]:
tmp = data2014[['Sector', 'Class']]

In [None]:
# The original referenced the bound method without calling it, which only
# displays the method object. Sort by sector, then by class.
tmp.sort_values(by=['Sector', 'Class'])

In [None]:
for i in tmp['Sector'].drop_duplicates():
    print(pd.DataFrame(tmp[tmp['Sector']==i].value_counts()).sort_values(by='Class')['count'])

In [None]:
sector_dict = {}

In [None]:
# Share of Class == 1 inside every sector (assumes Class is coded 0/1, so the
# mean equals count(1) / count(all)). Unlike indexing value_counts with [1],
# this does not fail for a sector in which only one class occurs.
sector_dict = tmp.groupby('Sector')['Class'].mean().to_dict()

In [None]:
sector_dict

In [None]:
data2014

In [None]:
def preprocess_cf2(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table for classification with a target-rate ``Sector``.

    Steps, in order:
      1. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      2. drop the ``Unnamed: 0`` identifier and the price-variation column;
      3. fill missing values of every float column (including the encoded
         ``Sector``) with that column's median;
      4. standardise the float columns with ``StandardScaler``;
      5. drop every row that has a value with absolute value >= 3 in any column.

    NOTE: the sector encoding is computed from all rows of `df`, so when the
    caller splits the result into train and test afterwards, the encoding has
    already seen the test labels.

    Args:
        df: raw table; it is not modified (the previous version changed the
            caller's frame in place, which made a second call on it fail).
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame with the remaining rows (``Class`` is kept).
    """
    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()

    prepared = df.drop(columns=['Unnamed: 0', price_var_col])
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    float_columns = prepared.select_dtypes(include=['float64']).columns
    # Assign back instead of chained inplace fillna (lost under copy-on-write).
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )
    prepared[float_columns] = StandardScaler().fit_transform(prepared[float_columns])

    # Keep only rows whose every value is strictly inside (-3, 3).
    inside = (prepared.abs() < 3).all(axis=1)
    return prepared[inside].copy()

In [None]:
data2014_pp = preprocess_cf2(data2014,f"{2014+1} PRICE VAR [%]")

In [None]:
data2014_pp

In [None]:
y=data2014_pp['Class']
X=data2014_pp.drop(columns=['Class'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

In [None]:
6.5+10+8.5+5.4

In [None]:
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
a=data2014_pp.corr()['Class']

In [None]:
a.plot()

In [None]:
a[abs(a)>0.15]

In [None]:
import random

In [None]:
# Draw 49 random feature columns and always append the target: sampling 50
# columns from all of them could leave 'Class' out, and the following cells
# index tmp['Class'].
feature_cols = [c for c in data2014_pp.columns if c != 'Class']
col_list = random.sample(feature_cols, min(49, len(feature_cols))) + ['Class']

In [None]:
print('Class' in col_list)
print('Sector' in col_list)

In [None]:
tmp = data2014_pp[col_list]

In [None]:
y=tmp['Class']
X=tmp.drop(columns=['Class'])

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

In [None]:
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
y=data2014_pp['Class']
X=data2014_pp.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

In [None]:
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer

# Metrics recorded for every candidate on every fold.
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),'recall':make_scorer(recall_score)}

# Candidate learning rates for the boosting model.
parameters = {
    "learning_rate": [0.075, 0.1, 0.125, 0.15, 0.175, 0.2]
    }

# refit=False: with several metrics there is no single "best" model to refit;
# only the per-fold scores in cv_results_ are inspected below.
clf = GridSearchCV(GradientBoostingClassifier(), parameters,scoring=scoring,refit=False, n_jobs=-1, verbose=10)

clf.fit(X_train, Y_train)
df=pd.DataFrame.from_dict(clf.cv_results_)
# `cv` is left at its default (5-fold in scikit-learn >= 0.22), so the results
# contain split0..split4. Show the test scores of every fold rather than a
# hard-coded pair of splits.
df.filter(regex=r'^split\d+_test_')

In [None]:
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Удаление данных, связанных с Growth

In [None]:
not_growth_columns=list(filter(lambda x: "growth" not in x.lower(),list(data2014.columns)))

In [None]:
data2014 = pd.read_csv(r'C:\Учеба\Диплом\2014_Financial_Data.csv')

In [None]:
tmp = data2014[not_growth_columns]

In [None]:
data2014_pp2 = preprocess_cf2(tmp,f"{2014+1} PRICE VAR [%]")

In [None]:
y=data2014_pp2['Class']
X=data2014_pp2.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

In [None]:
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Другие скалеры

In [None]:
def preprocess_cf2_no_growth_minmax(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table without growth columns, scaled to [0, 1].

    Steps, in order:
      1. drop every column whose name contains "growth" (case-insensitive);
      2. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      3. drop the ``Unnamed: 0`` identifier and the price-variation column;
      4. fill missing values of every float column (including the encoded
         ``Sector``) with that column's median;
      5. rescale the float columns with ``MinMaxScaler``.

    No rows are removed.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame with the same rows as `df` (``Class`` is kept).
    """
    kept_columns = [name for name in df.columns if "growth" not in name.lower()]
    prepared = df[kept_columns].drop(columns=['Unnamed: 0', price_var_col])

    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    float_columns = prepared.select_dtypes(include=['float64']).columns
    # Assign back instead of chained inplace fillna (lost under copy-on-write).
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )
    prepared[float_columns] = MinMaxScaler().fit_transform(prepared[float_columns])
    return prepared

In [None]:
def get_price_var_col(year):
    """Name of the price-variation column for `year` (it is labelled year + 1)."""
    label_year = year + 1
    return f"{label_year} PRICE VAR [%]"

In [None]:
data2014_pp_mm = preprocess_cf2_no_growth_minmax(data2014, get_price_var_col(2014))

In [None]:
y=data2014_pp_mm['Class']
X=data2014_pp_mm.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Robust

In [None]:
def preprocess_cf2_no_growth_robust(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table without growth columns, robust-scaled on 1%-99%.

    Steps, in order:
      1. drop every column whose name contains "growth" (case-insensitive);
      2. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      3. drop the ``Unnamed: 0`` identifier and the price-variation column;
      4. fill missing values of the float columns (``Sector`` excluded) with
         the column median;
      5. centre and scale those columns with ``RobustScaler`` using the
         1st-99th percentile range.

    No rows are removed; outlier filtering is left to the caller.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame with the same rows as `df` (``Class`` is kept).
    """
    kept_columns = [name for name in df.columns if "growth" not in name.lower()]
    prepared = df[kept_columns].drop(columns=['Unnamed: 0', price_var_col])

    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    # The encoded Sector is already a rate in [0, 1]; it is neither imputed nor scaled.
    float_columns = [
        name
        for name in prepared.select_dtypes(include=['float64']).columns
        if name != 'Sector'
    ]
    # Assign back instead of chained inplace fillna (lost under copy-on-write).
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )

    # quantile_range is expressed in percent (0-100). The previous
    # (0.01, 0.99) selected the 0.01th-0.99th percentiles, i.e. a sliver of
    # the lower tail, instead of the intended 1%-99% band.
    scaler = RobustScaler(quantile_range=(1.0, 99.0))
    prepared[float_columns] = scaler.fit_transform(prepared[float_columns])
    return prepared

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_robust(data2014, get_price_var_col(2014))

In [None]:
float_columns = list(data2014_pp_rb.select_dtypes(include=['float64']).columns)
float_columns.remove('Sector')

In [None]:
lower = data2014_pp_rb[float_columns].quantile(0.01)
greater = data2014_pp_rb[float_columns].quantile(0.99)

In [None]:
data2014_pp_rb[float_columns] = (data2014_pp_rb[float_columns])[(data2014_pp_rb[float_columns] >= lower) & (data2014_pp_rb[float_columns] <= greater)].dropna()

In [None]:
y=data2014_pp_rb.dropna()['Class']
X=data2014_pp_rb.dropna().drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
def preprocess_cf2_no_growth_rb(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table without growth columns, robust-scaled, outliers blanked.

    Steps, in order:
      1. drop every column whose name contains "growth" (case-insensitive);
      2. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      3. drop the ``Unnamed: 0`` identifier and the price-variation column;
      4. fill missing values of the float columns (``Sector`` excluded) with
         the column median;
      5. scale those columns with a default ``RobustScaler``;
      6. for every row with at least one scaled value outside the per-column
         [1%, 99%] quantile band, set all float columns of that row to NaN.

    Rows are not removed here; callers are expected to call ``dropna()``.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame with the same rows as `df` (``Class`` is kept).
    """
    kept_columns = [name for name in df.columns if "growth" not in name.lower()]
    prepared = df[kept_columns].drop(columns=['Unnamed: 0', price_var_col])

    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    # The encoded Sector is neither imputed nor scaled.
    float_columns = [
        name
        for name in prepared.select_dtypes(include=['float64']).columns
        if name != 'Sector'
    ]
    # Assign back instead of chained inplace fillna (lost under copy-on-write).
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )
    prepared[float_columns] = RobustScaler().fit_transform(prepared[float_columns])

    lower = prepared[float_columns].quantile(0.01)
    upper = prepared[float_columns].quantile(0.99)
    in_band = (prepared[float_columns] >= lower) & (prepared[float_columns] <= upper)
    # Same outcome as the former "mask, dropna, assign back" sequence: a row
    # with any out-of-band value ends up with NaN in every float column.
    prepared.loc[~in_band.all(axis=1), float_columns] = np.nan

    return prepared

In [None]:
data2014 = pd.read_csv(r'C:\Учеба\Диплом\2014_Financial_Data.csv')

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb(data2014, get_price_var_col(2014))
y=data2014_pp_rb.dropna()['Class']
X=data2014_pp_rb.dropna().drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Логарифм

In [None]:
# NOTE(review): preprocess_cf2_no_growth_rb_log is defined in a later cell of
# this notebook, so that cell has to be executed before this one.
data2014_pp_rb_log = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))

In [None]:
data2014_pp_rb_log.iloc[:,:10]

In [None]:
data2014_pp_rb_log.iloc[:,:10].hist()
plt.show()

In [None]:
data2014_pp_rb_log_positive = data2014_pp_rb_log-data2014_pp_rb_log.min()

In [None]:
data2014_pp_rb_log_positive_loged = np.log1p(data2014_pp_rb_log_positive)

In [None]:
data2014_pp_rb_log_positive_loged.iloc[:, :60].hist(figsize=(20,20))
plt.show()

In [None]:
def preprocess_cf2_no_growth_rb_log(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table: no growth columns, log transform, robust scaling.

    Steps, in order:
      1. drop every column whose name contains "growth" (case-insensitive);
      2. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      3. drop the ``Unnamed: 0`` identifier and the price-variation column;
      4. fill missing values of the float columns (``Sector`` excluded) with
         the column median;
      5. shift every feature column so its minimum is 0 and apply ``log1p``;
      6. scale the float columns with a default ``RobustScaler``;
      7. blank values outside the per-column [1%, 99%] quantile band and drop
         every row that contains a NaN.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame without NaNs (``Class`` is kept, untransformed).
    """
    kept_columns = [name for name in df.columns if "growth" not in name.lower()]
    prepared = df[kept_columns].drop(columns=['Unnamed: 0', price_var_col])

    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    # The encoded Sector is neither imputed nor scaled (it is still log-transformed below).
    float_columns = [
        name
        for name in prepared.select_dtypes(include=['float64']).columns
        if name != 'Sector'
    ]
    # Assign back instead of chained inplace fillna (lost under copy-on-write).
    prepared[float_columns] = prepared[float_columns].fillna(
        prepared[float_columns].median()
    )

    # Shift each feature to start at 0 so that log1p is defined everywhere.
    features = prepared.drop(columns=['Class'])
    logged = np.log1p(features - features.min())
    logged['Class'] = prepared['Class']

    logged[float_columns] = RobustScaler().fit_transform(logged[float_columns])

    lower = logged[float_columns].quantile(0.01)
    upper = logged[float_columns].quantile(0.99)
    in_band = (logged[float_columns] >= lower) & (logged[float_columns] <= upper)
    logged[float_columns] = logged[float_columns].where(in_band)

    return logged.dropna()

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
data2014_pp_rb

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
y=data2014_pp_rb.dropna()['Class']
X=data2014_pp_rb.dropna().drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
data2014_pp_rb.columns[:10]

In [None]:
sns.pairplot(data2014_pp_rb[list(data2014_pp_rb.columns[:10]) + ['Class']], hue='Class');

### LightGBM

In [None]:
import lightgbm as lgb
import re

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))

# Strip every character outside [A-Za-z0-9_] from the column names
# (presumably because LightGBM refuses feature names with special JSON characters).
data2014_pp_rb = data2014_pp_rb.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
clean = data2014_pp_rb.dropna()
y = clean['Class']
X = clean.drop(columns=['Class'])

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

train_data = lgb.Dataset(X_train, label=Y_train)

# Hyperparameters for the LightGBM model.
# The number of trees is passed once, through num_boost_round below. The
# previous version also put 'n_estimators': 40 into this dict, which LightGBM
# uses in preference to num_boost_round (with a warning), so 40 is kept.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 100,
    'learning_rate': 0.08,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0,
}

# Train the LightGBM classifier
model = lgb.train(params, train_data, num_boost_round=40)

# For a binary objective, predict() returns the probability of class 1.
y_pred = model.predict(X_test)
predict = (y_pred >= 0.5).astype(int)

# Evaluate the model: thresholded labels for accuracy and the confusion
# matrix, raw probabilities for the ROC metrics.
accuracy = accuracy_score(Y_test, predict)
print('Accuracy:', accuracy)

print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, y_pred))
fpr, tpr, _ = roc_curve(Y_test, y_pred)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
# Fixed (not searched) settings of the base estimator. Only keys that are
# actually passed to LGBMClassifier are kept here.
params = {
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # small number of bins may reduce training accuracy but can deal with over-fitting
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 1, # subsample ratio of the training instance (overridden by the grid)
    'min_split_gain': 0.5, # minimum loss reduction required to make a further partition on a leaf
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 5, # minimum number of data needed in a leaf
}

# Initiate classifier to use.
# verbose=-1 replaces the removed `silent` argument.
# subsample_freq=1 enables bagging: LightGBM ignores `subsample` while the
# bagging frequency is 0, so the `subsample` values in the grid would
# otherwise have no effect.
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 5,
          verbose = -1,
          subsample_freq = 1,
          **params)

gridParams = {
    'learning_rate': [0.005, 0.01],
    'n_estimators': [8,16,24],
    'num_leaves': [6,8,12,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    'boosting_type' : ['gbdt', 'dart'], # for better accuracy -> try dart
    'objective' : ['binary'],
    'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    'random_state' : [500],
    'colsample_bytree' : [0.64, 0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

grid = GridSearchCV(mdl, gridParams, verbose=1, cv=4, n_jobs=-1)
# Run the grid
grid.fit(X_train, Y_train)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))

# Strip every character outside [A-Za-z0-9_] from the column names
# (presumably because LightGBM refuses feature names with special JSON characters).
data2014_pp_rb = data2014_pp_rb.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
clean = data2014_pp_rb.dropna()
y = clean['Class']
X = clean.drop(columns=['Class'])

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)

train_data = lgb.Dataset(X_train, label=Y_train)

# lgb.train expects a dict of parameters, not the GridSearchCV object itself.
# The tree count found by the search is handed over via num_boost_round.
best_params = dict(grid.best_params_)
num_rounds = best_params.pop('n_estimators')

# Train the LightGBM classifier with the best parameters from the grid search
model = lgb.train(best_params, train_data, num_boost_round=num_rounds)

# For a binary objective, predict() returns the probability of class 1.
y_pred = model.predict(X_test)
predict = (y_pred >= 0.5).astype(int)

# Evaluate the model: thresholded labels for accuracy and the confusion
# matrix, raw probabilities for the ROC metrics.
accuracy = accuracy_score(Y_test, predict)
print('Accuracy:', accuracy)

print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test, y_pred))
fpr, tpr, _ = roc_curve(Y_test, y_pred)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

### Как ещё улучшить данные?

In [None]:
def replace_outliers_with_median(column):
    """Return `column` with Tukey outliers replaced by the column median.

    A value is an outlier when it lies below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR. The median is taken over the original values, outliers
    included. NaNs are never flagged and stay NaN.

    Args:
        column: numeric pandas Series; it is not modified.

    Returns:
        A new Series with the same index as `column`.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = (column < lower_bound) | (column > upper_bound)
    # mask() builds a new Series, so the caller's data is left untouched
    # (the previous in-place item assignment overwrote the input).
    return column.mask(outliers, column.median())

def preprocess_cf2_no_growth_rb_log_replace(df: pd.DataFrame,price_var_col) -> pd.DataFrame:
    """Prepare a yearly table: KNN imputation, log transform, outliers -> median.

    Steps, in order:
      1. drop every column whose name contains "growth" (case-insensitive);
      2. replace each ``Sector`` label with the share of ``Class == 1`` rows in
         that sector (assumes ``Class`` is coded 0/1);
      3. drop the ``Unnamed: 0`` identifier and the price-variation column;
      4. impute all missing values with ``KNNImputer`` (20 neighbours,
         distance-weighted); ``Class`` goes through the imputer together with
         the features and comes back as float values;
      5. shift every feature column so its minimum is 0 and apply ``log1p``;
      6. scale the float columns (``Sector`` excluded) with ``RobustScaler``;
      7. replace Tukey outliers in those columns with the column median via
         ``replace_outliers_with_median``.

    Args:
        df: raw table; it is not modified.
        price_var_col: name of the price-variation column to remove.

    Returns:
        A new DataFrame without NaNs that keeps the row index of `df`.
    """
    kept_columns = [name for name in df.columns if "growth" not in name.lower()]
    prepared = df[kept_columns].drop(columns=['Unnamed: 0', price_var_col])

    # Mean of a 0/1 column == fraction of ones; works even when a sector
    # contains a single class, where value_counts()[1] would raise IndexError.
    sector_rate = df.groupby('Sector')['Class'].mean()
    prepared['Sector'] = prepared['Sector'].map(sector_rate)

    float_columns = [
        name
        for name in prepared.select_dtypes(include=['float64']).columns
        if name != 'Sector'
    ]

    imputer = KNNImputer(n_neighbors=20, weights='distance', metric='nan_euclidean', copy=True)
    # Rebuild the frame with the original labels AND index: the imputer
    # returns a bare array, and dropping the index would break any later
    # alignment with the source rows.
    imputed = pd.DataFrame(
        imputer.fit_transform(prepared),
        columns=prepared.columns,
        index=prepared.index,
    )

    # Shift each feature to start at 0 so that log1p is defined everywhere.
    features = imputed.drop(columns=['Class'])
    logged = np.log1p(features - features.min())
    logged['Class'] = imputed['Class']

    logged[float_columns] = RobustScaler().fit_transform(logged[float_columns])
    logged[float_columns] = logged[float_columns].apply(replace_outliers_with_median)

    return logged.dropna()

In [None]:
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
y=data2014_pp_rb.dropna()['Class']
X=data2014_pp_rb.dropna().drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3,random_state=17)
model=GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict=model.predict(X_test)
print(confusion_matrix(Y_test, predict))
print(roc_auc_score(Y_test,predict))
fpr, tpr, _ = roc_curve(Y_test,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

In [None]:
data2018_pp_rb = preprocess_cf2_no_growth_rb_log(data2018, get_price_var_col(2018))
y=data2018_pp_rb.dropna()['Class']
X=data2018_pp_rb.dropna().drop(columns=['Class'])
predict=model.predict(X)
print(confusion_matrix(y, predict))
print(roc_auc_score(y,predict))
fpr, tpr, _ = roc_curve(y,predict)
roc_auc = auc(fpr,tpr)
RocCurveDisplay(fpr=fpr,tpr=tpr,roc_auc=roc_auc).plot()

## PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Rebuild the preprocessed 2014 frame and separate the target from the features.
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
X = data2014_pp_rb.dropna()
y = X.pop('Class')


In [None]:
# Fit a PCA that keeps one component per input feature.
pca = PCA(n_components=X.shape[1])
pca.fit(X)

# Per-component and running share of explained variance.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

# Running total against the number of retained components.
plt.plot(np.arange(1, cumulative_variance.size + 1), cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Independent copy of the raw 2014 frame, so later edits to it do not touch data2014.
data2014copy=data2014.copy()

In [None]:
# Drop 'Unnamed: 0', the 2015 price-variation column and 'Class' from the copy, in place.
data2014copy.drop(columns=['Unnamed: 0', '2015 PRICE VAR [%]', 'Class'], inplace=True)

In [None]:
# Preprocess 2014 again; keep the target in `y` and the feature matrix in `df_pca`.
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
df_pca = data2014_pp_rb.drop(columns='Class')
y = data2014_pp_rb.loc[:, 'Class']

In [None]:
# Reduce the feature matrix to its 20 leading principal components.
pca = PCA(n_components=20)
pca_result = pca.fit_transform(df_pca)

# Wrap the projected data in a frame with columns PC1 ... PC20.
df_pca_result = pd.DataFrame(
    data=pca_result,
    columns=[f'PC{k}' for k in range(1, 21)],
)

# Concatenate the PCA results with the original dataframe
#df_final = pd.concat([data2014_pp_rb, df_pca_result], axis=1)

print("Explained Variance Ratio:", pca.explained_variance_ratio_)

print("Principal Components' Loadings:")
print(pca.components_)

# Features whose absolute loading on the first component (PC1) reaches the threshold.
threshold = 0.5
high_loading_columns = [
    name
    for name, weight in zip(df_pca.columns, pca.components_[0])
    if abs(weight) >= threshold
]

print("Columns with high loadings:")
print(high_loading_columns)

In [None]:
# Running share of variance captured by the components of the fitted PCA.
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

plt.plot(np.arange(1, cumulative_variance.size + 1), cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Total share of variance retained by the fitted components.
pca.explained_variance_ratio_.sum()

In [None]:
# Gradient boosting on the 20 principal components.
# NOTE(review): the PCA above was fitted on all rows before this split, so the
# test rows influenced the projection; fit PCA on X_train only for a clean estimate.
X_train, X_test, Y_train, Y_test = train_test_split(df_pca_result, y, test_size=0.3, random_state=17)
model = GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict = model.predict(X_test)
print(confusion_matrix(Y_test, predict))
# Use class-1 probabilities for ROC/AUC; hard labels yield a single-point curve.
proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, proba))
fpr, tpr, _ = roc_curve(Y_test, proba)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

## Убираем потенциально ненужные колонки

In [None]:
# Show the second row of the 2014 frame as a plain list of values.
list(data2014.iloc[1])

In [None]:
# Column names that contain the "(per Share)" marker.
[col for col in data2014.columns if "(per Share)" in col]

In [None]:
def preprocess_cf2_no_growth_rb_log_new(df: pd.DataFrame, price_var_col) -> pd.DataFrame:
    """Preprocess one yearly frame while keeping every column (growth ones included).

    Steps: encode ``Sector`` as the mean of ``Class`` within the sector, drop
    ``'Unnamed: 0'`` and ``price_var_col``, fill missing float values with the
    column median, shift every feature to start at zero and apply ``log1p``,
    robust-scale the float columns, blank out values outside the 1st-99th
    percentile and finally drop every row that still contains a NaN.

    Args:
        df: Raw frame with at least ``'Unnamed: 0'``, ``'Sector'``, ``'Class'``
            and ``price_var_col`` columns. It is not modified.
        price_var_col: Name of the price-variation column to remove.

    Returns:
        A new frame with the transformed features and the ``Class`` column last.
    """
    # Work on a copy: the previous version dropped columns from the caller's
    # frame in place, so a second call on the same frame raised KeyError.
    df_copy = df.copy()

    # Mean of Class per sector (share of class 1 for a 0/1 target). groupby also
    # copes with sectors that contain a single class, where indexing the
    # value counts with [1] raised IndexError.
    # NOTE(review): the encoding uses the target of the same rows it is applied to.
    sector_rate = df_copy.groupby('Sector')['Class'].mean()
    df_copy['Sector'] = df_copy['Sector'].map(sector_rate)

    df_copy = df_copy.drop(columns=['Unnamed: 0', price_var_col])

    float_columns = [
        col
        for col in df_copy.select_dtypes(include=['float64']).columns
        if col != 'Sector'
    ]
    # Assign the filled block back instead of chained fillna(inplace=True),
    # which does not reach the parent frame under pandas Copy-on-Write.
    df_copy[float_columns] = df_copy[float_columns].fillna(df_copy[float_columns].median())

    # Shift each feature so its minimum is 0, which keeps log1p well defined.
    features = df_copy.drop(columns=['Class'])
    df2 = np.log1p(features - features.min())
    df2['Class'] = df_copy['Class']

    df2[float_columns] = RobustScaler().fit_transform(df2[float_columns])

    # Values outside the 1st-99th percentile become NaN and their rows are dropped.
    scaled = df2[float_columns]
    lower = scaled.quantile(0.01)
    greater = scaled.quantile(0.99)
    df2[float_columns] = scaled.where((scaled >= lower) & (scaled <= greater))

    return df2.dropna()

In [None]:
# NOTE(review): this cell still calls preprocess_cf2_no_growth_rb_log, not the
# *_new variant defined just above - confirm which one was intended.
data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
y = data2014_pp_rb['Class']
X = data2014_pp_rb.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=17)
model = GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict = model.predict(X_test)
print(confusion_matrix(Y_test, predict))
# Use class-1 probabilities for ROC/AUC; hard labels yield a single-point curve.
proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, proba))
fpr, tpr, _ = roc_curve(Y_test, proba)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
# Re-read the raw 2014 data from disk, replacing the in-memory frame.
data2014 = pd.read_csv(r'C:\Учеба\Диплом\2014_Financial_Data.csv')

## Попробуем GridSearch ещё раз

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import make_scorer
# Metrics evaluated for every candidate.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
}

# Candidate learning rates: 0.075, 0.08, ..., 0.2 (26 values, step 0.005).
parameters = {
    "learning_rate": np.round(np.linspace(0.075, 0.2, 26), 3).tolist(),
}

# refit=False: with several metrics there is no single "best" model to refit.
clf = GridSearchCV(GradientBoostingClassifier(), parameters, scoring=scoring, refit=False, n_jobs=-1, verbose=10)

clf.fit(X_train, Y_train)

# One row per candidate.
df = pd.DataFrame(clf.cv_results_)
# No `cv` is passed, so GridSearchCV uses its default of 5 folds. Report the
# fold-averaged scores rather than only the first two splits.
df[['param_learning_rate',
    'mean_test_accuracy', 'std_test_accuracy',
    'mean_test_precision', 'std_test_precision',
    'mean_test_recall', 'std_test_recall']]

In [None]:
# Show the column labels of the PCA result frame.
df_pca_result.columns

In [None]:
# Every component column except the first one.
columns_to_extract = df_pca_result.columns[1:]

# Frame restricted to those columns.
df_subset = df_pca_result.loc[:, columns_to_extract]

In [None]:
# Send the first 17 columns of the PCA result to sketch with a 'describe dataset' prompt.
df_pca_result.iloc[:,:17].sketch.ask('describe dataset')

## Объединяем данные

In [None]:
from pathlib import Path

# Single place to change when the data lives elsewhere.
# NOTE(review): earlier cells read the same files from a different directory.
DATA_DIR = Path(r'D:\Studies\4 курс\diploma\margo')

# One raw frame per year, 2014 through 2018.
data2014, data2015, data2016, data2017, data2018 = (
    pd.read_csv(DATA_DIR / f'{year}_Financial_Data.csv') for year in range(2014, 2019)
)

In [None]:
# Tag each yearly frame with the year of its source file.
for _year, _frame in (
    (2014, data2014),
    (2015, data2015),
    (2016, data2016),
    (2017, data2017),
    (2018, data2018),
):
    _frame['Year'] = _year

In [None]:
def preprocess_cf2_no_growth_rb_log_year(df: pd.DataFrame, year) -> pd.DataFrame:
    """Clean one yearly frame for pooling with other years (no log, no scaling).

    Steps: drop every column whose name contains "growth" (case-insensitive),
    add a ``Year`` column, encode ``Sector`` as the mean of ``Class`` within
    the sector, drop ``'Unnamed: 0'`` and the ``"<year+1> PRICE VAR [%]"``
    column, fill missing float values with the column median, blank out values
    outside the 1st-99th percentile and drop every row that still has a NaN.

    Args:
        df: Raw frame for a single year. It is not modified.
        year: Year of the data; also selects the price-variation column.

    Returns:
        A new frame with the cleaned features, ``Class`` and ``Year``.

    Raises:
        KeyError: If ``'Unnamed: 0'`` or the price-variation column is missing.
    """
    kept = [col for col in df.columns if "growth" not in col.lower()]
    # Explicit copy: everything below writes to this frame, never to the input.
    df = df[kept].copy()
    df['Year'] = year

    # Mean of Class per sector (share of class 1 for a 0/1 target). groupby also
    # copes with sectors that contain a single class, where indexing the
    # value counts with [1] raised IndexError.
    # NOTE(review): the encoding uses the target of the same rows it is applied to.
    sector_rate = df.groupby('Sector')['Class'].mean()
    df['Sector'] = df['Sector'].map(sector_rate)

    df = df.drop(columns=['Unnamed: 0', f"{year+1} PRICE VAR [%]"])

    float_columns = [
        col
        for col in df.select_dtypes(include=['float64']).columns
        if col != 'Sector'
    ]
    if float_columns:
        values = df[float_columns]
        # Assign the filled block back instead of chained fillna(inplace=True),
        # which does not reach the parent frame under pandas Copy-on-Write.
        values = values.fillna(values.median())

        # Values outside the 1st-99th percentile become NaN; dropna() below
        # then removes the whole row.
        lower = values.quantile(0.01)
        greater = values.quantile(0.99)
        df[float_columns] = values.where((values >= lower) & (values <= greater))

    return df.dropna()

In [None]:
from pathlib import Path

# Single place to change when the data lives elsewhere.
DATA_DIR = Path(r'D:\Studies\4 курс\diploma\margo')

# Reload the raw yearly frames, 2014 through 2018.
data2014, data2015, data2016, data2017, data2018 = (
    pd.read_csv(DATA_DIR / f'{year}_Financial_Data.csv') for year in range(2014, 2019)
)

# Clean each year with its own year-specific price-variation column.
data2014_clean, data2015_clean, data2016_clean, data2017_clean, data2018_clean = (
    preprocess_cf2_no_growth_rb_log_year(frame, year)
    for year, frame in zip(
        range(2014, 2019),
        (data2014, data2015, data2016, data2017, data2018),
    )
)

In [None]:
# Training pool: cleaned 2014-2016 rows stacked vertically.
# Each frame keeps its own row labels, so index values can repeat across years.
train_data = pd.concat([data2014_clean, data2015_clean, data2016_clean])

In [None]:
# Separate features and target, then learn the robust scaling on the training pool only.
# NOTE(review): 'Year' stays among the features - confirm that is intended.
X_train = train_data.drop(columns='Class')
Y_train = train_data.loc[:, 'Class']
scaler = RobustScaler().fit(X_train)
X_train_sc = scaler.transform(X_train)

In [None]:
# Gradient boosting with default hyper-parameters on the scaled training pool.
model = GradientBoostingClassifier().fit(X_train_sc, Y_train)

In [None]:
# Test pool: cleaned 2017-2018 rows stacked vertically (row labels are not reset).
test_data = pd.concat([data2017_clean, data2018_clean])

In [None]:
# Split the test pool and reuse the scaler fitted on the training pool.
X_test = test_data.drop(columns='Class')
Y_test = test_data.loc[:, 'Class']
X_test_sc = scaler.transform(X_test)

In [None]:
# Evaluate the pooled-years model on the scaled test pool.
predict = model.predict(X_test_sc)
print(confusion_matrix(Y_test, predict))
# Use class-1 probabilities for ROC/AUC; hard labels yield a single-point curve.
proba = model.predict_proba(X_test_sc)[:, 1]
print(roc_auc_score(Y_test, proba))
fpr, tpr, _ = roc_curve(Y_test, proba)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

## TSNE

In [None]:
n = 5

# The default 'barnes_hut' method only supports fewer than 4 output dimensions,
# so a 5-dimensional embedding needs the (slower) exact method.
tsne = TSNE(n_components=n, method='exact', random_state=42)

data2014_pp_rb = preprocess_cf2_no_growth_rb_log(data2014, get_price_var_col(2014))
y = data2014_pp_rb['Class']
X = data2014_pp_rb.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=17)

# t-SNE has no transform(): two separate fit_transform calls produce unrelated
# coordinate systems, so a model trained on one cannot be applied to the other.
# Embed train and test rows together (labels are not used) and split afterwards.
tsne_result_all = tsne.fit_transform(pd.concat([X_train, X_test]))
tsne_result = tsne_result_all[:len(X_train)]
tsne_result_test = tsne_result_all[len(X_train):]

# Frames with the embedding coordinates, columns tsne_0 ... tsne_{n-1}.
tsne_columns = ['tsne_' + str(i) for i in range(n)]
tsne_df = pd.DataFrame(tsne_result, columns=tsne_columns)
tsne_df_test = pd.DataFrame(tsne_result_test, columns=tsne_columns)

In [None]:
# Gradient boosting on the t-SNE coordinates.
model = GradientBoostingClassifier()
model.fit(tsne_df, Y_train)
predict = model.predict(tsne_df_test)
print(confusion_matrix(Y_test, predict))
# AUC from class-1 probabilities rather than from hard labels.
print(roc_auc_score(Y_test, model.predict_proba(tsne_df_test)[:, 1]))

### SVD

In [None]:
# Show the column labels of the 2014 frame.
data2014.columns

In [None]:
def preprocess_cf2_no_growth_rb_log(df: pd.DataFrame, price_var_col) -> pd.DataFrame:
    """Preprocess one yearly frame without growth columns and without imputation.

    Steps: drop every column whose name contains "growth" (case-insensitive),
    encode ``Sector`` as the mean of ``Class`` within the sector, drop
    ``'Unnamed: 0'`` and ``price_var_col``, shift every feature to start at
    zero and apply ``log1p``, robust-scale the float columns, blank out values
    outside the 1st-99th percentile and drop every row that contains a NaN.
    Missing values are not filled, so rows with any gap are removed at the end.

    Args:
        df: Raw frame with at least ``'Unnamed: 0'``, ``'Sector'``, ``'Class'``
            and ``price_var_col`` columns. It is not modified.
        price_var_col: Name of the price-variation column to remove.

    Returns:
        A new frame with the transformed features and the ``Class`` column last.
    """
    kept = [col for col in df.columns if "growth" not in col.lower()]
    # Explicit copy: everything below writes to this frame, never to the input.
    df_copy = df[kept].copy()

    # Mean of Class per sector (share of class 1 for a 0/1 target). groupby also
    # copes with sectors that contain a single class, where indexing the
    # value counts with [1] raised IndexError.
    # NOTE(review): the encoding uses the target of the same rows it is applied to.
    sector_rate = df_copy.groupby('Sector')['Class'].mean()
    df_copy['Sector'] = df_copy['Sector'].map(sector_rate)

    df_copy = df_copy.drop(columns=['Unnamed: 0', price_var_col])

    float_columns = [
        col
        for col in df_copy.select_dtypes(include=['float64']).columns
        if col != 'Sector'
    ]

    # Shift each feature so its minimum is 0, which keeps log1p well defined.
    features = df_copy.drop(columns=['Class'])
    df2 = np.log1p(features - features.min())
    df2['Class'] = df_copy['Class']

    df2[float_columns] = RobustScaler().fit_transform(df2[float_columns])

    # Values outside the 1st-99th percentile become NaN and their rows are dropped.
    scaled = df2[float_columns]
    lower = scaled.quantile(0.01)
    greater = scaled.quantile(0.99)
    df2[float_columns] = scaled.where((scaled >= lower) & (scaled <= greater))

    return df2.dropna()

In [None]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD keeping the 4 leading components.
svd = TruncatedSVD(n_components=4, random_state=42)

# Numeric feature matrix: identifier, sector, price variation and target removed.
data2014_2 = data2014.drop(columns=['Unnamed: 0','Sector', '2015 PRICE VAR [%]', 'Class'])

# svd = TruncatedSVD(n_components=len(list(data2014_2.columns)), random_state=42)

# Fill gaps with the column medians. Assigning the result back replaces the
# chained fillna(inplace=True), which is a no-op under pandas Copy-on-Write.
data2014_2 = data2014_2.fillna(data2014_2.median())

# NOTE(review): the columns are not scaled here, so large-magnitude columns
# presumably dominate the components - confirm this is intended.
svd_result = svd.fit_transform(data2014_2)

# Share of variance explained by each retained component.
print(list(svd.explained_variance_ratio_))

### Оставить только Growth

In [None]:
# Names of the columns that mention "growth" (case-insensitive).
g_cols = [col for col in data2014.columns if "growth" in col.lower()]

In [None]:
def preprocess_cf2_with_growth_rb_log(df: pd.DataFrame, price_var_col) -> pd.DataFrame:
    """Preprocess one yearly frame using only ratio/growth columns plus Sector.

    Steps: keep the columns whose name contains "ratio" or "growth"
    (case-insensitive) together with ``Sector`` and ``Class``, encode ``Sector``
    as the mean of ``Class`` within the sector, fill missing feature values with
    the column median, shift every feature to start at zero and apply ``log1p``,
    robust-scale the features, blank out values outside the 1st-99th percentile
    and drop every row that still contains a NaN.

    The target is kept out of imputation, scaling and outlier filtering, so
    ``Class`` is returned with its original values.

    Args:
        df: Raw frame with at least ``'Sector'`` and ``'Class'`` columns.
            It is not modified.
        price_var_col: Unused; kept so the signature matches the other
            preprocessing functions.

    Returns:
        A new frame (input index preserved) with the transformed features and
        the ``Class`` column last.
    """
    wanted = [
        col
        for col in df.columns
        if "ratio" in col.lower() or "growth" in col.lower() or col in ("Sector", "Class")
    ]
    # Explicit copy: everything below writes to this frame, never to the input.
    df_copy = df[wanted].copy()

    # Mean of Class per sector (share of class 1 for a 0/1 target). groupby also
    # copes with sectors that contain a single class, where indexing the
    # value counts with [1] raised IndexError.
    # NOTE(review): the encoding uses the target of the same rows it is applied to.
    sector_rate = df_copy.groupby('Sector')['Class'].mean()
    df_copy['Sector'] = df_copy['Sector'].map(sector_rate)

    # Split the label off first: previously Class went through fillna,
    # RobustScaler and the quantile filter together with the features.
    labels = df_copy['Class']
    features = df_copy.drop(columns=['Class'])

    features = features.fillna(features.median())

    # Shift each feature so its minimum is 0, which keeps log1p well defined.
    features = np.log1p(features - features.min())

    features = pd.DataFrame(
        RobustScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    # Values outside the 1st-99th percentile become NaN and their rows are dropped.
    lower = features.quantile(0.01)
    greater = features.quantile(0.99)
    df2 = features.where((features >= lower) & (features <= greater))

    df2['Class'] = labels
    return df2.dropna()

In [None]:
# Gradient boosting on the ratio/growth feature set (70/30 hold-out split).
data2014_pp_rb = preprocess_cf2_with_growth_rb_log(data2014, get_price_var_col(2014))
y = data2014_pp_rb['Class']
X = data2014_pp_rb.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=17)
model = GradientBoostingClassifier()
model.fit(X_train, Y_train)
predict = model.predict(X_test)
print(confusion_matrix(Y_test, predict))
# Use the probability of the larger class label for ROC/AUC; hard labels
# yield a single-point curve.
proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(Y_test, proba))
fpr, tpr, _ = roc_curve(Y_test, proba)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()

In [None]:
# PCA on the training features with the default number of components.
pca = PCA()
pca.fit(X_train)

# Per-component share of explained variance.
explained_variance = pca.explained_variance_ratio_

# Running total over the shares ordered from largest to smallest.
cumulative_variance = np.cumsum(np.sort(explained_variance)[::-1])

plt.plot(np.arange(1, cumulative_variance.size + 1), cumulative_variance, marker='o')
plt.title('Cumulative Explained Variance vs. Number of Components')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.show()

In [None]:
# Refit the existing PCA object on the training features and keep the result of fit() in qq.
qq=pca.fit(X_train)

In [None]:

# NOTE(review): n is assigned but not used in this cell (PCA below keeps 6 components).
n = 10

data2014_pp_rb = preprocess_cf2_with_growth_rb_log(data2014, get_price_var_col(2014))
y = data2014_pp_rb['Class']
X = data2014_pp_rb.drop(columns=['Class'])
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=17)

# Fit the projection on the training rows only, then apply it to the test rows.
pca = PCA(n_components=6)
X_train_pca = pd.DataFrame(pca.fit_transform(X_train), columns=pca.get_feature_names_out())
X_test_pca = pd.DataFrame(pca.transform(X_test), columns=pca.get_feature_names_out())

model = GradientBoostingClassifier()
model.fit(X_train_pca, Y_train)
predict = model.predict(X_test_pca)
print(confusion_matrix(Y_test, predict))
# Use the probability of the larger class label for ROC/AUC; hard labels
# yield a single-point curve.
proba = model.predict_proba(X_test_pca)[:, 1]
print(roc_auc_score(Y_test, proba))
fpr, tpr, _ = roc_curve(Y_test, proba)
roc_auc = auc(fpr, tpr)
RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc).plot()



