In [1]:
import numpy as np
import pandas as pd
from hazm import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, accuracy_score, precision_score

pd.options.mode.chained_assignment = None

In [2]:
data_df = pd.read_csv("cleaned_daily_all.csv")

data_df = data_df.reset_index()
data_df.dropna(axis = 0, inplace = True)
data_df.rename(columns = {'Unnamed: 0':'Order'}, inplace = True)
data_df.replace({-np.inf: -1000000, np.inf: 1000000}, inplace = True)

data_df

Unnamed: 0,index,Order,date,max_price,min_price,close_price,last_price,first_price,yesterday_price,value,...,NonIndividual_buy_value,Individual_sell_value,NonIndividual_sell_value,adj_max_price,adj_min_price,adj_first_price,adj_last_price,adj_volume,adj_close_price,stock_name
193189,193189,966,2008-11-26,5611.0,5474.0,5611.0,5611.0,5474.0,5643.0,2.510883e+08,...,5.611000e+04,2.510322e+08,5.611000e+04,20.0,20.0,20.0,20.0,45869.0,20.0,بترانس
193190,193190,355,2008-11-26,2112.0,2055.0,2055.0,2055.0,2112.0,2051.0,1.058055e+07,...,2.055000e+04,1.056000e+07,2.055000e+04,78.0,76.0,78.0,76.0,5010.0,76.0,بسویچ
193191,193191,414,2008-11-26,2998.0,2909.0,2998.0,2998.0,2909.0,2998.0,6.117800e+05,...,6.117800e+05,5.818000e+05,2.998000e+04,496.0,482.0,482.0,496.0,210.0,496.0,بشهاب
193192,193192,1376,2008-11-26,1789.0,1740.0,1789.0,1789.0,1740.0,1793.0,1.186729e+07,...,1.789000e+04,1.184940e+07,1.789000e+04,157.0,153.0,153.0,157.0,6820.0,157.0,بموتو
193193,193193,882,2008-11-26,2283.0,2250.0,2281.0,2281.0,2250.0,2217.0,7.077851e+08,...,9.976936e+07,6.621023e+08,4.568281e+07,149.0,147.0,147.0,149.0,310212.0,149.0,بنیرو
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1157760,1157760,120,2021-03-13,33511.0,33511.0,34185.0,33511.0,33511.0,34194.0,6.344303e+08,...,0.000000e+00,6.344303e+08,0.000000e+00,33511.0,33511.0,33511.0,33511.0,18932.0,34185.0,گدنا
1157761,1157761,1105,2021-03-13,7257.0,7257.0,7331.0,7257.0,7257.0,7405.0,2.455500e+10,...,2.031960e+10,2.310360e+10,1.451400e+09,7257.0,7257.0,7257.0,7257.0,3383630.0,7331.0,گوهران
1157762,1157762,866,2021-03-13,204652.0,204652.0,208808.0,204652.0,204652.0,208828.0,9.291201e+07,...,0.000000e+00,9.291201e+07,0.000000e+00,204652.0,204652.0,204652.0,204652.0,454.0,208808.0,گپارس
1157763,1157763,818,2021-03-13,2490.0,2414.0,2437.0,2421.0,2420.0,2463.0,5.225108e+10,...,1.378256e+10,5.171634e+10,5.347487e+08,2490.0,2414.0,2420.0,2421.0,21438679.0,2437.0,گکوثر


In [3]:
data_df['stock_name'].value_counts()

وصنعت     2723
سفارس     2716
وتوصا     2709
وبهمن     2701
وبوعلی    2687
          ... 
قاروم       20
فجوش        20
اکالا       13
شگامرن       7
نکالا        4
Name: stock_name, Length: 619, dtype: int64

In [4]:
stock_df = data_df.loc[data_df['stock_name'] == 'وصنعت']
# stock_df

In [5]:
#Calculate the power of buyer and seller
stock_df['pw_of_individual_buyer'] = stock_df['Individual_buy_volume'] / stock_df['Individual_buy_count']
stock_df['pw_of_individual_seller'] = stock_df['Individual_sell_volume'] / stock_df['Individual_sell_count']
stock_df['pw_of_nonIndividual_buyer'] = stock_df['NonIndividual_buy_volume'] / stock_df['NonIndividual_buy_count']
stock_df['pw_of_nonIndividual_seller'] = stock_df['NonIndividual_sell_volume'] / stock_df['NonIndividual_sell_count']

#Replace Nan values that exist because pf divide by zero 
stock_df['pw_of_individual_buyer'] = stock_df['pw_of_individual_buyer'].fillna(0)
stock_df['pw_of_individual_seller'] = stock_df['pw_of_individual_seller'].fillna(0)
stock_df['pw_of_nonIndividual_buyer'] = stock_df['pw_of_nonIndividual_buyer'].fillna(0)
stock_df['pw_of_nonIndividual_seller'] = stock_df['pw_of_nonIndividual_seller'].fillna(0)

#Check if power of buyer is more than seller or not
stock_df['individual_power'] = list((stock_df['pw_of_individual_buyer'] > stock_df['pw_of_individual_seller']).astype(int)) 
stock_df['nonIndividual_power'] = list((stock_df['pw_of_nonIndividual_buyer'] > stock_df['pw_of_nonIndividual_seller'] ).astype(int))

#Calculate the percentage of price changes compared to yesterday
stock_df['last_price_changes'] = 100*(stock_df['close_price']-stock_df['yesterday_price'])/stock_df['yesterday_price']
# stock_df['last_pcp'] = list((stock_df['last_price_changes'] >= 1.1).astype(int)) #If the changes are more than 1.1 (Wage) it's valuable.

In [6]:
current_price = list(stock_df['adj_close_price'])
one_daysـago = [0] + current_price[: len(current_price) - 1]
two_daysـago = [0, 0] + current_price[: len(current_price) - 2]
three_daysـago = [0, 0, 0] + current_price[: len(current_price) - 3]
one_weeksـago = [0, 0, 0, 0, 0, 0, 0] + current_price[: len(current_price) - 7]

stock_df['1days_ago_price'] = one_daysـago
stock_df['2days_ago_price'] = two_daysـago
stock_df['3days_ago_price'] = three_daysـago
stock_df['1weeks_ago_price'] = one_weeksـago

In [7]:
#Labeling
tomorrow_price = list(stock_df['adj_close_price'])
tomorrow_price.pop(0)
tomorrow_price.append(tomorrow_price[len(tomorrow_price) - 1])

stock_df['tomorrow_price'] = tomorrow_price
stock_df['next_pcp'] = list((stock_df['tomorrow_price'] > stock_df['adj_close_price']).astype(int))

stock_df = stock_df.iloc[7: , :]
stock_df.drop(stock_df.tail(1).index, inplace = True)
# stock_df

In [8]:
y = list((stock_df['next_pcp'] == 1).astype(int))
y = np.nan_to_num(y)

stock_df.drop('date', inplace = True, axis = 1)
stock_df.drop('Order', inplace = True, axis = 1)
stock_df.drop('index', inplace = True, axis = 1)
stock_df.drop('volume', inplace = True, axis = 1)
stock_df.drop('max_price', inplace = True, axis = 1)
stock_df.drop('min_price', inplace = True, axis = 1)
stock_df.drop('last_price', inplace = True, axis = 1)
stock_df.drop('stock_name', inplace = True, axis = 1)
stock_df.drop('close_price', inplace = True, axis = 1)
stock_df.drop('first_price', inplace = True, axis = 1)
stock_df.drop('yesterday_price', inplace = True, axis = 1)

stock_df.drop('next_pcp', inplace = True, axis = 1)
stock_df.drop('tomorrow_price', inplace = True, axis = 1)

stock_df.drop('value', inplace = True, axis = 1)
stock_df.drop('Individual_buy_value', inplace = True, axis = 1)
stock_df.drop('NonIndividual_buy_value', inplace = True, axis = 1)
stock_df.drop('Individual_sell_value', inplace = True, axis = 1)
stock_df.drop('NonIndividual_sell_value', inplace = True, axis = 1)


X = stock_df.values
X = np.nan_to_num(X)

stock_df
# stock_df.columns

Unnamed: 0,count,Individual_buy_count,NonIndividual_buy_count,Individual_sell_count,NonIndividual_sell_count,Individual_buy_volume,NonIndividual_buy_volume,Individual_sell_volume,NonIndividual_sell_volume,adj_max_price,...,pw_of_individual_seller,pw_of_nonIndividual_buyer,pw_of_nonIndividual_seller,individual_power,nonIndividual_power,last_price_changes,1days_ago_price,2days_ago_price,3days_ago_price,1weeks_ago_price
194120,3.0,3.0,0.0,2.0,0.0,5819.0,0.0,5819.0,0.0,33.0,...,2909.500000,0.000000,0.0,0,0,0.000000,32.0,32.0,32.0,34.0
194206,15.0,6.0,0.0,4.0,0.0,67000.0,0.0,67000.0,0.0,31.0,...,16750.000000,0.000000,0.0,0,0,-0.194553,32.0,32.0,32.0,33.0
194289,38.0,5.0,2.0,8.0,0.0,128528.0,63390.0,191918.0,0.0,32.0,...,23989.750000,31695.000000,0.0,1,1,-0.389864,32.0,32.0,32.0,33.0
194376,2.0,2.0,0.0,1.0,0.0,6194.0,0.0,6194.0,0.0,31.0,...,6194.000000,0.000000,0.0,0,0,0.000000,32.0,32.0,32.0,33.0
194479,31.0,6.0,0.0,6.0,0.0,137300.0,0.0,137300.0,0.0,33.0,...,22883.333333,0.000000,0.0,0,0,0.391389,32.0,32.0,32.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1154904,62.0,59.0,0.0,2.0,0.0,172627.0,0.0,172627.0,0.0,6950.0,...,86313.500000,0.000000,0.0,0,0,0.000000,7090.0,7140.0,7260.0,7480.0
1155457,146.0,120.0,0.0,10.0,0.0,838973.0,0.0,838973.0,0.0,6950.0,...,83897.300000,0.000000,0.0,0,0,-0.282087,7090.0,7090.0,7140.0,7430.0
1156008,282.0,182.0,0.0,44.0,1.0,1907120.0,0.0,1904922.0,2198.0,6930.0,...,43293.681818,0.000000,2198.0,0,0,-0.565771,7070.0,7090.0,7090.0,7330.0
1156560,1886.0,788.0,4.0,691.0,1.0,11394968.0,1602565.0,12867533.0,130000.0,7180.0,...,18621.610709,400641.250000,130000.0,0,1,-1.422475,7030.0,7070.0,7090.0,7310.0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

In [10]:
clf = RandomForestClassifier(class_weight = 'balanced_subsample')
clf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced_subsample')

In [11]:
y_pred = clf.predict(X_test)
print("Classifier performance\n","=" * 20)
print(f"Accuracy score: {clf.score(X_test, y_test): .2f}")
print(f"Precision score: {precision_score(y_test, y_pred): .2f}")
print(f"Recall score: {recall_score(y_test, y_pred): .2f}")
print(f"f1 score: {f1_score(y_test, y_pred): .2f}")

Classifier performance
Accuracy score:  0.48
Precision score:  0.69
Recall score:  0.04
f1 score:  0.07
