In [17]:
import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import plot_tree, DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

In [2]:
df = pd.read_csv('merged-frame.csv')
print(df.dtypes)
df.head()

order_id             object
order_date           object
status               object
item_id             float64
qty_ordered         float64
price               float64
value               float64
county               object
payment_method       object
category             object
gender               object
age                 float64
user name            object
state                object
discount_percent    float64
year                  int64
month                 int64
actual_x            float64
forcast_x           float64
previous_x          float64
actual_y            float64
forcast_y           float64
previous_y          float64
dtype: object


  df = pd.read_csv('merged-frame.csv')


Unnamed: 0,order_id,order_date,status,item_id,qty_ordered,price,value,county,payment_method,category,...,state,discount_percent,year,month,actual_x,forcast_x,previous_x,actual_y,forcast_y,previous_y
0,100354678,2020-10-01 00:00:00+00:00,received,574772.0,21.0,89.9,1798.0,Harmon,cod,Men's Fashion,...,OK,0.0,2020,10,0.0,0.1,0.2,0.0,0.1,0.2
1,100354678,2020-10-01 00:00:00+00:00,received,574774.0,11.0,19.0,190.0,Harmon,cod,Men's Fashion,...,OK,0.0,2020,10,0.0,0.1,0.2,0.0,0.1,0.2
2,100354680,2020-10-01 00:00:00+00:00,complete,574777.0,9.0,149.9,1199.2,Harmon,cod,Men's Fashion,...,OK,0.0,2020,10,0.0,0.1,0.2,0.0,0.1,0.2
3,100354680,2020-10-01 00:00:00+00:00,complete,574779.0,9.0,79.9,639.2,Harmon,cod,Men's Fashion,...,OK,0.0,2020,10,0.0,0.1,0.2,0.0,0.1,0.2
4,100354677,2020-10-01 00:00:00+00:00,canceled,574769.0,2.0,49.0,49.0,Bradford,Payaxis,Mobiles & Tablets,...,FL,0.0,2020,10,0.0,0.1,0.2,0.0,0.1,0.2


In [3]:
#Build labeled data for the response variable
#bullish - bearish - critical
labels = ['bullish' if actual > forcast else 'bearish' if actual < forcast else 'critical' for actual, forcast in zip(df['actual_x'], df['forcast_x']) ]
print(len(labels))
df['label'] = pd.Series(np.array(labels))
df.drop(columns=['user name','actual_y','forcast_y','previous_y', 'order_id','actual_x','forcast_x','previous_x','item_id'], axis=1, inplace=True)
df['order_date'] = pd.to_datetime(df['order_date'])
print(df.dtypes)
df.head()


286392
order_date          datetime64[ns, UTC]
status                           object
qty_ordered                     float64
price                           float64
value                           float64
county                           object
payment_method                   object
category                         object
gender                           object
age                             float64
state                            object
discount_percent                float64
year                              int64
month                             int64
label                            object
dtype: object


Unnamed: 0,order_date,status,qty_ordered,price,value,county,payment_method,category,gender,age,state,discount_percent,year,month,label
0,2020-10-01 00:00:00+00:00,received,21.0,89.9,1798.0,Harmon,cod,Men's Fashion,F,43.0,OK,0.0,2020,10,bearish
1,2020-10-01 00:00:00+00:00,received,11.0,19.0,190.0,Harmon,cod,Men's Fashion,F,43.0,OK,0.0,2020,10,bearish
2,2020-10-01 00:00:00+00:00,complete,9.0,149.9,1199.2,Harmon,cod,Men's Fashion,F,43.0,OK,0.0,2020,10,bearish
3,2020-10-01 00:00:00+00:00,complete,9.0,79.9,639.2,Harmon,cod,Men's Fashion,F,43.0,OK,0.0,2020,10,bearish
4,2020-10-01 00:00:00+00:00,canceled,2.0,49.0,49.0,Bradford,Payaxis,Mobiles & Tablets,M,28.0,FL,0.0,2020,10,bearish


In [4]:
## ONE HOT ENCODING
df = pd.get_dummies(df, columns=['status', 'county', 'payment_method', 'category','gender', 'state'])

In [19]:
## building the model

pipeline = Pipeline([
    ('normalizer', StandardScaler()), #Step1 - normalize data
    ('clf', DecisionTreeClassifier(max_depth=100)) #step2 - classifier
])
print(pipeline.steps)

df1 = df.drop(['order_date','year','month'], axis=1)

#Seperate train and test data
X_train, X_test, y_train, y_test = train_test_split(df1.drop('label', axis=1),df1['label'],test_size = 0.4,random_state = 10)

# Fit the decision tree classifier
#clf = DecisionTreeClassifier()
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
cm = confusion_matrix(y_test, y_pred, labels=["bullish","critical","bearish"])
print(f'Test accuracy: {accuracy:.5f}')
print(f'Test F1: {f1:.5f}')
print(cm)




[('normalizer', StandardScaler()), ('clf', DecisionTreeClassifier(max_depth=100))]
Test accuracy: 0.80785
Test F1: 0.80785
[[46979  7474  1624]
 [ 7478 41378  1186]
 [ 2614  1636  4188]]


In [11]:
# Define the parameter grid
param_grid = {
    'clf__criterion': ["gini", "entropy", "log_loss"],
    'clf__min_samples_split': [2, 4, 6]
}

# Create the grid search object
#grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the grid search object to the data
#grid_search.fit(X_train, y_train)

# Print the best combination of parameters
#print(grid_search.best_params_)






In [None]:
# Visualize one of the trees in the forest
# Visualize the decision tree
#plt.figure(figsize=(20, 20))
#plot_tree(clf, feature_names=X_train.columns, class_names=y_train.unique(), filled=True)
#plt.show()

#plot_tree(clf)