In [54]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [55]:
# Loading data
# https://towardsdatascience.com/4-tricks-you-should-know-to-parse-date-columns-with-pandas-read-csv-27355bb2ad0e#:~:text=By%20default%2C%20date%20columns%20are%20parsed%20using%20the,a%20different%20date%20format%2C%20for%20example%2C%20YYYY-DD-MM%20HH%3AMM%3ASS%3A
file_path = Path("/Users/Irei/Desktop/Desktop/Analysis_Projects/Final_Project_10_UTA/Final_Project_10_UTA/AAPL_mock_data.csv")
apple_df = pd.read_csv(file_path, parse_dates=['Date'])
apple_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Season_spring,Season_summer,Season_fall,Season_winter,quarter_1,quarter_2,quarter_3,quarter_4,Gain/Loss
0,2020-12-31,134.080002,134.740005,131.720001,132.690002,132.267349,99116600,0,0,0,1,0,0,0,1.0,Gain
1,2019-12-31,72.482498,73.419998,72.379997,73.412498,72.552094,100805600,0,0,0,1,0,0,0,1.0,Gain
2,2018-12-31,39.6325,39.84,39.119999,39.435001,38.39592,140014000,0,0,0,1,0,0,0,1.0,Gain
3,2015-12-31,26.752501,26.7575,26.205,26.315001,24.302439,163649200,0,0,0,1,0,0,0,1.0,Loss
4,2014-12-31,28.205,28.282499,27.5525,27.594999,25.057606,165613600,0,0,0,1,0,0,0,1.0,Gain


In [56]:
apple_df.dtypes

Date             datetime64[ns]
Open                    float64
High                    float64
Low                     float64
Close                   float64
Adj Close               float64
Volume                    int64
Season_spring             int64
Season_summer             int64
Season_fall               int64
Season_winter             int64
quarter_1                 int64
quarter_2                 int64
quarter_3                 int64
quarter_4               float64
Gain/Loss                object
dtype: object

In [65]:
# Convert date into an integer since the system was not taking it as a float
# https://www.geeksforgeeks.org/convert-floats-to-integers-in-a-pandas-dataframe/
apple_df['Date'] = apple_df['Date'].astype(int)

In [66]:
# Define features set
X = apple_df.copy()
X = X.drop("Gain/Loss", axis=1)
X.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Season_spring,Season_summer,Season_fall,Season_winter,quarter_1,quarter_2,quarter_3,quarter_4
0,1609372800000000000,134.080002,134.740005,131.720001,132.690002,132.267349,99116600,0,0,0,1,0,0,0,1.0
1,1577750400000000000,72.482498,73.419998,72.379997,73.412498,72.552094,100805600,0,0,0,1,0,0,0,1.0
2,1546214400000000000,39.6325,39.84,39.119999,39.435001,38.39592,140014000,0,0,0,1,0,0,0,1.0
3,1451520000000000000,26.752501,26.7575,26.205,26.315001,24.302439,163649200,0,0,0,1,0,0,0,1.0
4,1419984000000000000,28.205,28.282499,27.5525,27.594999,25.057606,165613600,0,0,0,1,0,0,0,1.0


In [67]:
# Define target vector
y = df_loans["Gain/Loss"].values.reshape(-1, 1)
y[:5]

array([['Gain'],
       ['Gain'],
       ['Gain'],
       ['Loss'],
       ['Gain']], dtype=object)

In [68]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [69]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(14, 15)
(5, 15)
(14, 1)
(5, 1)


In [70]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [71]:
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(15, 15)
(4, 15)
(15, 1)
(4, 1)


In [72]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [73]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [74]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [75]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [76]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [77]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array(['Loss', 'Loss', 'Loss', 'Gain', 'Loss'], dtype=object)

In [78]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [79]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1,4
Actual 1,0,0


Accuracy Score : 0.2
Classification Report
              precision    recall  f1-score   support

        Gain       1.00      0.20      0.33         5
        Loss       0.00      0.00      0.00         0

    accuracy                           0.20         5
   macro avg       0.50      0.10      0.17         5
weighted avg       1.00      0.20      0.33         5



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
