In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

#added to support training and testing the model

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
"""
import tensorflow as tf
import keras
from keras import layers
from keras import models
"""

from keras.models import Sequential
from keras.layers import Dense

import pandas as pd

data = pd.read_csv('./pandas_demo_data.csv')

new_df = data.copy()

account_totals = new_df.groupby("Account")["Amount"].sum().reset_index()
account_totals.columns = ["Account", "TotalSpend"]

new_df = new_df.merge(account_totals, on="Account", how="left")

numerical_cols = ["Amount"]
categorical_cols = ["Supplier","Supplier3","GL Unit","Account","Alt Acct","Dept"]
date_cols = ["Date","Acctg Date"]
drop_cols = ["Descr","Unit", "Voucher", "Invoice", "Project", "Location", "Status", "Status2", "WL Acct", "Affiliate", "Profile ID", "Item", "PO No.", "Receipt No", "Recv Date", "Interim Project"]

#new_df = new_df.drop(columns=drop_cols) #switch to new dataframe set
new_df = new_df.drop(columns=[col for col in drop_cols if col in new_df.columns])

class  DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = pd.to_datetime(X[col], errors='coerce')
            X[f"{col}_year"] = X[col].dt.year
            X[f"{col}_month"] = X[col].dt.month
            X[f"{col}_day"] = X[col].dt.day
            X = X.drop(columns=[col])
        return X
            

new_df = DateFeatureExtractor(columns=date_cols).fit_transform(new_df)

print("DF is now", type(new_df))
        

#new_df.info()

#inputs = new_df.loc[0:321]
#print(inputs)

numerical_cols += [f"{col}_{part}" for col in date_cols for part in ["year", "month", "day"]]

#print(numerical_cols)

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler"), StandardScaler()
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot"), OneHotEncoder(handle_unknown="ignore")
])

print("Text columns:", text_cols)
print("df columns:", df.columns.tolist())

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

print("df columns:", new_df.columns.tolist())
print("text column used:", new_df.columns.
print("First few values of text column:", new_df[text_cols[0]].head())
print("Any NaNs in text column?", new_df[text_cols[0]].isna().any())
#transform the dataframe into a tensor data type for the model
X_ready = preprocessor.fit_transform(new_df)

print(X_ready)

#This is the target variable that will be predicted
y = new_df["TotalSpend"]

# Prep data for training and testing with model
X_train, X_test, y_train, y_test = train_test_split(X_ready, y, test_size=0.2, random_state=42)

#convert to dense format....I dont know why this is needed!!!!
if hasattr(X_train, "toarray"):
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    
#Build and train the neural network (model)
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1])),
    Dense(64, activation='relu'),
    Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared error')
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2, verbose=1)





SyntaxError: invalid syntax (3188569175.py, line 94)

In [14]:
new_df.head()




Unnamed: 0,Supplier,Supplier3,GL Unit,Account,Alt Acct,Dept,Amount,TotalSpend,Date_year,Date_month,Date_day,Acctg Date_year,Acctg Date_month,Acctg Date_day
0,7031,DartPoints Operating Company,10001,6520671,9999,4610,19524.75,330251.28,2025,1,1,2025,1,3
1,5303,George Duncan,10001,6521205,9999,4610,72.39,70039.07,2025,1,6,2025,1,7
2,5844,Amazon Capital Services Inc,10001,6520662,9999,4610,816.95,19774.85,2025,1,2,2025,1,6
3,5844,Amazon Capital Services Inc,10001,6520662,9999,4610,3749.76,19774.85,2025,1,7,2025,1,8
4,5597,JCMR Technology Inc,10001,6520671,9999,4610,3693.69,330251.28,2025,1,6,2025,1,7
