In [2]:
import pandas as pd

data = pd.read_csv('./pandas_demo_data.csv')

In [9]:
data.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 321 entries, 0 to 320
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             321 non-null    object 
 1   Supplier         321 non-null    int64  
 2   Status           321 non-null    object 
 3   Status2          321 non-null    object 
 4   Supplier3        321 non-null    object 
 5   Descr            321 non-null    object 
 6   GL Unit          321 non-null    int64  
 7   Account          321 non-null    int64  
 8   Alt Acct         321 non-null    int64  
 9   Dept             321 non-null    int64  
 10  Interim Project  82 non-null     object 
 11  Amount           321 non-null    float64
 12  Acctg Date       321 non-null    object 
dtypes: float64(1), int64(5), object(7)
memory usage: 32.7+ KB


In [19]:
data.head()


Unnamed: 0,Date,Supplier,Status,Status2,Supplier3,Descr,GL Unit,Account,Alt Acct,Dept,Interim Project,Amount,Acctg Date
0,1/1/2025,7031,P,P,DartPoints Operating Company,Software Subscriptions,10001,6520671,9999,4610,,19524.75,1/3/2025
1,1/6/2025,5303,P,P,George Duncan,Postage,10001,6521205,9999,4610,,72.39,1/7/2025
2,1/2/2025,5844,P,P,Amazon Capital Services Inc,Computer Parts and Peripherals,10001,6520662,9999,4610,,816.95,1/6/2025
3,1/7/2025,5844,P,P,Amazon Capital Services Inc,Computer Parts and Peripherals,10001,6520662,9999,4610,,3749.76,1/8/2025
4,1/6/2025,5597,P,P,JCMR Technology Inc,Software Subscriptions,10001,6520671,9999,4610,,3693.69,1/7/2025


In [5]:
data.drop(columns=['Unit', 'Voucher', 'Invoice', 'Project', 'Location', 'WL Acct', 'Affiliate', 'Profile ID'], inplace=True)


In [8]:
data.drop(columns=['Item', 'PO No.', 'Receipt No', 'Recv Date'], inplace=True)


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

numerical_cols = ["Amount"]
categorical_cols = ["Supplier","Supplier3","GL Unit","Account","Alt Acct","Dept","Interim Project"]
date_cols = ["Date","Acctg Date"]
drop_cols = ["Descr"]

new_df = data.drop(columns=drop_cols) #switch to new dataframe sett

class  DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = pd.to_datetime(X[col], errors='coerce')
            X[f"{col}_year"] = X[col].dt.year
            X[f"{col}_month"] = X[col].dt.month
            X[f"{col}_day"] = X[col].dt.day
            X = X.drop(columns=[col])
            
new_df = DateFeatureExtractor(columns=date_cols).fit_transform(new_df)
        

#new_df.info()

#inputs = new_df.loc[0:321]
#print(inputs)

numerical_cols += [f"{col}_{part}" for col in date_cols for part in ["year", "month", "day"]]

#print(numerical_cols)

numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler"), StandardScaler()
])

categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot"), OneHotEncoder(handle_unknown="ignore")
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols),
])

X_ready = preprocessor.fit_transform

print(X_ready)


<bound method ColumnTransformer.fit_transform of ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer', SimpleImputer()),
                                                 'scaler', StandardScaler()]),
                                 ['Amount', 'Date_year', 'Date_month',
                                  'Date_day', 'Acctg Date_year',
                                  'Acctg Date_month', 'Acctg Date_day']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 'onehot',
                                                 OneHotEncoder(handle_unknown='ignore')]),
                                 ['Supplier', 'Supplier3', 'GL Unit', 'Account',
                                  'Alt Acct', 'Dept', 'Interim Project'])])>
