In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Load the expense dataset
data = pd.read_csv('expenseData.csv')

# Separate features and target.
# Take an explicit copy of the selected columns: the feature-engineering steps
# below assign new columns into X, and doing that on a bare selection of `data`
# triggers pandas' "A value is trying to be set on a copy of a slice" warning.
X = data[['date', 'description', 'amount']].copy()
y = data['category']

# Step 1: Feature Engineering

# Parse 'date' and derive calendar features from it
X['date'] = pd.to_datetime(X['date'])
X['day_of_week'] = X['date'].dt.dayofweek  # pandas convention: Monday=0 ... Sunday=6
X['month'] = X['date'].dt.month
X = X.drop(columns=['date'])  # Raw timestamp is not used as a feature once decomposed

# Step 2: Preprocessing Pipelines

# Free-text 'description' -> TF-IDF features, keeping at most 1000 vocabulary
# terms (tune max_features to the size of the dataset).
text_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_features=1000)),
])

# Numeric 'amount' -> standardized with StandardScaler.
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
])

# Calendar features -> one-hot columns. handle_unknown='ignore' keeps transform
# from raising on a category value that was not present when fitting.
cat_pipeline = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
# NOTE: 'description' is intentionally a bare string rather than a list, so the
# text pipeline receives that single column on its own; the other two selectors
# are lists of column names.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_pipeline, 'description'),
    ('num', num_pipeline, ['amount']),
    ('cat', cat_pipeline, ['day_of_week', 'month']),
])

# Step 3: Create the final pipeline with classifier

# Chain preprocessing and a random-forest classifier into a single estimator,
# so fitting, predicting and saving all operate on one object.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0)),
])

# Step 4: Train-Test Split and Model Training

# Hold out 20% of the rows for evaluation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit preprocessing and classifier on the training portion only
model.fit(X_train, y_train)

# Step 5: Evaluate the Model

# Score the held-out rows and print overall accuracy plus per-class metrics
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Optional: Save the model if it performs well
import joblib
joblib.dump(model, 'expense_categorization_model.pkl')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['date'] = pd.to_datetime(X['date'])


Accuracy: 1.0
Classification Report:
                 precision    recall  f1-score   support

 Entertainment       1.00      1.00      1.00         3
          Food       1.00      1.00      1.00         7
    Healthcare       1.00      1.00      1.00         5
      Shopping       1.00      1.00      1.00         3
Transportation       1.00      1.00      1.00         9
     Utilities       1.00      1.00      1.00         5

      accuracy                           1.00        32
     macro avg       1.00      1.00      1.00        32
  weighted avg       1.00      1.00      1.00        32



['expense_categorization_model.pkl']