In [None]:
import tensorflow as tf
print(tf.__version__)


2.18.0


In [156]:
import pandas as pd

# Assuming you uploaded a file named "data.csv"
df = pd.read_csv('/Daily Household Transactions.csv')
print(df.head())


                  Date                   Mode        Category  \
0  20/09/2018 12:04:08                   Cash  Transportation   
1  20/09/2018 12:03:15                   Cash            Food   
2           19/09/2018  Saving Bank account 1    subscription   
3  17/09/2018 23:41:17  Saving Bank account 1    subscription   
4  16/09/2018 17:15:08                   Cash       Festivals   

               Subcategory                         Note  Amount  \
0                    Train         2 Place 5 to Place 0    30.0   
1                   snacks  Idli medu Vada mix 2 plates    60.0   
2                  Netflix         1 month subscription   199.0   
3  Mobile Service Provider            Data booster pack    19.0   
4             Ganesh Pujan                  Ganesh idol   251.0   

  Income/Expense Currency  
0        Expense      INR  
1        Expense      INR  
2        Expense      INR  
3        Expense      INR  
4        Expense      INR  


In [157]:
# Fill missing values
df.fillna("Unknown", inplace=True)

# Fill missing numerical values with median
for col in df.select_dtypes(include=["number"]).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Drop any remaining missing values
df.dropna(inplace=True)

print("Missing values handled successfully!")


Missing values handled successfully!


In [158]:
# Convert text to lowercase
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Remove unnecessary spaces and special characters
df.columns = df.columns.str.strip().str.replace("[^a-zA-Z0-9_ ]", "", regex=True)

print("Data cleaned successfully!")


Data cleaned successfully!


In [159]:
df.columns


Index(['Date', 'Mode', 'Category', 'Subcategory', 'Note', 'Amount',
       'IncomeExpense', 'Currency'],
      dtype='object')

In [160]:
# Convert the 'Category' column to lowercase for easy matching
df['Category'] = df['Category'].fillna('').astype(str).str.lower()

# Define keywords for categorization
food_keywords = [
    'food', 'grocery', 'groceries', 'fruits', 'vegetables', 'restaurant',
    'takeout', 'snack', 'breakfast', 'lunch', 'dinner', 'beverages'
]

fixed_expense_keywords = [
    'rent', 'subscription', 'netflix', 'amazon', 'fees', 'recharge',
    'travel', 'bus', 'train', 'metro', 'gas', 'electricity', 'wifi',
    'household', 'utility', 'internet', 'hostel','transport'
]

# Categorization function using only the Category column
def categorize_from_category(category):
    if any(word in category for word in food_keywords):
        return 'Food'
    elif any(word in category for word in fixed_expense_keywords):
        return 'Fixed Expenses'
    else:
        return 'Miscellaneous'

# Apply the function
df['Category'] = df['Category'].apply(categorize_from_category)

# See the result
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Miscellaneous,922
Food,907
Fixed Expenses,632


In [161]:
df.head()

Unnamed: 0,Date,Mode,Category,Subcategory,Note,Amount,IncomeExpense,Currency
0,20/09/2018 12:04:08,cash,Fixed Expenses,train,2 place 5 to place 0,30.0,expense,inr
1,20/09/2018 12:03:15,cash,Food,snacks,idli medu vada mix 2 plates,60.0,expense,inr
2,19/09/2018,saving bank account 1,Fixed Expenses,netflix,1 month subscription,199.0,expense,inr
3,17/09/2018 23:41:17,saving bank account 1,Fixed Expenses,mobile service provider,data booster pack,19.0,expense,inr
4,16/09/2018 17:15:08,cash,Miscellaneous,ganesh pujan,ganesh idol,251.0,expense,inr


In [162]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Load the DataFrame (assuming already loaded as df)
df = df[['Note', 'Category','Date','Amount','IncomeExpense']].dropna()

# Encode categories
label_encoder = LabelEncoder()
df['CategoryEncoded'] = label_encoder.fit_transform(df['Category'])

# Tokenize text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Note'])

sequences = tokenizer.texts_to_sequences(df['Note'])
padded_sequences = pad_sequences(sequences, maxlen=20, padding='post')

# Prepare labels
labels = to_categorical(df['CategoryEncoded'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


In [163]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=20),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')  # 3 categories
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [164]:
history = model.fit(
    X_train, y_train,
    epochs=10,
    validation_data=(X_test, y_test),
    batch_size=32
)


Epoch 1/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4608 - loss: 1.0499 - val_accuracy: 0.6775 - val_loss: 0.8350
Epoch 2/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7604 - loss: 0.7327 - val_accuracy: 0.7870 - val_loss: 0.5443
Epoch 3/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.8649 - loss: 0.4432 - val_accuracy: 0.8276 - val_loss: 0.4315
Epoch 4/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9087 - loss: 0.3085 - val_accuracy: 0.8438 - val_loss: 0.4044
Epoch 5/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9239 - loss: 0.2472 - val_accuracy: 0.8398 - val_loss: 0.4068
Epoch 6/10
[1m62/62[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9297 - loss: 0.2146 - val_accuracy: 0.8357 - val_loss: 0.4237
Epoch 7/10
[1m62/62[0m [32m━━━━━━━━━

In [165]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")


[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8396 - loss: 0.4571
Test Accuracy: 0.84


In [166]:
def predict_category(note):
    seq = tokenizer.texts_to_sequences([note])
    padded = pad_sequences(seq, maxlen=20, padding='post')
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])

# Example:
predict_category(" Food ")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step


array(['Food'], dtype=object)

In [167]:

predict_category("Ordered pizza ")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


array(['Food'], dtype=object)

In [168]:
predict_category("Purchased books for reading")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


array(['Miscellaneous'], dtype=object)

In [169]:
predict_category("Paid for food delivery from Swiggy")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


array(['Food'], dtype=object)

In [170]:
predict_category("Biryani in lunch")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step


array(['Food'], dtype=object)

In [171]:
predict_category("Vegetables")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


array(['Food'], dtype=object)

In [172]:
predict_category("Netflix subscription renewed")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step


array(['Fixed Expenses'], dtype=object)

In [173]:
predict_category("Paid monthly rent")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


array(['Miscellaneous'], dtype=object)

In [174]:
predict_category("Yearly tuition fees submitted")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


array(['Miscellaneous'], dtype=object)

In [175]:
predict_category("Bought grocery")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


array(['Food'], dtype=object)

In [176]:

predict_category("Ordered pizza for party")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step


array(['Food'], dtype=object)

In [177]:

predict_category("Dinner at Barbeque Nation")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step


array(['Food'], dtype=object)

In [178]:

predict_category("Vegetables and fruits")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step


array(['Food'], dtype=object)

In [112]:
df

Unnamed: 0,Note,Category,Date,Amount,IncomeExpense,CategoryEncoded
0,2 place 5 to place 0,Fixed Expenses,2018-09-20 12:04:08,30.0,expense,0
1,idli medu vada mix 2 plates,Food,2018-09-20 12:03:15,60.0,expense,1
3,data booster pack,Fixed Expenses,2018-09-17 23:41:17,19.0,expense,0
4,ganesh idol,Miscellaneous,2018-09-16 17:15:08,251.0,expense,2
5,permanent residence - tata play recharge,Fixed Expenses,2018-09-15 06:34:17,200.0,expense,0
...,...,...,...,...,...,...
2416,3 bananas,Food,2015-01-14 23:38:24,13.0,expense,1
2417,lunch - chicken fried rice + chicken soup,Food,2015-01-14 15:35:57,120.0,expense,1
2418,travels - mumbai to brc,Fixed Expenses,2015-01-14 12:42:56,760.0,expense,0
2419,internet renewal,Miscellaneous,2015-01-13 18:52:47,500.0,expense,2


In [155]:
df['Category'].value_counts()


Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Salary,4
Dining,4
Movie,3
Electricity,3
Freelance,3
Rent,3
Bus Pass,3
Subscription,3
Groceries,1
Internet,1


In [99]:
print(predict_category("Netflix subscription renewed"))
print(predict_category("Paid rent"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
['Fixed Expenses']
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
['Miscellaneous']


In [154]:
df.count()

Unnamed: 0,0
Date,28
Amount,28
Category,28
Subcategory,28
Notes,28
IncomeExpense,28


In [77]:
df


Unnamed: 0,Date,Mode,Category,Subcategory,Note,Amount,IncomeExpense,Currency
0,20/09/2018 12:04:08,cash,Fixed Expenses,train,2 place 5 to place 0,30.0,expense,inr
1,20/09/2018 12:03:15,cash,Food,snacks,idli medu vada mix 2 plates,60.0,expense,inr
2,19/09/2018,saving bank account 1,Fixed Expenses,netflix,1 month subscription,199.0,expense,inr
3,17/09/2018 23:41:17,saving bank account 1,Fixed Expenses,mobile service provider,data booster pack,19.0,expense,inr
4,16/09/2018 17:15:08,cash,Miscellaneous,ganesh pujan,ganesh idol,251.0,expense,inr
...,...,...,...,...,...,...,...,...
2456,1/1/2015,cash,Fixed Expenses,unknown,share jeep - place t base to top,20.0,expense,inr
2457,1/1/2015,cash,Fixed Expenses,unknown,share auto - place h to place t base,20.0,expense,inr
2458,1/1/2015,cash,Fixed Expenses,unknown,bus - brc to place h,30.0,expense,inr
2459,1/1/2015,cash,Food,unknown,tea,10.0,expense,inr


In [113]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")

def train_and_forecast_next_month(df, input_month):
    # Ensure datetime and amount columns are correct
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
    df.dropna(subset=['Date', 'Amount'], inplace=True)

    # Monthly aggregation
    monthly_expense = df.groupby(df['Date'].dt.to_period('M'))['Amount'].sum().to_timestamp()

    # Convert input_month to datetime
    try:
        input_date = pd.to_datetime(input_month)
    except:
        return {"❌ Error": "Please enter date in YYYY-MM format."}

    if input_date not in monthly_expense.index:
        return {"❌ Error": f"No data available for {input_month}"}

    # Train data up to input_month only
    train_data = monthly_expense[monthly_expense.index <= input_date]

    if len(train_data) < 3:
        return {"❌ Error": "Not enough data to train model."}

    # Train ARIMA model
    model = ARIMA(train_data, order=(1, 1, 1))
    model_fit = model.fit()

    # Forecast next month
    prediction = model_fit.forecast(steps=1)[0]
    last_expense = train_data[-1]

    next_month = (input_date + pd.DateOffset(months=1)).strftime('%B %Y')
    lower_bound = max(0, prediction - 5000)
    upper_bound = prediction + 5000

    return {
        "📅 Current Month": input_date.strftime('%B %Y'),
        "💸 Actual Expense": f"₹{last_expense:.2f}",
        f"📈 Forecast for {next_month}": f"₹{prediction:.2f}",
        "📊 Expected Range": f"₹{lower_bound:.2f} – ₹{upper_bound:.2f}"
    }


In [114]:
result = train_and_forecast_next_month(df, '2017-06')
for k, v in result.items():
    print(f"{k}: {v}")


📅 Current Month: June 2017
💸 Actual Expense: ₹271335.50
📈 Forecast for July 2017: ₹168026.95
📊 Expected Range: ₹163026.95 – ₹173026.95


In [116]:
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.dropna(subset=['Date'], inplace=True)
df['YearMonth'] = df['Date'].dt.to_period('M').astype(str)
# Convert income/expense to lowercase
df['IncomeExpense'] = df['IncomeExpense'].fillna('').str.lower()

# Assign signed values
df['SignedAmount'] = df.apply(
    lambda row: row['Amount'] if 'income' in row['IncomeExpense'] else -row['Amount'], axis=1
)

# Calculate monthly net savings
monthly = df.groupby('YearMonth')['SignedAmount'].sum().reset_index()
monthly.rename(columns={'SignedAmount': 'NetSavings'}, inplace=True)

In [122]:
import pandas as pd

# Load and preprocess data
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df.dropna(subset=['Date'], inplace=True)
df['YearMonth'] = df['Date'].dt.to_period('M').astype(str)
df['IncomeExpense'] = df['IncomeExpense'].fillna('').str.lower()

# Assign signed values based on income/expense
df['SignedAmount'] = df.apply(
    lambda row: row['Amount'] if 'income' in row['IncomeExpense'] else -row['Amount'], axis=1
)

# 📌 Define Investment Label Logic
def label_investment_plan(savings):
    if savings < 2000:
        return 'Emergency Fund'
    elif savings < 5000:
        return 'Small SIP'
    elif savings < 10000:
        return 'SIP + Mutual Fund'
    elif savings < 20000:
        return 'SIP + Stocks + Gold'
    else:
        return 'Full Diversification'

# 🚀 Function to Get Investment Suggestion for a Given Month
def get_investment_suggestion(month: str):
    monthly_savings = df[df['YearMonth'] == month]['SignedAmount'].sum()
    suggestion = label_investment_plan(monthly_savings)
    return {
        "Month": month,
        "Net Savings": f"₹{monthly_savings:.2f}",
        "Investment Suggestion": suggestion
    }

# 🔍 Example Usage
result = get_investment_suggestion("2018-01")
print(result)


{'Month': '2018-01', 'Net Savings': '₹-77022.04', 'Investment Suggestion': 'Emergency Fund'}


In [123]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Step 1: Prepare monthly data
monthly = df.groupby('YearMonth')['SignedAmount'].sum().reset_index()
monthly.columns = ['YearMonth', 'NetSavings']

# Step 2: Label data with your existing rule
monthly['InvestmentPlan'] = monthly['NetSavings'].apply(label_investment_plan)

# Step 3: Encode labels for ML
le = LabelEncoder()
monthly['PlanEncoded'] = le.fit_transform(monthly['InvestmentPlan'])

# Step 4: Train model
X = monthly[['NetSavings']]
y = monthly['PlanEncoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Step 5: Evaluate
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=le.classes_))


                      precision    recall  f1-score   support

      Emergency Fund       1.00      1.00      1.00         1
Full Diversification       1.00      1.00      1.00         8

            accuracy                           1.00         9
           macro avg       1.00      1.00      1.00         9
        weighted avg       1.00      1.00      1.00         9



In [124]:
def predict_investment_plan_ml(savings_amount):
    pred_label = rf_model.predict([[savings_amount]])[0]
    return le.inverse_transform([pred_label])[0]

# Example: Predict for ₹8200
prediction = predict_investment_plan_ml(8200)
print(f"Predicted Plan: {prediction}")


Predicted Plan: Emergency Fund


In [129]:
from tensorflow.keras.models import load_model

# Save the model
model.save('cnn_model.h5')




In [130]:
import joblib

# Save tokenizer
joblib.dump(tokenizer, 'tokenizer.pkl')

# Save label encoder
joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [132]:
joblib.dump(rf_model, 'rf_model.pkl')                # Forecasting model



['rf_model.pkl']

In [133]:
import pickle

# your fitted ARIMA model is called `fitted_model`


In [135]:
with open('arima_model.pkl', 'wb') as f:
    pickle.dump(result, f)


In [140]:
from google.colab import files

# CNN model
files.download('cnn_model.h5')           # or files.download('cnn_model.h5')
files.download('tokenizer.pkl')
files.download('label_encoder.pkl')

# Random Forest model
files.download('rf_model.pkl')


# ARIMA model
files.download('arima_model.pkl')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>