In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Replace 'sample_data.xlsx' with the actual file path of your Excel file
file_path = '../New folder/Book1.xlsx'

# Load the data from the Excel file into a DataFrame
df = pd.read_excel(file_path)


In [4]:
# Display the first few rows of the DataFrame to inspect the data
print(df.head())

                 Date         Location        Branch             Name  \
0 2021-04-28 16:35:00  Greater Kampala  METRO BRANCH  TECH247 LIMITED   
1 2021-04-28 17:18:00  Greater Kampala  METRO BRANCH  TECH247 LIMITED   
2 2021-04-28 17:18:00  Greater Kampala  METRO BRANCH  TECH247 LIMITED   
3 2021-04-28 17:22:00  Greater Kampala  METRO BRANCH  TECH247 LIMITED   
4 2021-04-28 17:23:00  Greater Kampala  METRO BRANCH  TECH247 LIMITED   

   Terminal  Agent Account       Customer                    Biller  \
0  23022862  9030018800879      757975012  Mobile Money Liquidation   
1  23022862  9030018800879  4458089439593  Mobile Money Liquidation   
2  23022862  9030018800879      789734361  Mobile Money Liquidation   
3  23022862  9030018800879      789734361  Mobile Money Liquidation   
4  23022862  9030018800879  3543959460961  Mobile Money Liquidation   

                       Item  Amount   Surcharge  \
0  Airtel Float Liquidation   500000          0   
1     MTN Float Liquidation  200

Step 1: Data Parsing and Type Conversion

In [5]:

df['Date'] = pd.to_datetime(df['Date'])




In [6]:
# Step 2: Handling Missing Values
# In this example, we use different strategies for different columns with missing values.

# Handling missing values in 'Status' column by filling with 'Unknown'
df['Status'].fillna('Unknown', inplace=True)

# Handling missing values in 'Surcharge' column by filling with 0
df['Surcharge'].fillna(0, inplace=True)



In [10]:
df.columns = df.columns.str.strip()

In [11]:
import re
# Step 3: Data Cleaning for 'Amount' column
# Remove non-numeric characters, convert to string, then back to float
df['Amount'] = df['Amount'].astype(str).apply(lambda x: re.sub(r'[^\d.]', '', x)).astype(float)


In [12]:
# Step 4: Feature Engineering (Example: Extracting hour from the Date)
df['Hour'] = df['Date'].dt.hour



In [13]:
# Step 5: Feature Encoding (Example: One-Hot Encoding for categorical columns)
df = pd.get_dummies(df, columns=['Location', 'Branch', 'Status'], drop_first=True)



In [20]:
# Step 6: Data Splitting (Example: 80% for training, 20% for testing)
X = df.drop(['Date', 'Status_Success'], axis=1)  # Features (excluding 'Date' and the target column 'Status_Success')
y = df['Status_Success']  # Target column




In [21]:
# Identify columns with non-numeric values
non_numeric_columns = X.select_dtypes(exclude=[np.number]).columns.tolist()

# One-hot encode categorical columns with sparse encoding
X = pd.get_dummies(X, columns=non_numeric_columns, drop_first=True, sparse=True)

In [22]:
# Step 7: Train the Random Forest Model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)




MemoryError: Unable to allocate 58.8 GiB for an array with shape (150818, 52365) and data type float64

In [None]:
# Step 8: Make Predictions
y_pred = random_forest_model.predict(X_test)

In [None]:
# Step 9: Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)