In [10]:
# Step 1: load The Dataset 

In [12]:
import numpy as np
import pandas as pd

df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


In [13]:
# Step 2: Creat Synthetic Transaction Descriptions

In [14]:
def creat_transaction_description(row):
    return(f"At step {row['step']}, a {row['type']} of ${row['amount']} was made "
           f"from {row['nameOrig']}(Old Balance: $ {row['oldbalanceOrg']}, New balance: $ {row['newbalanceOrig']})"
 f"to {row['nameDest']} (Old Balance: $ {row['oldbalanceDest']}, New balance: $ {row['newbalanceDest']}) .")

# Apply the function to create a new 'description' column
df['description'] = df.apply(creat_transaction_description, axis=1)  

# Display the first few descriptions
print(df[['description']].head(30))   


                                          description
0   At step 1, a PAYMENT of $9839.64 was made from...
1   At step 1, a PAYMENT of $1864.28 was made from...
2   At step 1, a TRANSFER of $181.0 was made from ...
3   At step 1, a CASH_OUT of $181.0 was made from ...
4   At step 1, a PAYMENT of $11668.14 was made fro...
5   At step 1, a PAYMENT of $7817.71 was made from...
6   At step 1, a PAYMENT of $7107.77 was made from...
7   At step 1, a PAYMENT of $7861.64 was made from...
8   At step 1, a PAYMENT of $4024.36 was made from...
9   At step 1, a DEBIT of $5337.77 was made from C...
10  At step 1, a DEBIT of $9644.94 was made from C...
11  At step 1, a PAYMENT of $3099.97 was made from...
12  At step 1, a PAYMENT of $2560.74 was made from...
13  At step 1, a PAYMENT of $11633.76 was made fro...
14  At step 1, a PAYMENT of $4098.78 was made from...
15  At step 1, a CASH_OUT of $229133.94 was made f...
16  At step 1, a PAYMENT of $1563.82 was made from...
17  At step 1, a PAYMENT of 

In [15]:
# Step 3: Preprocessing Text Using NLP Techniques

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Initialize Lemmatizer & Stop Words
lemmatizer = WordNetLemmatizer()  # Create a lemmatizer Object
stop_words = set(stopwords.words('english'))   # Get the List of Stop Words

# Function to Preprocess 
def preprocess_text(text):
     # Step 1: Tokenize
    tokens = word_tokenize(text.lower()) 

    # Step 2: Lemmatzie and remove stop words
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]  

    # Step 3: Join tokens back into a single string
    return ' '.join(tokens)  


# Apply preprocessing to the 'description' column
df['cleaned_description'] = df['description'].apply(preprocess_text)
print(df[['cleaned_description']].head(30))



                                  cleaned_description
0   step 1 payment made c1231006815 old balance ne...
1   step 1 payment made c1666544295 old balance ne...
2   step 1 transfer made c1305486145 old balance n...
3   step 1 made c840083671 old balance new balance...
4   step 1 payment made c2048537720 old balance ne...
5   step 1 payment made c90045638 old balance new ...
6   step 1 payment made c154988899 old balance new...
7   step 1 payment made c1912850431 old balance ne...
8   step 1 payment made c1265012928 old balance ne...
9   step 1 debit made c712410124 old balance new b...
10  step 1 debit made c1900366749 old balance new ...
11  step 1 payment made c249177573 old balance new...
12  step 1 payment made c1648232591 old balance ne...
13  step 1 payment made c1716932897 old balance ne...
14  step 1 payment made c1026483832 old balance ne...
15  step 1 made c905080434 old balance new balance...
16  step 1 payment made c761750706 old balance new...
17  step 1 payment made c123

In [17]:
# Step 4: Feature Extraction 

In [48]:
import pandas as pd

num_iterations = 16
sample_size = 5000

# Initialize a list to store each sample
samples_list = []

# Loop through 16 times to get different random samples
for i in range(num_iterations):
    
    # Randomly sample 5000 rows in each iteration
    df_sample = df.sample(n=sample_size, random_state=i)
    df_sample = df_sample.reset_index(drop=True)
    
    # Store the sampled data
    samples_list.append(df_sample)
    
    # Display the shape of each sample
    print(f"Sample {i+1} shape: {df_sample.shape}")

# After this, 'samples_list' contains 16 separate DataFrames each with 5000 randomly selected rows.

from sklearn.feature_extraction.text import TfidfVectorizer

#Intialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)   

# Loop through each sample in the list and apply TF-IDF
for i, sample in enumerate(samples_list):
    
    # Ensure that 'cleaned_description' column exists in the sample
    if 'cleaned_description' in sample.columns:
        
        # Fit and transform the 'cleaned_description' column using TF-IDF
        X_tfidf = vectorizer.fit_transform(sample['cleaned_description'])
        
        # Convert the sparse matrix to a dense array
        X_tfidf_array = X_tfidf.toarray()
        
        # Display the shape of the TF-IDF matrix for each sample
        print(f"Sample {i+1} TF-IDF shape: {X_tfidf_array.shape}")
    else:
        print(f"Sample {i+1} does not contain 'cleaned_description' column.")  


Sample 1 shape: (5000, 13)
Sample 2 shape: (5000, 13)
Sample 3 shape: (5000, 13)
Sample 4 shape: (5000, 13)
Sample 5 shape: (5000, 13)
Sample 6 shape: (5000, 13)
Sample 7 shape: (5000, 13)
Sample 8 shape: (5000, 13)
Sample 9 shape: (5000, 13)
Sample 10 shape: (5000, 13)
Sample 11 shape: (5000, 13)
Sample 12 shape: (5000, 13)
Sample 13 shape: (5000, 13)
Sample 14 shape: (5000, 13)
Sample 15 shape: (5000, 13)
Sample 16 shape: (5000, 13)
Sample 1 TF-IDF shape: (5000, 1000)
Sample 2 TF-IDF shape: (5000, 1000)
Sample 3 TF-IDF shape: (5000, 1000)
Sample 4 TF-IDF shape: (5000, 1000)
Sample 5 TF-IDF shape: (5000, 1000)
Sample 6 TF-IDF shape: (5000, 1000)
Sample 7 TF-IDF shape: (5000, 1000)
Sample 8 TF-IDF shape: (5000, 1000)
Sample 9 TF-IDF shape: (5000, 1000)
Sample 10 TF-IDF shape: (5000, 1000)
Sample 11 TF-IDF shape: (5000, 1000)
Sample 12 TF-IDF shape: (5000, 1000)
Sample 13 TF-IDF shape: (5000, 1000)
Sample 14 TF-IDF shape: (5000, 1000)
Sample 15 TF-IDF shape: (5000, 1000)
Sample 16 TF-ID

In [49]:
# Step 5: Combining Text Features with Numerical Data

In [72]:
import numpy as mp 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Convert the accuracy of the model to speech
import pyttsx3
import numpy as np

def text_to_speech(text):
    engine = pyttsx3.init()   # Initialize the pyttsx3
    engine.say(text)
    engine.runAndWait()   # Ensure the speech is processed before moving on to the next iteration
    engine.setProperty('rate', 150)  # Speed of speech
    engine.setProperty('volume', 1)  # Volume level (0.0 to 1.0)

# Use 'samples_list' which is a list of DataFrames, each containing the 'cleaned_description' and numerical features.
for i, df_sample in enumerate(samples_list):
    
    #Select Numerical features
    numerical_features = df_sample[['amount','oldbalanceOrg','newbalanceOrig','oldbalanceDest','newbalanceDest' ]].values
    
    #Combine TF-IDF features with numerical features
    X_combined = np.hstack((X_tfidf.toarray(), numerical_features))
    
    # Define the target variable: "isFraud"
    y = df_sample['isFraud']
    
    # Split data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size = 0.2, random_state = 42)

    #Train a Random Forest model
    model = RandomForestClassifier()
    model.fit(X_train, y_train)

    # Evaluation
    accuracy = model.score(X_test, y_test)
    print(f"Sample {i+1} Model Accuracy: {accuracy}")
    
    # Print and convert accuracy to speech
    text_to_speech(f"Sample {i+1} Its Model Accuracy is {accuracy}")

Sample 1 Model Accuracy: 0.999
Sample 2 Model Accuracy: 0.998
Sample 3 Model Accuracy: 1.0
Sample 4 Model Accuracy: 0.999
Sample 5 Model Accuracy: 1.0
Sample 6 Model Accuracy: 1.0
Sample 7 Model Accuracy: 0.999
Sample 8 Model Accuracy: 0.999
Sample 9 Model Accuracy: 1.0
Sample 10 Model Accuracy: 1.0
Sample 11 Model Accuracy: 0.998
Sample 12 Model Accuracy: 1.0
Sample 13 Model Accuracy: 0.998
Sample 14 Model Accuracy: 0.999
Sample 15 Model Accuracy: 0.999
Sample 16 Model Accuracy: 0.998
