In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
# Path to the CSV file holding the labelled messages.
path_file_csv = '/content/dataset fraud v5.csv'

# Read the whole file into a DataFrame; later cells use its 'message' and
# 'label' columns.
fraud_df = pd.read_csv(filepath_or_buffer=path_file_csv)

In [41]:
# Hold out 20% of the rows for evaluation. The fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    fraud_df['message'],
    fraud_df['label'],
    test_size=0.2,
    random_state=42,
)

In [42]:

# Count the rows of each class by summing a boolean mask over the label column.
count_label_0 = int((fraud_df['label'] == 0).sum())
count_label_1 = int((fraud_df['label'] == 1).sum())

# Overall shape and column names, followed by the per-class totals.
print(f"Dataset size: {fraud_df.shape!s}")
print(f"Columns are: {fraud_df.columns!s}")
print(f"Jumlah pesan dengan label 0: {count_label_0}")
print(f"Jumlah pesan dengan label 1: {count_label_1}")

Dataset size: (1437, 2)
Columns are: Index(['message', 'label'], dtype='object')
Jumlah pesan dengan label 0: 822
Jumlah pesan dengan label 1: 615


In [43]:
# Convert the text data to numeric features before applying SMOTE (see below).
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Learn the vocabulary from the training messages only and encode them in the
# same pass; the test messages are encoded with that fitted vocabulary.
# NOTE: X_train / X_test are rebound from text Series to dense count arrays
# here, so re-running this cell without re-running the split cell fails
# (an ndarray has no `.values`).
X_train = vectorizer.fit_transform(X_train.values.ravel()).toarray()
X_test = vectorizer.transform(X_test.values.ravel()).toarray()

In [44]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class on the training split only. SMOTE draws its
# synthetic samples at random, so the seed is fixed to keep the resampled data
# (and every metric computed from it) repeatable across runs.
X_resample, y_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [45]:
# Row counts of the encoded train and test matrices.
print(f"Number of samples in X_train: {X_train.shape[0]}")
print(f"Number of samples in X_test: {X_test.shape[0]}")

Number of samples in X_train: 1149
Number of samples in X_test: 288


In [46]:
import collections
from collections import Counter
# Class frequencies of the training labels before and after oversampling.
print(f"before balancing :- {Counter(y_train)!s}")
print(f"after balancing :- {Counter(y_resampled)!s}")

before balancing :- Counter({0: 655, 1: 494})
after balancing :- Counter({0: 655, 1: 655})


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# X_resample / y_resampled are the oversampled training features and labels.
# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_resample, y_resampled)

# Class predictions for the held-out test split.
y_pred = model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       167
           1       0.93      0.93      0.93       121

    accuracy                           0.94       288
   macro avg       0.94      0.94      0.94       288
weighted avg       0.94      0.94      0.94       288



In [48]:
# One hand-written message to classify with the current `model`.
new_text = ["saya dari tim Baim Wong, memberitahukan bahwa anda mendapat hadiah 80JT"]

# Encode it with the vocabulary learned from the training split.
new_text_vectorized = vectorizer.transform(new_text).toarray()

predictions = model.predict(new_text_vectorized)
print(f"Predictions: {predictions!s}")

Predictions: [1]


In [36]:
# Import necessary libraries
import numpy as np

# Describe the oversampled feature matrix: dimensions, container type,
# element dtype and its first row (assumes a NumPy array or similar).
print(
    f"X_resample shape: {X_resample.shape!s}",
    f"X_resample data type: {type(X_resample)!s}",
    f"X_resample data type of elements: {X_resample.dtype!s}",
    f"Example data in X_resample: {X_resample[0]!s}",
    sep="\n",
)

# Same four facts for the oversampled labels, separated by a blank line.
print(
    f"\ny_resampled shape: {y_resampled.shape!s}",
    f"y_resampled data type: {type(y_resampled)!s}",
    f"y_resampled data type of elements: {y_resampled.dtype!s}",
    f"Example data in y_resampled: {y_resampled[0]!s}",
    sep="\n",
)

X_resample shape: (1310, 5657)
X_resample data type: <class 'numpy.ndarray'>
X_resample data type of elements: int64
Example data in X_resample: [0 0 0 ... 0 0 0]

y_resampled shape: (1310,)
y_resampled data type: <class 'pandas.core.series.Series'>
y_resampled data type of elements: int64
Example data in y_resampled: 0


In [37]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.metrics import classification_report

# Assuming X_resample and y_resampled are the oversampled features and labels.

# Seed Python, NumPy and TensorFlow so that weight initialisation, dropout
# masks and batch shuffling are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

# Convert data types to float32 for TensorFlow
X_resample = X_resample.astype('float32')
y_resampled = y_resampled.astype('float32')
X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

# Build a simple neural network: one hidden ReLU layer, dropout, and a single
# sigmoid unit that outputs the probability of label 1.
# NOTE: this rebinds `model`, the same name the logistic-regression cell uses,
# so later cells see whichever of the two was trained most recently.
model = Sequential()
model.add(Dense(64, input_dim=X_resample.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE: the test split doubles as validation data here, so the per-epoch
# validation metrics are not independent of the final test-set report.
model.fit(X_resample, y_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f2d828e91b0>

In [38]:
# Sigmoid outputs for the test split, binarised at 0.5 to obtain class labels.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype('float32')

# Per-class precision / recall / F1 on the test split.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96       167
         1.0       0.94      0.94      0.94       121

    accuracy                           0.95       288
   macro avg       0.95      0.95      0.95       288
weighted avg       0.95      0.95      0.95       288



In [39]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                362112    
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 362177 (1.38 MB)
Trainable params: 362177 (1.38 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [49]:
new_text = ["saya dari tim Baim Wong, memberitahukan bahwa anda mendapat hadiah 80JT"]
new_text_vectorized = vectorizer.transform(new_text).toarray()

# Make predictions (probabilities).
# `model` may be either the logistic regression or the Keras network,
# depending on which training cell ran last. Only the former has
# predict_proba; for the latter, predict() returns the sigmoid output
# P(label == 1) as a single column, so the complementary column is added to
# keep the [P(label 0), P(label 1)] layout either way.
if hasattr(model, "predict_proba"):
    predicted_probabilities = model.predict_proba(new_text_vectorized)
else:
    positive_probability = np.asarray(
        model.predict(new_text_vectorized.astype('float32'))
    ).reshape(-1, 1)
    predicted_probabilities = np.hstack([1.0 - positive_probability, positive_probability])

print("Predicted Probabilities:", predicted_probabilities)

Predicted Probabilities: [[0.06611865 0.93388135]]


In [None]:
# Keep only the second column, i.e. the probability assigned to class 1.
predicted_probabilities_class_1 = predicted_probabilities[:, 1]
print(f"Predicted Probabilities for Class 1: {predicted_probabilities_class_1!s}")


In [None]:
pip install easyocr

In [53]:
import easyocr
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [54]:
# Readers already built in this session, keyed by their language tuple.
_OCR_READER_CACHE = {}


def ocr_image(image_path, languages=('id',), reader=None):
    """Extract the text found in an image and return it as a single string.

    Args:
        image_path: Image to read; passed unchanged to the reader's
            ``readtext`` method.
        languages: Language codes used when a reader has to be created.
            Defaults to ``('id',)``, the value that was previously hard-coded.
        reader: Optional ready-made object exposing ``readtext(image_path)``.
            When given, it is used as-is and no reader is created or cached.

    Returns:
        The recognised text fragments joined with single spaces, or an empty
        string when nothing is detected.
    """
    if reader is None:
        cache_key = tuple(languages)
        if cache_key not in _OCR_READER_CACHE:
            # Constructing a Reader is the expensive step, so build one per
            # language set and reuse it on subsequent calls.
            _OCR_READER_CACHE[cache_key] = easyocr.Reader(list(cache_key))
        reader = _OCR_READER_CACHE[cache_key]

    result = reader.readtext(image_path)
    # The second field of every entry is the recognised string.
    return ' '.join(entry[1] for entry in result)


In [60]:
def preprocess_text(text, vectorizer):
    """Encode one text with an already-fitted vectorizer.

    Args:
        text: The text to encode.
        vectorizer: A fitted object exposing ``transform``; it is called with
            a one-element list containing ``text``.

    Returns:
        Whatever ``toarray()`` yields for the transformed one-element batch.
    """
    single_document_batch = [text]
    encoded = vectorizer.transform(single_document_batch)
    return encoded.toarray()

# Example usage: run OCR on a sample image, then encode the recognised text
# with the vectorizer fitted on the training messages.
image_path = '/content/abd test.jpg'
image_text = ocr_image(image_path=image_path)
preprocessed_text = preprocess_text(text=image_text, vectorizer=vectorizer)



In [61]:
# Reuse the OCR result already stored in `image_text` for this image instead
# of running the OCR pipeline on the same file a second time.
text = image_text

# Print the text extracted from the image
print("Detected Text:")
print(text)



Detected Text:
Jaga akun sosmed, perbankan & data penting lainnya dari SCAMMING! Pakai IMSecure; BEBAS akses 90 hari. Cek di: bit lyZ imsecure-ret


In [62]:
# Make predictions.
# `model` may be the logistic regression (predict() returns 0/1 labels) or the
# Keras network (predict() returns sigmoid probabilities). Thresholding at 0.5
# yields a class label in both cases: 0/1 labels pass through unchanged.
raw_output = np.asarray(model.predict(preprocessed_text.astype('float32')))
predictions = (raw_output.ravel() > 0.5).astype(int)
print("Predictions:", predictions)

Predictions: [0]
