In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# word_tokenize needs the Punkt tokenizer data. Newer NLTK releases look it up
# under 'punkt_tab' rather than the older 'punkt' package, so request both.
# NOTE(review): on an NLTK version that does not know 'punkt_tab' the extra
# request is expected to report an error and return False rather than raise --
# confirm against the installed version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Sentiment classification of restaurant reviews: TF-IDF features + logistic regression.

# Load the dataset from CSV. The code below reads the 'Review' column (text)
# and the 'Liked' column (label).
df = pd.read_csv('Restaurant_Reviews.csv')

# Tokenization: lowercase each review and split it into NLTK word tokens.
df['tokens'] = df['Review'].apply(lambda text: word_tokenize(text.lower()))

# TfidfVectorizer expects one string per document, so re-join each token list.
corpus = [' '.join(tokens) for tokens in df['tokens']]

# Define labels (positive/negative sentiment)
labels = df['Liked'].values

# Split the raw text BEFORE vectorizing: the vocabulary and IDF weights must be
# learned from the training reviews only, otherwise statistics of the test
# reviews leak into the features and the reported scores are optimistic.
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, labels, test_size=0.2, random_state=50
)

# TF-IDF: fit on the training split, then apply the fitted transform to the test split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

# Full-corpus matrix, kept under its original name in case a later cell reads X.
# It is produced by the train-fitted vectorizer and is not used for training here.
X = vectorizer.transform(corpus)

# Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the held-out reviews
predictions = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, predictions))


Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.78      0.78      0.78       105
           1       0.76      0.76      0.76        95

    accuracy                           0.77       200
   macro avg       0.77      0.77      0.77       200
weighted avg       0.77      0.77      0.77       200



In [None]:
# Proxy "fraud" classifier on credit-card transactions.
#
# NOTE(review): neither input file is read for a ground-truth fraud column in
# this cell; 'fraud_label' below is a proxy obtained by thresholding the scaled
# transaction amount. The metrics therefore describe how well the model predicts
# "unusually large amount", not real fraud.

# Load the datasets
transactions = pd.read_csv('transactions.csv')
cc_info = pd.read_csv('cc_info.csv')

# Left-merge the card details onto every transaction, keyed on 'credit_card'
merged_data = pd.merge(transactions, cc_info, on='credit_card', how='left')

# Feature engineering: hour of day parsed from the 'date' column
merged_data['hour_of_day'] = pd.to_datetime(merged_data['date']).dt.hour

# Single source of truth for the numeric columns that get scaled.
feature_cols = ['transaction_dollar_amount', 'Long', 'Lat', 'credit_card_limit', 'hour_of_day']
# Column the proxy label is derived from; it must not also be a predictor.
label_source = 'transaction_dollar_amount'

# Scaling features.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows contribute to the mean/std used for scaling.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(merged_data[feature_cols])

# Reuse merged_data's index so the axis=1 concat below aligns row-for-row.
scaled_data = pd.DataFrame(scaled_features, columns=feature_cols, index=merged_data.index)

# Concatenate the scaled features with the identifying columns
processed_data = pd.concat([merged_data[['credit_card', 'date']], scaled_data], axis=1)

# Threshold on the *scaled* amount, i.e. in units of standard deviations above
# the mean transaction amount. Adjust this based on your data.
threshold = 0.5

# Proxy binary label: 1 when the scaled amount exceeds the threshold, else 0.
processed_data['fraud_label'] = (processed_data[label_source] > threshold).astype(int)

# The label is a deterministic function of `label_source`, so leaving that
# column among the predictors lets the model simply re-learn the threshold and
# report near-perfect but meaningless scores. Exclude it.
predictor_cols = [col for col in feature_cols if col != label_source]
X = processed_data[predictor_cols]
y = processed_data['fraud_label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9994568722631454
Confusion Matrix:
 [[52394     0]
 [   32  6492]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     52394
           1       1.00      1.00      1.00      6524

    accuracy                           1.00     58918
   macro avg       1.00      1.00      1.00     58918
weighted avg       1.00      1.00      1.00     58918

