In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('../artifacts/user_data x 100000.csv')

In [3]:
data.head()

Unnamed: 0,User ID,Account Creation Date,Account Creation Method,Account Age (Days),Profile Picture,Profile Information Complete,Name Consistency,Login Frequency (per week),Login Time Pattern,IP Address Variability,...,Post Frequency (per week),Content Type,Email Validation,Phone Number Validation,Reported by Users,Flags by Content Moderation,Geolocation Consistency,Suspicious Device Use,Cross-Platform Match,Final Result
0,cce31e7d-6b28-44f4-b584-12b7cd00cb96,2024-04-04,Social Media Login,16,No,Yes,Inconsistent,2,Regular,Low,...,1,Mixed (Image/Text),Invalid,Valid,No,Yes,Consistent,Yes,Matched on Facebook,Genuine
1,7533f808-0db0-46cf-b122-07383c397da9,2024-01-13,Social Media Login,22,No,No,Consistent,3,Regular,Low,...,7,Mixed (Image/Text),Invalid,Valid,Yes,Yes,Inconsistent,No,Matched on Facebook,Fake
2,747c6dac-fa18-4d47-9ba3-52e16d4721ff,2024-08-22,Social Media Login,26,Yes,Yes,Inconsistent,4,Regular,High,...,17,Image/Video,Invalid,Invalid,No,Yes,Consistent,Yes,Matched on Instagram,Fake
3,7353b2e2-0371-4f70-9937-6e2b6c39a768,2024-07-02,Phone Number,11,No,No,Inconsistent,4,Regular,Low,...,14,Image/Video,Valid,Invalid,Yes,No,Inconsistent,No,Matched on Instagram,Genuine
4,1542d84b-e317-4d55-bf57-dbed392718a7,2024-08-06,Phone Number,2,No,No,Consistent,1,Regular,Low,...,7,Text-only,Valid,Valid,No,No,Consistent,Yes,Not matched,Genuine


In [4]:
### data preprocessing 
data.shape


(100000, 24)

In [5]:
def convert_date_to_binary(date_str):
    # Remove both '/' and '-' characters from the date string
    cleaned_date = date_str.replace('/', '').replace('-', '')
    
    # Convert each digit to its binary representation (4 digits each)
    binary_value = ''.join(format(int(digit), '04b') for digit in cleaned_date)
    
    return binary_value

data["Account Creation Date"] = data["Account Creation Date"].apply(convert_date_to_binary)


In [6]:
def assign_value_based_on_type(value):
    if value == 'Social Media Login':
        return format(0, '04b')  # Convert 0 to a 4-bit binary string
    elif value == 'Email':
        return format(1, '04b')  # Convert 1 to a 4-bit binary string
    elif value == 'Phone Number':
        return format(2, '04b')  # Convert 3 to a 4-bit binary string
    else:
        return None  # Return None if the value doesn't match any known type


data["Account Creation Method"] = data["Account Creation Method"].apply(assign_value_based_on_type)


In [7]:
def number_to_fixed_length_binary(num):
    # Convert number to string and pad with leading zeros to ensure 5-digit length
    num_str = str(num).zfill(5)  # Ensures it has exactly 5 digits
    
    # Convert each digit to a 4-bit binary representation
    binary_digits = [format(int(digit), '04b') for digit in num_str]
    
    return ''.join(binary_digits)  # Return as a continuous binary string

data["Account Age (Days)"] = data["Account Age (Days)"].apply(number_to_fixed_length_binary)


In [8]:
def profile_pic_conv(value):
    if value == "Yes":
        return 1
    elif value == "No":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
data["Profile Picture"] = data["Profile Picture"].apply(profile_pic_conv)



In [9]:
def profile_info_compl(value):
    if value == "Yes":
        return 1
    elif value == "No":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
data["Profile Information Complete"] = data["Profile Information Complete"].apply(profile_pic_conv)

In [10]:
def Name_Consistency_con (value):
    if value == "Consistent":
        return 1
    elif value == "Inconsistent":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
data["Name Consistency"] = data["Name Consistency"].apply(Name_Consistency_con)

In [11]:
def Login_Frequency (value):
   
    if isinstance(value, int) and 0 <= value <= 9:
        return format(value, '04b')  # Convert to 4-bit binary
    else:
        return "Invalid input"  # Handle incorrect values


data["Login Frequency (per week)"] = data["Login Frequency (per week)"].apply(Login_Frequency)

In [12]:
def Login_Time_Pattern (value):
    if value == "Regular":
        return 1
    elif value == "Irregular":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
data["Login Time Pattern"] = data["Login Time Pattern"].apply(Login_Time_Pattern)

In [13]:
def IP_Address_Variability(value):
    if value == 'Low':
        return format(0, '04b')  # Convert 0 to a 4-bit binary string
    elif value == 'Medium':
        return format(1, '04b')  # Convert 1 to a 4-bit binary string
    elif value == 'High':
        return format(2, '04b')  # Convert 3 to a 4-bit binary string
    else:
        return None  # Return None if the value doesn't match any known type


data["IP Address Variability"] = data["IP Address Variability"].apply(IP_Address_Variability)

In [14]:
def Followers_Count(num):
    # Convert number to string and pad with leading zeros to ensure 5-digit length
    num_str = str(num).zfill(6)  # Ensures it has exactly 6 digits
    
    # Convert each digit to a 4-bit binary representation
    binary_digits = [format(int(digit), '04b') for digit in num_str]
    
    return ''.join(binary_digits)  # Return as a continuous binary string

data["Followers Count"] = data["Followers Count"].apply(Followers_Count)


In [15]:
def Following_Count(num):
    # Convert number to string and pad with leading zeros to ensure 5-digit length
    num_str = str(num).zfill(6)  # Ensures it has exactly 6 digits
    
    # Convert each digit to a 4-bit binary representation
    binary_digits = [format(int(digit), '04b') for digit in num_str]
    
    return ''.join(binary_digits)  # Return as a continuous binary string

data["Following Count"] = data["Following Count"].apply(Following_Count)

In [16]:
def Engagement_Rate(value):
    if value == 'Low engagement (rare likes)':
        return format(0, '04b')  # Convert 0 to a 4-bit binary string
    elif value == 'Moderate engagement':
        return format(1, '04b')  # Convert 1 to a 4-bit binary string
    elif value == 'High engagement (likes/comments)':
        return format(2, '04b')  # Convert 3 to a 4-bit binary string
    else:
        return None  # Return None if the value doesn't match any known type


data["Engagement Rate (likes/comments)"] = data["Engagement Rate (likes/comments)"].apply(Engagement_Rate)

In [17]:
def Post_Frequency (num):
    # Convert number to string and pad with leading zeros to ensure 5-digit length
    num_str = str(num).zfill(3)  # Ensures it has exactly 6 digits
    
    # Convert each digit to a 4-bit binary representation
    binary_digits = [format(int(digit), '04b') for digit in num_str]
    
    return ''.join(binary_digits)  # Return as a continuous binary string

data["Post Frequency (per week)"] = data["Post Frequency (per week)"].apply(Post_Frequency)

In [18]:
def Content_Type(value):
    if value == 'Text-only':
        return format(0, '04b')  # Convert 0 to a 4-bit binary string
    elif value == 'Image/Video':
        return format(1, '04b')  # Convert 1 to a 4-bit binary string
    elif value == 'Mixed (Image/Text)':
        return format(2, '04b')  # Convert 3 to a 4-bit binary string
    else:
        return None  # Return None if the value doesn't match any known type


data["Content Type"] = data["Content Type"].apply(Content_Type)

In [19]:
def valid_or_not_function (value):
    if value == "Valid":
        return 1
    elif value == "Invalid":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
# email validation        
data["Email Validation"] = data["Email Validation"].apply(valid_or_not_function)
# phone number validation
data["Phone Number Validation"] = data["Phone Number Validation"].apply(valid_or_not_function)

In [20]:
def yes_or_not_function (value):
    if value == "Yes":
        return 1
    elif value == "No":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
# Reported by Users        
data["Reported by Users"] = data["Reported by Users"].apply(yes_or_not_function)
# Flags by Content Moderation
data["Flags by Content Moderation"] = data["Flags by Content Moderation"].apply(yes_or_not_function)
# Suspicious Device Use
data["Suspicious Device Use"] = data["Suspicious Device Use"].apply(yes_or_not_function)

In [21]:
def Geolocation_Consistency (value):
    if value == "Consistent":
        return 1
    elif value == "Inconsistent":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
# Reported by Users        
data["Geolocation Consistency"] = data["Geolocation Consistency"].apply(Geolocation_Consistency)

In [22]:
def Final_Result (value):
    if value == "Genuine":
        return 1
    elif value == "Fake":
        return 0
    else:
        return None  # Corrected 'none' to 'None'
# Reported by Users        
data["Final Result"] = data["Final Result"].apply(Final_Result)

In [23]:
# Count occurrences of each category in "Final Result" column
category_counts = data["Final Result"].value_counts()

# Display the result
print(category_counts)


Final Result
1    69803
0    30197
Name: count, dtype: int64


In [24]:
### devided data set 

In [25]:
!pip install imblearn 



In [26]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE  # Import SMOTE for data balancing
from sklearn.preprocessing import LabelEncoder

# Load dataset (uncomment if using CSV file)
# data = pd.read_csv('your_dataset.csv')

# Handle invalid inputs
data.replace('Invalid input', np.nan, inplace=True)
data.dropna(inplace=True)

# Convert categorical features to numerical
for col in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

# Define features (X) and target (y)
X = data.drop(columns=['Final Result', 'User ID', 'Cross-Platform Match', 'Follow/Follow Ratio'])
y = data['Final Result']

# Print initial data distribution
print("Original Class Distribution:\n", y.value_counts())

# Balance dataset using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Print balanced data distribution
print("\nBalanced Class Distribution:\n", y_resampled.value_counts())

# Split the balanced dataset into training (70%) and testing (30%) sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Print the shapes of the splits to verify
print("\nShape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_train = rf_model.predict(X_train)
y_pred_test = rf_model.predict(X_test)

# Evaluate model performance
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

# Print evaluation metrics
print("\nTraining Accuracy: {:.4f}".format(train_accuracy))
print("Test Accuracy: {:.4f}".format(test_accuracy))


Original Class Distribution:
 Final Result
1    69803
0    30197
Name: count, dtype: int64

Balanced Class Distribution:
 Final Result
1    69803
0    69803
Name: count, dtype: int64

Shape of X_train: (97724, 20)
Shape of X_test: (41882, 20)
Shape of y_train: (97724,)
Shape of y_test: (41882,)

Training Accuracy: 1.0000
Test Accuracy: 0.7044


In [27]:
### build a model 
X_train

Unnamed: 0,Account Creation Date,Account Creation Method,Account Age (Days),Profile Picture,Profile Information Complete,Name Consistency,Login Frequency (per week),Login Time Pattern,IP Address Variability,Followers Count,Following Count,Engagement Rate (likes/comments),Post Frequency (per week),Content Type,Email Validation,Phone Number Validation,Reported by Users,Flags by Content Moderation,Geolocation Consistency,Suspicious Device Use
92763,57,1,29,1,0,0,3,1,1,3959,2778,2,7,2,0,0,1,0,1,1
16332,58,2,9,0,1,1,5,1,1,2149,901,1,0,0,0,0,0,1,1,0
126704,92,1,10,1,0,0,3,0,0,2274,2579,1,13,0,0,0,0,1,0,0
36001,242,0,15,1,0,1,0,0,1,4154,3450,2,6,0,0,0,1,0,1,1
84952,88,0,27,1,1,0,4,1,1,3816,3644,0,15,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,217,0,17,1,1,0,5,1,0,1607,1643,1,7,1,1,1,0,0,1,0
119879,144,1,17,0,0,0,2,0,1,4249,3679,0,7,1,0,0,0,1,1,1
103694,42,1,13,0,0,0,5,0,0,1393,1016,0,8,0,0,0,1,0,1,1
131932,95,1,19,0,1,0,1,0,0,4590,3931,2,16,2,0,0,0,0,0,0


In [28]:
### ML Models 

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming 'data' is your dataset
# Sample data preparation (replace this with actual dataset loading)
# data = pd.read_csv("your_data.csv")  # Uncomment and load your dataset here

# Define your target variable (y) and features (X)
X = data.drop(columns=['Final Result', 'User ID', 'Cross-Platform Match', 'Follow/Follow Ratio'])
y = data['Final Result']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the function to calculate and print training scores
def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Define the function to calculate and print validation (testing) scores
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf_train = rf_model.predict(X_train)  # Predictions on training data
y_pred_rf = rf_model.predict(X_test)  # Predictions on test data

# Calculate and print training and validation scores
print("Random Forest Model:")
training_scores(y_train, y_pred_rf_train)  # Evaluate on training data
validation_scores(y_test, y_pred_rf)  # Evaluate on test data

# If you want to print just accuracy
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Test Accuracy: {rf_accuracy:.4f}")


Random Forest Model:
Training Scores:
	Accuracy = 1.0
	Precision = 1.0
	Recall = 1.0
	F1-Score = 1.0
Testing Scores:
	Accuracy = 0.697
	Precision = 0.698
	Recall = 0.998
	F1-Score = 0.822
Random Forest Test Accuracy: 0.6974


In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pandas as pd

# Assuming 'data' is your dataset
# Sample data preparation (replace this with actual dataset loading)
# data = pd.read_csv("your_data.csv")  # Uncomment and load your dataset here

# Define your target variable (y) and features (X)
X = data.drop(columns=['Final Result', 'User ID', 'Cross-Platform Match', 'Follow/Follow Ratio'])
y = data['Final Result']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the function to calculate and print training scores
def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Define the function to calculate and print validation (testing) scores
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Train the model
logreg_model.fit(X_train, y_train)

# Make predictions
y_pred_logreg_train = logreg_model.predict(X_train)  # Predictions on training data
y_pred_logreg = logreg_model.predict(X_test)  # Predictions on test data

# Calculate and print training and validation scores
print("Logistic Regression Model:")
training_scores(y_train, y_pred_logreg_train)  # Evaluate on training data
validation_scores(y_test, y_pred_logreg)  # Evaluate on test data

# If you want to print just accuracy
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Test Accuracy: {logreg_accuracy:.4f}")


Logistic Regression Model:
Training Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Testing Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Logistic Regression Test Accuracy: 0.6980


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
### opt

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd

# Assuming 'data' is your dataset
# data = pd.read_csv("your_data.csv")  # Uncomment and load your dataset here

# Define your target variable (y) and features (X)
X = data.drop(columns=['Final Result', 'User ID', 'Cross-Platform Match', 'Follow/Follow Ratio'])
y = data['Final Result']

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the function to calculate and print training scores
def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Define the function to calculate and print validation (testing) scores
def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],  # Regularization strength
    'solver': ['liblinear', 'lbfgs'],  # Solvers to try
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(random_state=42)

# Apply GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(estimator=logreg_model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from the GridSearchCV
best_params = grid_search.best_params_
print(f"Best Hyperparameters from GridSearchCV: {best_params}")

# Train the best model
best_logreg_model = grid_search.best_estimator_

# Make predictions
y_pred_logreg_train = best_logreg_model.predict(X_train)  # Predictions on training data
y_pred_logreg = best_logreg_model.predict(X_test)  # Predictions on test data

# Calculate and print training and validation scores
print("Logistic Regression Model (Tuned):")
training_scores(y_train, y_pred_logreg_train)  # Evaluate on training data
validation_scores(y_test, y_pred_logreg)  # Evaluate on test data

# If you want to print just accuracy
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Test Accuracy (Tuned): {logreg_accuracy:.4f}")


Best Hyperparameters from GridSearchCV: {'C': 0.01, 'max_iter': 100, 'solver': 'liblinear'}
Logistic Regression Model (Tuned):
Training Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Testing Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Logistic Regression Test Accuracy (Tuned): 0.6980


In [33]:
import pickle
with open('../static/model/modelLR123.pickle', 'wb') as file:
    pickle.dump(logreg_model, file)

In [34]:
# my model2

In [35]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

def validation_scores(y_act, y_pred):
   acc = round(accuracy_score(y_act, y_pred), 3)
   pr = round(precision_score(y_act, y_pred), 3)
   rec = round(recall_score(y_act, y_pred), 3)
   f1 = round(f1_score(y_act, y_pred), 3)
   print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Scor= {f1}')

In [36]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Standardize feature set
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Logistic Regression with specified parameters
lr = LogisticRegression(max_iter=1000, random_state=42, solver='saga')
lr.fit(X_train_scaled, y_train)

# Predictions
y_train_pred = lr.predict(X_train_scaled)
y_test_pred = lr.predict(X_test_scaled)

# Evaluation functions with full metrics
def training_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Training Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

def validation_scores(y_act, y_pred):
    acc = round(accuracy_score(y_act, y_pred), 3)
    pr = round(precision_score(y_act, y_pred), 3)
    rec = round(recall_score(y_act, y_pred), 3)
    f1 = round(f1_score(y_act, y_pred), 3)
    print(f'Testing Scores:\n\tAccuracy = {acc}\n\tPrecision = {pr}\n\tRecall = {rec}\n\tF1-Score = {f1}')

# Evaluate
training_scores(y_train, y_train_pred)
validation_scores(y_test, y_test_pred)


Training Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Testing Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822


In [37]:
import pickle
with open('../static/model/model123.pickle', 'wb') as file:
    pickle.dump(lr, file)

In [38]:
### Deep leasring models 

In [39]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Build the MLP model
mlp_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=100, solver='adam', random_state=42)
mlp_model.fit(X_train, y_train)

# Make predictions
y_train_pred = mlp_model.predict(X_train)
y_test_pred = mlp_model.predict(X_test)

# Evaluate on training data
training_scores(y_train, y_train_pred)

# Evaluate on testing data
validation_scores(y_test, y_test_pred)


Training Scores:
	Accuracy = 0.422
	Precision = 0.702
	Recall = 0.298
	F1-Score = 0.419
Testing Scores:
	Accuracy = 0.419
	Precision = 0.696
	Recall = 0.296
	F1-Score = 0.416


In [40]:
from sklearn.neural_network import MLPClassifier

# DBN can be implemented using MLP (multi-layer perceptron with multiple hidden layers)
dbn_model = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=100, solver='adam', random_state=42)
dbn_model.fit(X_train, y_train)

# Make predictions
y_train_pred = dbn_model.predict(X_train)
y_test_pred = dbn_model.predict(X_test)

# Evaluate on training data
training_scores(y_train, y_train_pred)

# Evaluate on testing data
validation_scores(y_test, y_test_pred)




Training Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822
Testing Scores:
	Accuracy = 0.698
	Precision = 0.698
	Recall = 1.0
	F1-Score = 0.822


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# Build the CNN model
cnn_model = Sequential()
cnn_model.add(Conv1D(64, 2, activation='relu', input_shape=(X_train.shape[1], 1)))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(32, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

# Reshape data for CNN (adding a single feature channel)
X_train_reshaped = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test_reshaped = X_test.values.reshape(-1, X_test.shape[1], 1)

# Train the model
cnn_model.fit(X_train_reshaped, y_train, epochs=10, batch_size=32, validation_data=(X_test_reshaped, y_test))

# Make predictions
y_train_pred = (cnn_model.predict(X_train_reshaped) > 0.5).astype(int)
y_test_pred = (cnn_model.predict(X_test_reshaped) > 0.5).astype(int)

# Evaluate on training data
training_scores(y_train, y_train_pred)

# Evaluate on testing data
validation_scores(y_test, y_test_pred)


ModuleNotFoundError: No module named 'tensorflow'