### ML

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder # Import OneHotEncoder
import pandas as pd
import joblib


# Train, persist and evaluate a Logistic Regression classifier that predicts
# the "Online Safety" column of `df` from every other column.

# Define features and target
X = df.drop(columns=["Online Safety"])
y = df["Online Safety"]

# Integer-encode the target only when it holds string labels.
# The fitted encoder gets its own name so the label <-> integer mapping is not
# overwritten by the feature encoder created further down.
if pd.api.types.is_string_dtype(y):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Separate numerical and categorical features
numerical_features = X_train.select_dtypes(include=['number']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Scale numerical features; statistics are learned from the training rows only
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numerical_features])
X_test_num = scaler.transform(X_test[numerical_features])

# One-hot encode categorical features. sparse_output=False returns a dense array;
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat = encoder.fit_transform(X_train[categorical_features])
X_test_cat = encoder.transform(X_test[categorical_features])
encoded_columns = encoder.get_feature_names_out(categorical_features)

# Combine scaled numerical and encoded categorical features, keeping the
# original row index so the frames stay aligned with y_train / y_test.
X_train = pd.concat([pd.DataFrame(X_train_num, index=X_train.index, columns=numerical_features),
                     pd.DataFrame(X_train_cat, index=X_train.index, columns=encoded_columns)], axis=1)
X_test = pd.concat([pd.DataFrame(X_test_num, index=X_test.index, columns=numerical_features),
                    pd.DataFrame(X_test_cat, index=X_test.index, columns=encoded_columns)], axis=1)

# Initialize and train the Logistic Regression model with increased max_iter.
# 'saga' shuffles the data, so the seed is pinned to make the fit reproducible.
logreg = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
logreg.fit(X_train, y_train)

# Persist the model that was just trained. The file name is kept unchanged so
# any existing loader keeps working.
# NOTE(review): only the estimator is saved; `scaler` and `encoder` are needed
# to prepare new data for it and are not persisted here.
joblib.dump(logreg, 'logistic_tree_model.pkl')

# Predict on test data
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred))

Logistic Regression - Accuracy: 0.6363636363636364


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Train and evaluate a Random Forest classifier.
# NOTE: X_train, X_test, y_train and y_test are not defined in this cell; it
# uses whatever split is currently in the kernel, so run the cell that builds
# the split before this one.

# Initialize and train the Random Forest model (fixed seed for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on test data
y_pred = rf.predict(X_test)

# Evaluate the model
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# zero_division=0 reports 0.0 for a class that is never predicted (same value as
# the default) without emitting an undefined-metric warning per metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Random Forest - Accuracy: 0.6363636363636364
Confusion Matrix:
 [[0 2 0]
 [0 4 1]
 [0 1 3]]
Classification Report:
               precision    recall  f1-score   support

    Moderate       0.00      0.00      0.00         2
    Not Safe       0.57      0.80      0.67         5
        Safe       0.75      0.75      0.75         4

    accuracy                           0.64        11
   macro avg       0.44      0.52      0.47        11
weighted avg       0.53      0.64      0.58        11



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# prompt: i want to run a decision tree model on the same

import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder


# Train, evaluate and persist a Decision Tree classifier that predicts the
# "Online Safety" column of `df` from every other column.

# Local import: OrdinalEncoder is the feature-matrix counterpart of
# LabelEncoder, which scikit-learn documents for target labels.
from sklearn.preprocessing import OrdinalEncoder

# Define features (X) and target (y)
X = df.drop(columns=["Online Safety"])
y = df["Online Safety"]

# Object-dtype columns are the ones that get a numeric encoding below
categorical_features = X.select_dtypes(include=['object']).columns

# Split data into training and testing sets BEFORE encoding, so the encoder
# only ever learns categories from the training rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies so the column assignments below cannot write back into X
X_train = X_train.copy()
X_test = X_test.copy()

# Encode categorical features with a single fitted encoder that remembers the
# mapping of every column. Categories that appear only in the test rows are
# mapped to -1 instead of raising. Skipped when there is nothing to encode,
# because fitting on zero columns is an error.
if len(categorical_features) > 0:
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
    X_test[categorical_features] = encoder.transform(X_test[categorical_features])

# Initialize and train the Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)  # You can adjust hyperparameters here
dt.fit(X_train, y_train)

# Make predictions on the test set
y_pred = dt.predict(X_test)

# Evaluate the model
print("Decision Tree - Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Persist the trained tree; joblib.dump returns the list of written file names,
# which the notebook displays as the cell output.
# NOTE(review): `encoder` is required to prepare new data for this model and is
# not persisted here.
joblib.dump(dt, 'decision_tree_model.pkl')

Decision Tree - Accuracy: 0.5454545454545454
Confusion Matrix:
 [[1 1 0]
 [0 3 2]
 [0 2 2]]
Classification Report:
               precision    recall  f1-score   support

    Moderate       1.00      0.50      0.67         2
    Not Safe       0.50      0.60      0.55         5
        Safe       0.50      0.50      0.50         4

    accuracy                           0.55        11
   macro avg       0.67      0.53      0.57        11
weighted avg       0.59      0.55      0.55        11



['decision_tree_model.pkl']