In [1]:
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import os

# Define paths
input_file_path = "/Users/Joe/Downloads/DATASCIENCERPOJECT/processed_data.csv"
output_dir = "/Users/Joe/Downloads/DATASCIENCERPOJECT/resultats"

# Load the dataset
data = pd.read_csv(input_file_path)

# Clean the 'records' column by removing non-numeric characters and converting to float
data['records'] = data['records'].replace(r'[^0-9.]', '', regex=True).astype(float)

# Encode categorical columns to numeric values
label_encoder = LabelEncoder()
for col in ['company', 'organization_type', 'method']:
    data[col] = label_encoder.fit_transform(data[col])

# Define features and target
X = data.drop("method", axis=1)
y = data["method"]

# Apply SMOTE with k_neighbors=1 to avoid sampling errors in small classes
smote = SMOTE(random_state=42, k_neighbors=1)
X_res, y_res = smote.fit_resample(X, y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

# Save results to the specified directory
if not os.path.exists(output_dir):
    os.makedirs(output_dir)  # Create the directory if it doesn't exist

# Save accuracy, classification report, and confusion matrix to files
accuracy_file = os.path.join(output_dir, "accuracy.txt")
classification_file = os.path.join(output_dir, "classification_report.txt")
confusion_file = os.path.join(output_dir, "confusion_matrix.csv")

# Save accuracy
with open(accuracy_file, "w") as f:
    f.write(f"Accuracy: {accuracy}\n")

# Save classification report
with open(classification_file, "w") as f:
    f.write("Classification Report:\n")
    f.write(classification_rep)

# Save confusion matrix
confusion_df = pd.DataFrame(confusion_mat)
confusion_df.to_csv(confusion_file, index=False)

print("The results have been saved in the directory:", output_dir)


Accuracy: 0.8798037612428454
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       151
           1       0.68      0.49      0.57       148
           2       0.94      0.95      0.95       133
           3       0.77      0.74      0.76       141
           4       0.77      0.92      0.84       131
           5       0.86      0.94      0.90       123
           6       0.97      0.97      0.97       127
           7       0.96      0.98      0.97       130
           8       1.00      0.99      1.00       139

    accuracy                           0.88      1223
   macro avg       0.88      0.88      0.88      1223
weighted avg       0.87      0.88      0.87      1223

Confusion Matrix:
 [[147   3   0   0   1   0   0   0   0]
 [  6  73   4  26  23  12   3   1   0]
 [  0   2 127   1   2   1   0   0   0]
 [  2  18   1 105   5   6   1   3   0]
 [  0   9   0   1 120   0   0   1   0]
 [  0   2   0   3   2 116   