In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Load the dataset (path is relative to the notebook's working directory)
data = pd.read_csv('mountains_vs_beaches_preferences.csv')

# Encode categorical variables as integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the codes can later be mapped back to the original labels with
# label_encoders[col].inverse_transform(...). Re-fitting one shared encoder
# would keep only the mapping of the last column processed.
categorical_columns = ['Gender', 'Education_Level', 'Preferred_Activities', 'Location', 'Favorite_Season']
label_encoders = {}
for col in categorical_columns:
    # `label_encoder` is rebound on every iteration; after the loop it refers
    # to the encoder of the last column, as it did before this change.
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

# Define features and target: every column except 'Preference' is a model
# input, and 'Preference' is the label to predict.
features = data.drop(columns='Preference')
target = data['Preference']

# Hold out 20% of the rows for testing and train on the remaining 80%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest with a fixed seed and fit it on the
# training portion (fit returns the fitted estimator itself).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out rows
predictions = model.predict(X_test)

# Report test-set quality: the per-class report first, then each scalar
# metric computed from the same (y_test, predictions) pair with the
# metric functions' default arguments.
print("Classification Report:")
print(classification_report(y_test, predictions))

summary_metrics = (
    ("Accuracy Score:", accuracy_score),
    ("Precision Score:", precision_score),
    ("Recall Score:", recall_score),
    ("F1 Score:", f1_score),
)
for metric_label, metric_fn in summary_metrics:
    print(metric_label, metric_fn(y_test, predictions))

# Attach the true and predicted labels to a copy of the test features so that
# individual rows can be inspected; X_test itself is left unmodified.
results_df = X_test.copy()
results_df['Actual'] = y_test
# NOTE(review): assumes `predictions` is in the same row order as X_test
# (e.g. the output of model.predict(X_test)) - confirm if this cell is reused.
results_df['Predicted'] = predictions
results_df['Correct'] = results_df['Actual'] == results_df['Predicted']

# Show up to five misclassified rows: first only the label columns, then the
# full rows. `~` negates the boolean 'Correct' mask; `.head()` already limits
# the frame to five rows, so it is not applied a second time.
mismatch_examples = results_df[~results_df['Correct']].head()
print(mismatch_examples[['Actual', 'Predicted']])
print(mismatch_examples)

# Rank the input columns by the fitted model's importance scores, largest first.
feature_importance = model.feature_importances_
importance_df = pd.DataFrame(
    {
        'Feature': features.columns,
        'Importance': feature_importance,
    }
)
ranked_importance = importance_df.sort_values(by='Importance', ascending=False)
print(ranked_importance)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7851
           1       1.00      0.99      0.99      2638

    accuracy                           1.00     10489
   macro avg       1.00      0.99      0.99     10489
weighted avg       1.00      1.00      1.00     10489

Accuracy Score: 0.9958051291829536
Precision Score: 0.9965543644716692
Recall Score: 0.986732373009856
F1 Score: 0.9916190476190476
       Actual  Predicted
29695       1          0
24626       1          0
25730       1          0
31863       1          0
48335       1          0
       Age  Gender  Income  Education_Level  Travel_Frequency  \
29695   51       0   74354                2                 1   
24626   53       2   40621                0                 9   
25730   48       1   59610                0                 8   
31863   48       2   65749                1                 0   
48335   38       1  108095                3

Unnamed: 0,Preference
0,1
1,0
2,1
3,1
4,0
...,...
52439,1
52440,0
52441,0
52442,0
