# Indirect social influence helps shaping the diffusion of innovations

## Analysis of human decisions

### Libraries import

In [7]:
import numpy as np
import os
import pandas as pd
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

### Prediction of human decision using Random Forest clasifier

#### Dependent variable: color chosen (binary). 1: innovation color chosen, 0: no innovation color chosen
#### Independent variables: "Innovation as initial color" (binary), "Innovation as majority color seen" (binary), "Percentage of first neighbors with the innovation color" (float in [0,1]), "Percentage of n-distance neighbors with the innovation color" (float in [0,1])

In [8]:
# random seed
seed=1123

# Load the file as a dataframe
data = pd.read_csv("Unified_data_edited_innov3_classification.csv")

#Filter the dataset for bot=0 and Setup different than "instructions"
data = data[(data['bot'] == 0) & (data['Setup'] != 'instructions')]

misLabels=["Setting I","Setting II","Setting III","Setting IV"]
misFeatureLabels=["Innovation as initial color", "Innovation as majority color seen", "Percentage of first neighbors with the innovation color", "Percentage of n-distance neighbors with the innovation color"]

# Keep just the columns: innovationAsInitialcolor, InnovationAsMajorityColorSeen, perc_color_neigbors, perc_color_friends, Adopted_color
data = data[['InnovationAsInitialcolor', 'InnovationAsMajorityColorSeen', 'perc_color_neigbors', 'perc_color_friends', 'Adopted_color']]


# Create a Random Forest Classifier to predict the "Adopted_color" column using the other columns as features
# Use two nested cross validation loops, one to tune the hyperparameters and the other to evaluate the model

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Create the Random Forest Classifier
rf = RandomForestClassifier()


# Perform grid search to find the best hyperparameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(data.drop('Adopted_color', axis=1), data['Adopted_color'])

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best hyperparameters to create the final model
final_model = RandomForestClassifier(**best_params)

# Perform nested cross validation to evaluate the model
nested_scores = cross_val_score(final_model, data.drop('Adopted_color', axis=1), data['Adopted_color'], cv=5)

# Print the average accuracy of the nested cross validation
print("Nested Cross Validation Accuracy:", np.mean(nested_scores))


# Fit the model
final_model.fit(data.drop('Adopted_color', axis=1), data['Adopted_color'])

#Save the model
joblib.dump(final_model, "final_model.pkl")


Best Hyperparameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Nested Cross Validation Accuracy: 0.8444024681470028


['final_model.pkl']