In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

### Data Preprocessing

In [None]:
# Read the validated survey export (decoded as latin-1) and display it.
data = pd.read_csv(
    'data_640_validated.csv',
    encoding='latin-1',
)
data

In [None]:
# Inaccurate or invalid data: map the two free-text age answers to plain digit
# strings so the column can be cast to an integer type afterwards.
age_fixes = {'30s': '30', 'sub 28': '28'}
data['A5 (Age)'] = data['A5 (Age)'].replace(age_fixes)

In [None]:
# Incorrect data type: store the age column as 64-bit integers.
data = data.astype({'A5 (Age)': 'int64'})

In [None]:
# Rename 'Unnamed: 0' to 'ID' and 'ï..O1' to 'Nationality'.
data = data.rename(columns={'Unnamed: 0': 'ID', 'ï..O1': 'Nationality'})

### Age Distribution EDA

In [None]:
# Age distribution as a pie chart.
# pd.cut builds right-closed intervals, so without include_lowest=True an age
# of exactly 11 would fall outside the first bin even though that bin is
# labelled '11-18'.
bins = [11, 18, 30, 40, 50, 60]
age_labels = ['11-18', '19-30', '31-40', '41-50', '51-60']
data['AgeCategory'] = pd.cut(
    data['A5 (Age)'],
    bins=bins,
    labels=age_labels,
    include_lowest=True,
)
# Ages outside the 11-60 range receive no category and are not counted.
age_distribution = data['AgeCategory'].value_counts()
plt.pie(age_distribution, labels=age_distribution.index, autopct='%1.1f%%', startangle=180)
plt.title('Age Distributions of Players')
plt.show()

### Sex and Environmental Perception

In [None]:
# Working copy holding the A* columns followed by the fifteen items C1..C15.
demographic_cols = ['A1_1', 'A1_2', 'A2 (Sex)', 'A3', 'A4', 'A5 (Age)', 'A6', 'A7', 'A8']
perception_cols = [f'C{i}' for i in range(1, 16)]
columns_to_add = demographic_cols + perception_cols
new_df = data.loc[:, columns_to_add].copy()
new_df

### Identify the socio-demographic variable most strongly related to Environmental Perception

In [None]:
# Row-wise median over C1..C15. Note: the column is named 'T_Avg' although the
# value stored in it is a median, not an average.
perception_items = [f'C{i}' for i in range(1, 16)]
new_df['T_Avg'] = new_df[perception_items].median(axis=1)
new_df

In [None]:
# Integer-encode the categorical answer columns in place. One encoder object is
# refit for every column, so after the loop it only holds the classes of 'A8'.
label_encoder = LabelEncoder()
for column in ('A1_2', 'A2 (Sex)', 'A3', 'A4', 'A6', 'A7', 'A8'):
    new_df[column] = label_encoder.fit_transform(new_df[column])


In [None]:
# Predictors: the encoded A* columns plus age. Target: the per-row 'T_Avg' value.
X = new_df.loc[:, ['A1_2', 'A2 (Sex)', 'A3', 'A4', 'A5 (Age)', 'A6', 'A7', 'A8']]
Y = new_df.loc[:, 'T_Avg']

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,chi2

# Score every predictor against Y with the chi-square statistic; k='all' means
# no column is dropped, the selector is only used to obtain the scores.
chi2_features = SelectKBest(score_func=chi2, k='all')
X_kbest_features = chi2_features.fit_transform(X, Y)

chi2_scores = chi2_features.scores_

# One line per feature, identified by its position in X.
for position, score in enumerate(chi2_scores):
    print('Feature %d: %f' % (position, score))

# Bar chart of the scores, with the column names of X as tick labels.
bar_positions = list(range(len(chi2_scores)))
plt.bar(bar_positions, chi2_scores, color='green')
plt.xticks(bar_positions, X.columns)
plt.xlabel('Socio-Demographic Variable')
plt.ylabel('Chi-Square Scores')
plt.show()

### Human-Centered Environmental Perception for Gender

In [None]:
# Human-centred items (C2, C4, ..., C14): take the mean of each item within
# every 'A2 (Sex)' group, then average those item means per group.
human_items = ['C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14']
human_answers = data[['A2 (Sex)'] + human_items]
group_sex = human_answers.groupby(by='A2 (Sex)').mean()
human_mean_by_sex = group_sex[human_items].mean(axis=1)
mean_human = pd.DataFrame({
    'Sex': human_mean_by_sex.index,
    'Mean': human_mean_by_sex.values,
})
mean_human


In [None]:
# Bar chart of the human-centred mean per sex, with the value printed on each bar.
fig = px.bar(
    mean_human,
    x='Sex',
    y='Mean',
    text_auto=True,
    title='Environmental Perception for Gender with Human Centeredness',
)
fig.update_layout(yaxis_title='Mean', bargap=0.8)
fig.show()

### Non-Human-Centered Environmental Perception for Gender

In [None]:
# Non-human-centred items are the odd-numbered C questions. 'C10' must not be
# listed here: it belongs to the human-centred set (C2, C4, ..., C14), and the
# per-respondent non-human-centred score computed further down also omits it.
non_human_items = ['C1', 'C3', 'C5', 'C7', 'C9', 'C11', 'C13', 'C15']
c = data[['A2 (Sex)'] + non_human_items]
# Mean of each item within every 'A2 (Sex)' group ...
group_sex = c.groupby(by='A2 (Sex)').mean()
# ... then the average of those item means per group.
a = group_sex[non_human_items].mean(axis=1)
mean_non_human = pd.DataFrame({'Sex': a.index, 'Mean': a.values})
mean_non_human

In [None]:
# Bar chart of the non-human-centred mean per sex, with the value printed on each bar.
fig = px.bar(
    mean_non_human,
    x='Sex',
    y='Mean',
    text_auto=True,
    title='Environmental Perception for Gender with Non Human Centeredness',
)
fig.update_layout(yaxis_title='Mean', bargap=0.8)
fig.show()

### Frequency of cutting down trees, by gender

In [None]:
# Independent two-column frame: the respondent's sex and the E17 answer.
columns_for_trees = ['A2 (Sex)', 'E17']
data_1 = data.loc[:, columns_for_trees].copy()
data_1

In [None]:
# Share of each E17 answer among rows whose 'A2 (Sex)' value is 'Male'.
is_male = data_1['A2 (Sex)'] == 'Male'
male_df = data_1[is_male]
grouped_df = (
    male_df
    .groupby('E17')
    .size()
    .reset_index(name='Count')
)

plt.figure(figsize=(8, 8))
plt.pie(
    grouped_df['Count'],
    labels=grouped_df['E17'],
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of E17 Values for Males')
plt.show()

In [None]:
# Share of each E17 answer among rows whose 'A2 (Sex)' value is 'Female'.
is_female = data_1['A2 (Sex)'] == 'Female'
female_df = data_1[is_female]
grouped_df = (
    female_df
    .groupby('E17')
    .size()
    .reset_index(name='Count')
)
plt.figure(figsize=(8, 8))
plt.pie(
    grouped_df['Count'],
    labels=grouped_df['E17'],
    autopct='%1.1f%%',
    startangle=140,
)
plt.title('Distribution of E17 Values for Female')
plt.show()

In [None]:
# Number of rows per distinct 'A2 (Sex)' value.
sex_column = data['A2 (Sex)']
gender_counts = sex_column.value_counts()
gender_counts

In [None]:
# Grouped count plot: E17 answers on the x-axis, one bar per 'A2 (Sex)' value.
sns.countplot(data=data, x='E17', hue='A2 (Sex)')
plt.show()

### Label Encoding for new_df dataframe

In [None]:
# Label encoding.
# 'A1_1' is encoded here for the first time; the other columns were already
# integer-encoded in an earlier cell and are simply passed through the encoder
# again. The single encoder is refit per column.
label_encoder = LabelEncoder()
for column in ('A1_1', 'A1_2', 'A2 (Sex)', 'A3', 'A4', 'A6', 'A7', 'A8'):
    new_df[column] = label_encoder.fit_transform(new_df[column])
new_df.head()

### Correlation Matrix

In [None]:
# Pairwise correlations between the columns of new_df, drawn as a heatmap with
# each coefficient printed to two decimals.
a = new_df.corr()
plt.figure(figsize=(15, 6))
sns.heatmap(data=a, annot=True, fmt='.2f')
plt.show()

#### Human-centered data 

In [None]:
# Human-centred dataframe: the A* columns plus the even-numbered C items.
# .copy() makes this an independent frame, so adding a column to it afterwards
# cannot trigger pandas' SettingWithCopyWarning for writing to a slice of new_df.
human_frame_columns = [
    'A1_1', 'A1_2', 'A2 (Sex)', 'A3', 'A4', 'A5 (Age)', 'A6', 'A7', 'A8',
    'C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14',
]
new_df_human = new_df[human_frame_columns].copy()
new_df_human

In [None]:
# Per-row score over the human-centred items. Note: the value is a row-wise
# median even though the column is called 'mean_human'.
column_human = ['C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14']
human_row_score = new_df_human.loc[:, column_human].median(axis=1)
new_df_human['mean_human'] = human_row_score
new_df_human

### Non-human-centered data

In [None]:
# Non-human-centred dataframe: the A* columns plus the odd-numbered C items.
# .copy() makes this an independent frame, so adding a column to it afterwards
# cannot trigger pandas' SettingWithCopyWarning for writing to a slice of new_df.
non_human_frame_columns = [
    'A1_1', 'A1_2', 'A2 (Sex)', 'A3', 'A4', 'A5 (Age)', 'A6', 'A7', 'A8',
    'C1', 'C3', 'C5', 'C7', 'C9', 'C11', 'C13', 'C15',
]
new_df_non_human = new_df[non_human_frame_columns].copy()
new_df_non_human

In [None]:
# Per-row score over the non-human-centred items. Note: the value is a row-wise
# median even though the column is called 'mean_non_human'.
column_non_human = ['C1', 'C3', 'C5', 'C7', 'C9', 'C11', 'C13', 'C15']
row_means = new_df_non_human.loc[:, column_non_human].median(axis=1)
new_df_non_human['mean_non_human'] = row_means
new_df_non_human

In [None]:
# Place the A* columns next to the two per-row scores; concat with axis=1
# lines the three pieces up on their row index.
selected_columns_df1 = new_df.loc[
    :, ['A1_1', 'A1_2', 'A2 (Sex)', 'A3', 'A4', 'A5 (Age)', 'A6', 'A7', 'A8']
]
selected_columns_df2 = new_df_human.loc[:, ['mean_human']]
selected_columns_df3 = new_df_non_human.loc[:, ['mean_non_human']]
final_df = pd.concat(
    objs=[selected_columns_df1, selected_columns_df2, selected_columns_df3],
    axis=1,
)
final_df

In [None]:
# Overall score: row-wise mean of the 'mean_human' and 'mean_non_human' columns.
score_columns = ['mean_human', 'mean_non_human']
final_df['t_mean'] = final_df.loc[:, score_columns].mean(axis=1)
final_df

In [None]:
# Binary target derived from t_mean: '+1' when the score is above 3, '-1' when
# it is 3 or below.
# This is computed column-wise rather than row by row. A row-by-row loop with
# separate "<= 3" and "> 3" checks appends nothing for a missing t_mean, which
# leaves the label list shorter than the frame and makes the assignment fail;
# here such rows simply stay missing in 'encode'.
above_threshold = final_df['t_mean'] > 3
final_df['encode'] = (
    above_threshold
    .map({True: '+1', False: '-1'})
    .where(final_df['t_mean'].notna())
)

final_df['encode'].value_counts()



### Model Selection

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.datasets import make_classification

# Predictors: the encoded A* columns plus age, taken from final_df.
X=final_df[['A1_2','A2 (Sex)','A3','A4','A5 (Age)','A6','A7','A8']]
# Target as a 1-D Series. Passing a one-column DataFrame makes scikit-learn
# emit a "column-vector y was passed" warning on every fit.
y= final_df['encode']

# 70 % training data; the remaining 30 % is split evenly into a test set and
# an evaluation set (the evaluation set is not used in this cell).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Candidate Naive Bayes variants
models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Bernoulli Naive Bayes': BernoulliNB()
}

# Evaluate each candidate once with 10-fold cross-validation and remember its
# mean accuracy, so the model that gets selected is the one whose reported
# score is highest (no second cross-validation run with a different fold count).
mean_cv_accuracy = {}
for model_name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=10, scoring='accuracy')
    mean_cv_accuracy[model_name] = np.mean(cv_scores)
    print(f"{model_name} - Cross-Validation Scores: {cv_scores}, Mean Accuracy: {np.mean(cv_scores)}")

# Select the model with the highest mean cross-validation accuracy
best_model_name = max(mean_cv_accuracy, key=mean_cv_accuracy.get)
best_model = models[best_model_name]

# Train the selected model on the entire training set
best_model.fit(X_train, y_train)

# Evaluate the selected model on the test set
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy of the Best Model ({best_model_name}): {test_accuracy}")

### Model Evaluation

In [None]:
from sklearn.metrics import classification_report

# Predict the test rows with the selected model and build the text report
# comparing the predictions with the true labels.
y_pred = best_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Heading and report, one per line.
print("Classification Report:", report, sep="\n")