In [2]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle
from sklearn.ensemble import GradientBoostingClassifier

# Load the survey responses.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists.
df = pd.read_csv("C:\\Users\\Dell\\OneDrive\\Desktop\\survey.csv")

# Drop columns that are not used as features.
df = df.drop(columns=['comments', 'state', 'Country', 'Timestamp'])

# Collapse the free-text gender answers into 'Male' / 'Female' / 'Other'.
# The result is assigned back explicitly: an in-place ``replace`` on
# ``df['Gender']`` operates on an intermediate Series and is not guaranteed
# to update ``df`` (it is a no-op under pandas Copy-on-Write).
gender_groups = {
    'Male': [
        'male', 'M', 'm', 'Make', 'Man', 'Mair', 'Guy (-ish) ^_^',
        'male leaning androgynous', 'Cis Man', 'msle', 'cis male', 'Mail',
        'Androgyne', 'Male (CIS)', 'Male-ish', 'maile', 'something kinda male?',
        'Mal', 'ostensibly male, unsure what that really means', 'Malr',
        'Cis Male', 'Male ',
    ],
    'Female': [
        'female', 'F', 'f', 'Female ', 'Female (cis)', 'Femail',
        'Femake', 'woman', 'Cis Female', 'femail', 'Woman',
        'cis-female/femme',
    ],
    'Other': [
        'Trans-female', 'queer/she/they', 'non-binary', 'Nah', 'All', 'Enby',
        'fluid', 'Genderqueer', 'Agender', 'Trans woman', 'Neuter',
        'Female (trans)', 'queer',
    ],
}
gender_map = {
    raw_value: label
    for label, raw_values in gender_groups.items()
    for raw_value in raw_values
}
df['Gender'] = df['Gender'].replace(gender_map)

# Remove rows whose gender field is not a usable answer. ``.copy()`` yields an
# independent frame so the column assignments below do not act on a slice.
invalid_gender_entries = ['A little about you', 'p']
df = df[~df['Gender'].isin(invalid_gender_entries)].copy()

# Replace every age outside the 18-75 range (missing ages included, since
# ``between`` is False for NaN) with the mean of the in-range ages. Using the
# same range for the mask and the mean keeps the two consistent instead of
# relying on a hand-maintained list of bad values.
valid_age = df['Age'].between(18, 75)
mean_age = df.loc[valid_age, 'Age'].mean()
df['Age'] = df['Age'].where(valid_age, mean_age)

# Fill missing answers with a default category per column.
df = df.fillna({'self_employed': 'No', 'work_interfere': 'Not Sure'})

# Columns holding categorical survey answers (ordinal-encoded downstream).
categorical_cols = [
    'Gender',
    'self_employed',
    'family_history',
    'work_interfere',
    'no_employees',
    'remote_work',
    'tech_company',
    'benefits',
    'care_options',
    'wellness_program',
    'seek_help',
    'anonymity',
    'leave',
    'mental_health_consequence',
    'phys_health_consequence',
    'coworkers',
    'supervisor',
    'mental_health_interview',
    'phys_health_interview',
    'mental_vs_physical',
    'obs_consequence',
]

# Features are all categorical columns followed by the numeric 'Age' column;
# the target is the 'treatment' column.
X = df[[*categorical_cols, 'Age']]
y = df['treatment']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing: ordinal-encode the categorical columns and min-max scale 'Age'.
# Categories not seen during fit are encoded as -1 instead of raising, so the
# fitted (and pickled) pipeline can still score rows with unseen answers.
transformer = ColumnTransformer([
    (
        'ordinal_encoder',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        categorical_cols,
    ),
    ('minmax_scaler', MinMaxScaler(), ['Age']),  # Scale the 'Age' column
])

# Label encode the target variable (fit on the training labels only).
# NOTE: the encoder is not stored in the pickle below, so mapping predictions
# back to the original labels requires keeping ``label_encoder`` separately.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Initialize the GradientBoostingClassifier
gradient_boost = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1)

# Create a pipeline that combines the transformer and the classifier
pipeline = Pipeline([
    ('transformer', transformer),
    ('classifier', gradient_boost)
])

print(X_train.columns)
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train_encoded)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy on the held-out test set
accuracy = accuracy_score(y_test_encoded, y_pred)
print("gradient_boost Accuracy:", accuracy)

# Save the fitted pipeline (preprocessing + classifier) to a pickle file
with open('gradient_boost_model.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)


Index(['Gender', 'self_employed', 'family_history', 'work_interfere',
       'no_employees', 'remote_work', 'tech_company', 'benefits',
       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'Age'],
      dtype='object')
gradient_boost Accuracy: 0.7976190476190477
