# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Read the project records; df keeps every column, including the At_Risk target
# that is re-attached after preprocessing.
df = pd.read_csv('fake_data.csv')
# Feature frame: drop Project_ID / Project_Name (presumably identifiers -- confirm
# against the CSV) and the At_Risk label so they are never fed to the model.
trainingdata = df.drop(['Project_ID', 'Project_Name', 'At_Risk'], axis=1)

def additional_values(df):
    """Return a copy of *df* with the derived deviation and workload columns.

    Five columns are appended, each computed from columns that must already
    be present in *df*:

    * ``Budget_Deviation``: ``%_Budget_Used - %_Work_Completed``
    * ``Timeline_Deviation``: ``%_Timeline_Completed - %_Work_Completed``
    * ``Workload_Per_Team_Member``:
      ``Estimated_Hours / (Team_Size * Timeline_Days)``
    * ``Workload_Feature``: ``Estimated_Hours / Timeline_Days``
    * ``Team_Size_Adjusted_Workload``: ``Workload_Feature / Team_Size``

    The caller's frame is not modified. A zero ``Team_Size`` or
    ``Timeline_Days`` produces NaN in the affected ratio columns instead of
    +/-inf, so the value can be handled like any other missing value.

    Args:
        df: DataFrame holding the raw project columns named above.

    Returns:
        A new DataFrame with the original columns plus the five derived ones.

    Raises:
        KeyError: if one of the required source columns is missing.
    """
    # Work on a copy so the argument is never mutated behind the caller's back.
    out = df.copy()

    out['Budget_Deviation'] = out['%_Budget_Used'] - out['%_Work_Completed']
    out['Timeline_Deviation'] = out['%_Timeline_Completed'] - out['%_Work_Completed']

    # NOTE(review): Workload_Per_Team_Member and Team_Size_Adjusted_Workload are
    # algebraically the same quantity (hours / (team * days)); both are kept
    # because callers select these columns by name.
    out['Workload_Per_Team_Member'] = out['Estimated_Hours'] / (out['Team_Size'] * out['Timeline_Days'])
    out['Workload_Feature'] = out['Estimated_Hours'] / out['Timeline_Days']
    out['Team_Size_Adjusted_Workload'] = out['Workload_Feature'] / out['Team_Size']

    # Division by zero yields +/-inf in pandas; turn those into NaN.
    ratio_cols = ['Workload_Per_Team_Member', 'Workload_Feature', 'Team_Size_Adjusted_Workload']
    out[ratio_cols] = out[ratio_cols].replace([float('inf'), float('-inf')], float('nan'))
    return out

def preprocessing(df, preprocess=None):
    """Mean-impute and robust-scale the numeric model features of *df*.

    Args:
        df: DataFrame containing every column listed in ``num_cols`` below
            (the raw numeric columns plus the derived columns that
            ``additional_values`` adds).
        preprocess: Optional ``ColumnTransformer`` returned by an earlier
            call. When supplied it is applied with ``transform()`` so *df* is
            imputed and scaled with the statistics learned earlier. When
            omitted (the default) a new transformer is built and fitted on
            *df* itself.

    Returns:
        A ``(transformed_df, preprocess)`` tuple: the transformed features as
        a DataFrame whose columns are ``num_cols`` in that order, and the
        fitted ``ColumnTransformer`` that produced them.
    """
    num_cols = ['Budget', 'Estimated_Hours', 'Timeline_Days', 'Team_Size', '%_Budget_Used', '%_Work_Completed', '%_Timeline_Completed', 'Budget_Deviation', 'Timeline_Deviation','Workload_Per_Team_Member', 'Workload_Feature', 'Team_Size_Adjusted_Workload']

    if preprocess is None:
        # Step names are runtime identifiers (reachable through named_steps),
        # so the existing "impuder" spelling is kept as-is.
        num_pipeline = Pipeline([("impuder", SimpleImputer(strategy='mean')), ("scale", RobustScaler())])

        # Combine with ColumnTransformer; columns outside num_cols are dropped
        # by its default remainder setting.
        preprocess = ColumnTransformer(transformers=[
            ('num_vals', num_pipeline, num_cols)])
        # Learn the imputation/scaling statistics from df and apply them.
        x_transformed = preprocess.fit_transform(df)
    else:
        # Reuse previously learned statistics; do not refit on df.
        x_transformed = preprocess.transform(df)

    transformed_df = pd.DataFrame(x_transformed, columns=num_cols)

    return transformed_df, preprocess


# Build the modelling table: engineered features, then imputed/scaled features.
new_df = additional_values(trainingdata)
transformed_df, preprocess = preprocessing(new_df)
# .values bypasses index alignment; the rows are still in df's original order.
transformed_df['At_Risk'] = df['At_Risk'].values
X = transformed_df.drop(columns=['At_Risk'])
y = transformed_df['At_Risk']

# NOTE(review): preprocess was fitted on every row before this split, so the
# held-out rows contributed to the imputation/scaling statistics. Consider
# fitting the transformer on the training rows only if this becomes a concern.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)
# Seed the forest as well as the split so the report and the cross-validation
# scores below are reproducible from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Hold-out evaluation on the 20% test split.
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# 5-fold cross-validated accuracy over the full feature table.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Example single project data (replace with actual values)
new_project = {
    "Budget": 100000,
    "Estimated_Hours": 500,
    "Timeline_Days": 100,
    "Team_Size": 50,
    "%_Budget_Used": 0.5,
    "%_Work_Completed": 0.5,
    "%_Timeline_Completed": 0.5
}

new_project_df = pd.DataFrame([new_project])
new_project_w_adds = additional_values(new_project_df)

# Scale the new project with the transformer that was fitted on the training
# data. Refitting on this single row would centre every feature on its own
# value and hand the model an all-zero row whatever the inputs are.
new_project_transformed = pd.DataFrame(
    preprocess.transform(new_project_w_adds),
    columns=X.columns,
)
# Kept so existing references to this name keep working; it is now the same
# fitted transformer rather than a throwaway refit.
preprocess_2 = preprocess

risk_prediction = model.predict(new_project_transformed)
risk_probability = model.predict_proba(new_project_transformed)

# NOTE(review): assumes At_Risk is encoded 0/1 so that label 1 and probability
# column 1 both mean "at risk" -- confirm against the CSV.
print(f"Risk Prediction: {'At Risk' if risk_prediction[0] == 1 else 'Not At Risk'}")
print(f"Probability of Being At Risk: {risk_probability[0][1]:.2f}")
