In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [4]:
# Load the raw lead data.
df = pd.read_csv("dataset.csv")

# Features and target: "converted" is the label, every other column is a
# candidate feature.
X = df.drop(columns="converted")
y = df["converted"]

# Column groups consumed by the preprocessing ColumnTransformer.
categories = ["funding_stage", "industry", "location"]
numerical = ["employees"]
binary = ["linkedin_present", "professional_email"]

# Free-mail suffixes whose addresses are NOT counted as professional.
# Anchored at "@" so that e.g. "someone@notgmail.com" is not a false match.
FREE_EMAIL_SUFFIXES = ("@gmail.com", "@yahoo.com")


def _has_linkedin(url):
    """Return 1 when `url` is a non-blank string, otherwise 0.

    Missing values (NaN/None) and strings made only of whitespace count as
    "no LinkedIn profile".
    """
    return int(isinstance(url, str) and bool(url.strip()))


def _is_professional_email(email):
    """Return 1 for a non-blank address outside the free-mail providers, else 0.

    A missing email is reported as 0. Converting it with ``str()`` would turn
    NaN into the text "nan", which does not end with a free-mail domain and
    would therefore be mislabelled as professional. The comparison is
    case-insensitive and ignores surrounding whitespace.
    """
    if not isinstance(email, str):
        return 0
    address = email.strip().lower()
    if not address:
        return 0
    return int(not address.endswith(FREE_EMAIL_SUFFIXES))


# Derived binary features.
# NOTE(review): the saved pipeline expects these two columns as input, so any
# code that prepares rows for prediction must compute them the same way —
# confirm the inference side matches these definitions.
X["linkedin_present"] = X["linkedin_url"].apply(_has_linkedin)
X["professional_email"] = X["email"].apply(_is_professional_email)

# Drop the raw columns that are not fed to the model.
X = X.drop(columns=["company", "email", "linkedin_url"])

# Preprocessing pipeline: each column group gets its own treatment.

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" keeps predict from failing on categories
# that were not present when the encoder was fitted.
categorical_steps = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_steps, categories),
        ("num", numeric_steps, numerical),
        ("bin", "passthrough", binary),  # forwarded to the classifier unchanged
    ]
)

# Full model: preprocessing followed by a class-weighted random forest.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
)
model = Pipeline(
    steps=[
        ("Preprocessor", preprocess),
        ("Classifier", forest),
    ]
)

# Train/test split: hold out 20% of the rows, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Train the model on the training portion only.
model.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model trained with accuracy: {accuracy:.2f}")
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Persist the fitted pipeline (preprocessing + classifier) to disk.
joblib.dump(model, "model.pkl")
print("model Saved Sucessfully")


Model trained with accuracy: 1.00

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         3

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4

model Saved Sucessfully
