In [1]:
from azureml.core import Workspace
# Connect to the Azure ML workspace. from_config() is called without arguments, so the
# connection details come from the SDK's config-file lookup rather than from this notebook.
ws = Workspace.from_config()



In [2]:
# Storage descriptor of the training data: a folder inside the workspace blob datastore.
# No SAS token is embedded; access goes through the datastore named in "AmlDataStoreName".
train_data = {
    "Container": "azureml-blobstore-a762bf81-9d4c-409e-8026-266ab98acd39",
    "SasToken": None,
    "Uri": "wasbs://azureml-blobstore-a762bf81-9d4c-409e-8026-266ab98acd39@sawweictddsplayaml.blob.core.windows.net/azureml/7c6f5359-70b8-4905-aca9-b81086805c51/train_data",
    "Account": "sawweictddsplayaml",
    "RelativePath": "azureml/7c6f5359-70b8-4905-aca9-b81086805c51/train_data",
    "PathType": 0,
    "AmlDataStoreName": "workspaceblobstore",
}

In [3]:
# Fetch the datastore registered in the workspace under the name recorded in train_data.
datastore = ws.datastores[train_data["AmlDataStoreName"]]


In [7]:
from azureml.core import Dataset

# One (datastore, glob) pair that matches every CSV file in the training-data folder.
datastore_paths = [
    (datastore, train_data["RelativePath"] + "/*.csv"),
]
# Display the resolved pair to confirm which datastore and path will be read.
datastore_paths


[({
    "name": "workspaceblobstore",
    "container_name": "azureml-blobstore-a762bf81-9d4c-409e-8026-266ab98acd39",
    "account_name": "sawweictddsplayaml",
    "protocol": "https",
    "endpoint": "core.windows.net"
  },
  'azureml/7c6f5359-70b8-4905-aca9-b81086805c51/train_data/*.csv')]

In [22]:
train_data = Dataset.Tabular.from_delimited_files(path=datastore_paths)

In [23]:
df = train_data.to_pandas_dataframe()


In [97]:
from sklearn.model_selection import train_test_split

def stratified_split(df, label="target", test_size=0.2, random_state=99):
    """Split ``df`` into stratified train/test features and binary labels.

    The label column must contain the strings "yes" and "no"; they are
    encoded as 1 and 0 respectively before splitting.

    Args:
        df: DataFrame holding the feature columns and the label column.
        label: Name of the label column.
        test_size: Test-set size, passed straight to ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` for a repeatable split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If the label column holds anything other than "yes"/"no"
            (including missing values).
    """
    label_codes = {"yes": 1, "no": 0}
    raw_labels = df[label]
    y = raw_labels.map(label_codes)

    # Series.map turns every value absent from the mapping into NaN. Fail here with
    # the offending values instead of letting NaN labels break the split or the fit.
    unmapped = y.isna()
    if unmapped.any():
        bad_values = sorted({str(value) for value in raw_labels[unmapped].unique()})
        raise ValueError(
            "Label column %r contains values other than 'yes'/'no': %s"
            % (label, bad_values)
        )

    X = df.drop(columns=[label])
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_test, y_train, y_test



In [79]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression


# Show the available column names; the feature lists used for preprocessing are chosen from these.
df.columns


Index(['city', 'city_development_index', 'gender', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'company_size', 'company_type', 'last_new_job',
       'training_hours', 'target'],
      dtype='object')

In [69]:
from sklearn.base import BaseEstimator, TransformerMixin

class StringCaster(TransformerMixin, BaseEstimator):
    """Stateless transformer that casts every value of its input to ``str``.

    Inheriting from ``BaseEstimator`` (listed last, after the mixin) gives the
    class ``get_params``/``set_params`` and a readable repr, so it behaves like
    any other scikit-learn estimator and can be cloned directly with
    ``sklearn.base.clone``.
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer API.

        Args:
            X: Input data (ignored).
            y: Ignored.

        Returns:
            This instance, unchanged.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` with every value cast to ``str``.

        Args:
            X: Object exposing ``astype`` (e.g. a DataFrame or ndarray).
            y: Ignored.

        Returns:
            The result of ``X.astype(str)``.
        """
        return X.astype(str)
    



In [86]:
# Continuous columns: fill gaps with the column median, then standardise.
numeric_features = [
    'city_development_index',
    'training_hours',
]

numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# Every remaining predictor is treated as categorical.
categorical_features = [
    'city',
    'gender',
    'relevent_experience',
    'enrolled_university',
    'education_level',
    'major_discipline',
    'experience',
    'company_size',
    'company_type',
    'last_new_job',
]

# Cast values to str first, then one-hot encode them.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_transformer = Pipeline(
    steps=[
        ('caster', StringCaster()),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)


In [87]:
# Sanity check: run the numeric pipeline on its own over the full frame and show the result.
numeric_transformer.fit_transform(df[numeric_features])

array([[ 0.65884494, -1.02360785],
       [ 0.21359685, -0.34022018],
       [-1.65644512,  0.04314363],
       ...,
       [-0.42594131,  4.41015747],
       [ 0.47265029, -0.44022813],
       [ 0.73979914, -0.87359592]])

In [88]:
# Sanity check: run the categorical pipeline on its own over the full frame and show the result.
categorical_transformer.fit_transform( df[categorical_features])

<15326x191 sparse matrix of type '<class 'numpy.float64'>'
	with 153260 stored elements in Compressed Sparse Row format>

In [107]:
from sklearn.metrics import accuracy_score, roc_auc_score, matthews_corrcoef


# Hold out a stratified test set before any fitting happens.
X_train, X_test, y_train, y_test = stratified_split(df)

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Preprocessing and classifier share one pipeline, so both are fitted on the training split only.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(C=0.5, max_iter=1000)),
    ]
)
clf.fit(X_train, y_train)

# Hard labels for the label-based metrics; decision scores for ROC AUC.
y_pred = clf.predict(X_test)
y_pred_score = clf.decision_function(X_test)

print("roc_auc: %.3f" % roc_auc_score(y_test, y_pred_score))

roc_auc: 0.795


In [108]:


# Share of test rows whose predicted label equals the true label.
print("accuracy: %.3f" % accuracy_score(y_test, y_pred))

accuracy: 0.777


In [111]:
# Matthews correlation coefficient computed from the hard test-set predictions.
print("mcc: %.3f" % matthews_corrcoef(y_test, y_pred))

mcc: 0.322


In [113]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the test set, with support counts.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      2301
           1       0.59      0.34      0.43       765

    accuracy                           0.78      3066
   macro avg       0.70      0.63      0.65      3066
weighted avg       0.75      0.78      0.75      3066



In [114]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the test set: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

[[2125  176]
 [ 508  257]]
