Copyright (c) Microsoft Corporation.

Licensed under the MIT License.

In [None]:
# Storage settings: both values are interpolated into the abfss:// URLs used
# to read the prepared training data and to write the AutoML copy.
data_lake_account_name = ''
file_system_name = ''

# Azure ML workspace coordinates passed to Workspace(...) in the next cell.
subscription_id = "" 
resource_group = "" 
workspace_name = "" 
# NOTE(review): workspace_region is not referenced by any cell visible here.
workspace_region = ""

In [None]:
import azureml.core
from azureml.core import Workspace, Experiment
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import logging
import os

from azureml.core.model import Model
from azureml.core import Environment
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice


from azureml.core import Workspace
# Handle to the existing Azure ML workspace described by the config cell;
# used later for model registration and deployment.
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
# Prepared training CSV in the data lake. Read it once; both the
# product/issue mapping and the training frame derive from this DataFrame.
train_path = f"abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/prepareddata/train"
df_prepared = spark.read.format("csv").load(train_path, header=True, escape='"', multiLine=True)

# Distinct (product, issue) pairs ordered by product.
# NOTE(review): presumably the source of the mapping hard-coded in the scoring
# script further down (via df_mapping.to_dict('records')) - confirm.
df_mapping = df_prepared[['product','issue']].toPandas().drop_duplicates().sort_values(by='product')

# Keep only the label and the free text, and persist that projection
# (header included, overwriting any previous export) under trainautoml/.
df_train_spark = df_prepared.select('issue','complaint')
df_train_spark.write.option('header', 'true').mode('overwrite').csv(f'abfss://{file_system_name}@{data_lake_account_name}.dfs.core.windows.net/prepareddata/trainautoml/')

# Pull the training data into pandas for scikit-learn and show the class balance.
df_train = df_train_spark.toPandas()
df_train['issue'].value_counts()

In [None]:
# Class balance: number of training rows per issue label.
df_train['issue'].value_counts()

In [None]:
import pandas as pd


# Integer code per distinct issue label, numbered in order of first appearance.
df_train.loc[:,'issue_id'] = pd.factorize(df_train['issue'])[0]

n_issues = len(df_train.issue.unique())
print("n_issues:", n_issues)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Features must come from the complaint text. Vectorizing the `issue` column
# (the prediction target) leaks the label into the features and makes the
# held-out evaluation below meaningless; the final pipeline is also fitted on
# `complaint`, so this keeps the two consistent.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(df_train.complaint)

# Re-weight raw term counts by inverse document frequency (no idf smoothing).
transformer = TfidfTransformer(smooth_idf=False)

tfidf = transformer.fit_transform(counts)

# (n_documents, vocabulary_size) of the sparse feature matrix.
tfidf.shape

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# Fixed seed so the split (and every metric computed from it) is reproducible
# across kernel restarts.
SPLIT_SEED = 42

n_rows = df_train.shape[0]

# Hold out 10% of the rows, but never fewer than two rows per issue class,
# so the stratified test set has room for every class.
test_size = max(int(np.floor(n_rows * .1)), 2 * n_issues)
train_size = n_rows - test_size

X_train, X_test, y_train, y_test = train_test_split(tfidf,
                                                    df_train.issue,
                                                    train_size=train_size,
                                                    test_size=test_size,
                                                    stratify=df_train.issue,
                                                    random_state=SPLIT_SEED)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale each feature to unit variance. Centering is disabled (with_mean=False).
# The statistics are learned from the training split only and then applied
# to both splits.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

# C is scikit-learn's inverse regularization strength: smaller values mean a
# stronger L1 penalty.
# NOTE(review): the original note claimed a larger C decreases both runtime
# and performance - not verified here.
C = 0.01
print("C: %s" % C)

clf = LogisticRegression(
    C=C,
    multi_class='multinomial',
    penalty='l1',
    solver='saga',
    tol=0.2,
)
clf.fit(X_train, y_train)

# Held-out accuracy, and the percentage of coefficients driven exactly to zero.
score = clf.score(X_test, y_test)
sparsity = np.mean(clf.coef_ == 0) * 100

print("Sparsity with L1 penalty: %.2f%%" % sparsity)
print("Test score with L1 penalty: %.4f" % score)
# Baseline: uniform guessing over the classes present in the training split.
print("Chance performance is: %.4f" % (1.0 / len(y_train.unique())))
print()

In [None]:
from sklearn.metrics import precision_score

# Predictions for the held-out split.
y_test_pred = clf.predict(X_test)

# Macro averages per-class precision equally; micro pools all decisions.
macro_prec = precision_score(y_test, y_test_pred, average='macro')
micro_prec = precision_score(y_test, y_test_pred, average='micro')

# '%.3f' prints three decimals. The previous '%03f' was a zero-padded *width*
# spec and therefore still printed the default six decimals.
print('Avg Precision Micro: %.3f, Macro: %.3f' % (micro_prec, macro_prec))

In [None]:
from sklearn.pipeline import Pipeline

# Chain the estimator objects built in the earlier cells into one pipeline that
# goes from raw text to a predicted issue. The pipeline holds the same objects,
# so fitting it refits them in place.
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('transformer', transformer),
    ('scaler', scaler),
    ('lr', clf),
])
# Pin the hyper-parameters explicitly on the pipeline steps.
pipeline.set_params(
    transformer__smooth_idf=False,
    scaler__with_mean=False,
    lr__C=.01,
)

In [None]:
# Fit every pipeline step end-to-end on the raw complaint text, with the issue
# label as the target. This uses all rows of df_train (no hold-out).
pipeline.fit(df_train.complaint, df_train.issue)

In [None]:
# Score on the same rows the pipeline was just fitted on: this is a training
# score, not a held-out estimate of generalisation.
pipeline.score(df_train.complaint, df_train.issue)

In [None]:
import joblib

# Serialize the fitted pipeline to a local file; the registration cell uploads
# the file at this same path.
model_filename = 'ccmmodel'

joblib.dump(pipeline, model_filename)

In [None]:
from azureml.core.model import Model
# Local file written by joblib.dump, and the name the model is registered under.
# The scoring script looks the model up by this registered name.
model_path = 'ccmmodel'
model_name = "ccmmodel"

registered_model = Model.register(
    workspace = ws,
    model_path = model_path,
    model_name = model_name,
    tags = {'type': "classification"},
    description = "Complaints Classifier",
)


In [None]:
import pandas as pd
# Spot-check the fitted pipeline on a single training row: list the five most
# probable issues and report whether the true label is among them.
i = 100
test_df = pd.DataFrame({
    'complaint': [df_train.complaint.values[i]],
    'issue': [df_train.issue.values[i]],
})

# One row in, so flattening yields one probability per class.
probs = pipeline.predict_proba(test_df.complaint).flatten()
# Indices of the five largest probabilities, most probable first.
top_categories = np.argsort(-probs)[:5]
top_tpis = pipeline.classes_[top_categories].tolist()

print('complaint:', test_df.complaint.values[0])
print('ground truth:', test_df.issue.values)
print('top 5 tpis:', top_tpis)
# Compare the scalar label instead of relying on the truthiness of a
# one-element array inside `in`.
print("correct: ", test_df.issue.values[0] in top_tpis)

In [None]:
# Source code of the AzureML entry script. The same text is exec'd in the
# notebook for a local smoke test and written to scoring_script.py for
# deployment, so it must be a complete, self-contained Python module.
scoring_script = """
import io
import os
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from azureml.core.model import Model
import joblib

# Number of ranked predictions returned per request.
TOP_K = 5

# (product, issue) pairs known at training time.
PRODUCT_ISSUE_PAIRS = [
    ('Banking Services', 'Managing an account'),
    ('Banking Services', 'Opening an account'),
    ('Banking Services', 'Incorrect information on your report'),
    ('Banking Services', 'Unable to get your credit report or credit score'),
    ('Banking Services', 'Credit monitoring or identity theft protection services'),
    ('Banking Services', 'Closing an account'),
    ('Banking Services', 'Problem caused by your funds being low'),
    ('Banking Services', 'Problem with a lender or other company charging your account'),
    ('Card Services', 'Other features, terms, or problems'),
    ('Card Services', 'Closing your account'),
    ('Card Services', 'Advertising and marketing, including promotional offers'),
    ('Card Services', 'Problem with a purchase shown on your statement'),
    ('Card Services', 'Credit monitoring or identity theft protection services'),
    ('Card Services', 'Unable to get your credit report or credit score'),
    ('Card Services', 'Incorrect information on your report'),
    ('Card Services', 'Trouble using your card'),
    ('Card Services', "Problem with a credit reporting company's investigation into an existing problem"),
    ('Card Services', 'Improper use of your report'),
    ('Card Services', 'Problem when making payments'),
    ('Card Services', 'Getting a credit card'),
    ('Card Services', 'Fees or interest'),
    ('Credit Reporting', 'Credit monitoring or identity theft protection services'),
    ('Credit Reporting', 'Getting a loan or lease'),
    ('Credit Reporting', 'Improper use of your report'),
    ('Credit Reporting', "Problem with a credit reporting company's investigation into an existing problem"),
    ('Credit Reporting', 'Unable to get your credit report or credit score'),
    ('Credit Reporting', 'Incorrect information on your report'),
    ('Debt Collection', 'Disclosure verification of debt'),
    ('Debt Collection', 'Improper contact or sharing of info'),
    ('Debt Collection', 'False statements or representation'),
    ('Debt Collection', 'Communication tactics'),
    ('Debt Collection', 'Attempts to collect debt not owed'),
    ('Debt Collection', "Cont'd attempts collect debt not owed"),
    ('Debt Collection', 'Taking/threatening an illegal action'),
    ('Debt Collection', 'Written notification about debt'),
    ('Debt Collection', 'Took or threatened to take negative or legal action'),
    ('Debt Collection', 'Threatened to contact someone or share information improperly'),
    ('Loans', 'Dealing with my lender or servicer'),
    ('Loans', 'Managing the loan or lease'),
    ('Loans', 'Struggling to repay your loan'),
    ('Loans', 'Getting a loan or lease'),
    ('Loans', 'Problems at the end of the loan or lease'),
    ('Loans', 'Unable to get your credit report or credit score'),
    ('Loans', "Problem with a credit reporting company's investigation into an existing problem"),
    ('Loans', 'Credit monitoring or identity theft protection services'),
    ('Loans', 'Improper use of your report'),
    ('Loans', 'Dealing with your lender or servicer'),
    ('Loans', "Can't repay my loan"),
    ('Loans', 'Incorrect information on your report'),
    ('Loans', 'Struggling to pay your loan'),
]

# issue -> product lookup, built once at import time. An issue listed under
# several products resolves to the LAST product in the list above.
ISSUE_TO_PRODUCT = {issue: product for product, issue in PRODUCT_ISSUE_PAIRS}


def init():
    '''Load the registered model into the module-level pipeline variable.'''
    global pipeline
    model_path = Model.get_model_path(model_name = 'ccmmodel')
    pipeline = joblib.load(model_path)


def run(input_json):
    '''Return the TOP_K most probable issues for the first record of the request.

    input_json is a JSON string in records orientation whose objects carry a
    complaint field. Only the first record is scored.

    The result is a dict with a single predictions entry holding classN
    (product), subclassN (issue) and subclassN_score (class probability,
    formatted as a string) for N = 1..TOP_K, most probable first.
    '''
    # Wrap the payload in StringIO: passing a literal JSON string to read_json
    # is deprecated in recent pandas releases.
    data_df = pd.read_json(io.StringIO(input_json), orient='records')

    # Probabilities for the first record only; flattening a multi-row result
    # would mix rows and produce indices outside the class range.
    probs = pipeline.predict_proba(data_df.complaint)[0]

    # Class indices ordered by descending probability. Keep this order: the
    # scores reported below are the probabilities at these indices.
    top_idx = np.argsort(-probs)[:TOP_K]
    top_issues = pipeline.classes_[top_idx].tolist()
    top_products = [ISSUE_TO_PRODUCT[issue] for issue in top_issues]
    top_scores = [str(score) for score in probs[top_idx]]

    ranks = range(1, len(top_issues) + 1)
    predictions = {}
    predictions.update({f'class{rank}': product for rank, product in zip(ranks, top_products)})
    predictions.update({f'subclass{rank}': issue for rank, issue in zip(ranks, top_issues)})
    predictions.update({f'subclass{rank}_score': score for rank, score in zip(ranks, top_scores)})
    return {'predictions': predictions}
"""

# File name that the deployment cell passes as the entry script.
scoring_script_file_name = 'scoring_script.py'

# Define init() and run() in the notebook namespace so the script can be
# exercised locally, then persist the identical source to disk.
exec(scoring_script)
with open(scoring_script_file_name, "w") as file:
    file.write(scoring_script)

#test locally
# A single training row serialized in the same records-oriented JSON shape
# that run() parses.
json_test_data = df_train.head(1).to_json(orient='records')
print(json_test_data)
init()
run(json_test_data)

In [None]:
from azureml.core.conda_dependencies import CondaDependencies 

# Dependencies for the scoring container, added on top of what
# CondaDependencies() already includes by default.
myenv = CondaDependencies()
for conda_package in ('scikit-learn=0.22.2.post1', 'joblib'):
    myenv.add_conda_package(conda_package)
for pip_package in ("azureml-model-management-sdk", "pandas", "numpy"):
    myenv.add_pip_package(pip_package)


# Persist the environment definition as a .yml file for the inference config.
env_file = "my_env.yml"
with open(env_file,"w") as f:
    f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)

# Echo the saved .yml so the resolved dependency list is visible in the output.
with open(env_file,"r") as f:
    print(f.read())

In [None]:
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig

# Scoring environment: the entry script written earlier plus the conda
# specification saved in the previous cell.
inference_config = InferenceConfig(
    runtime="python",
    entry_script=scoring_script_file_name,
    conda_file=env_file,
)

# Container size for the Azure Container Instances web service.
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service_name = "ccm-service-3"

service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[registered_model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)

# Block until provisioning finishes, streaming progress, then report the state.
service.wait_for_deployment(show_output=True)
print(service.state)