In [1]:
!pip install nltk
!pip install

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
[0m

In [2]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


agreements = [
    "This Employment Agreement is made as of the 1st day of January 2023, between John Doe (\"Employee\") and ABC Corporation (\"Employer\"). The Employee agrees to perform the duties of Software Engineer. The employment is at-will and can be terminated by either party at any time.",
    "This Employment Agreement is entered into on February 15, 2023, by Jane Smith (\"Employee\") and XYZ Inc. (\"Employer\"). The Employee will serve as a Data Analyst. This agreement shall remain in effect until terminated by either party with a two weeks' notice."
]

# Initialize nltk and spacy
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load('en_core_web_sm')

# Preprocessing function
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Stop-word Removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return tokens

# Apply preprocessing
preprocessed_agreements = [preprocess_text(agreement) for agreement in agreements]

preprocessed_agreements


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


[['Employment',
  'Agreement',
  'made',
  '1st',
  'day',
  'January',
  '2023',
  ',',
  'John',
  'Doe',
  '(',
  '``',
  'Employee',
  "''",
  ')',
  'ABC',
  'Corporation',
  '(',
  '``',
  'Employer',
  "''",
  ')',
  '.',
  'Employee',
  'agrees',
  'perform',
  'duty',
  'Software',
  'Engineer',
  '.',
  'employment',
  'at-will',
  'terminated',
  'either',
  'party',
  'time',
  '.'],
 ['Employment',
  'Agreement',
  'entered',
  'February',
  '15',
  ',',
  '2023',
  ',',
  'Jane',
  'Smith',
  '(',
  '``',
  'Employee',
  "''",
  ')',
  'XYZ',
  'Inc.',
  '(',
  '``',
  'Employer',
  "''",
  ')',
  '.',
  'Employee',
  'serve',
  'Data',
  'Analyst',
  '.',
  'agreement',
  'shall',
  'remain',
  'effect',
  'terminated',
  'either',
  'party',
  'two',
  'week',
  "'",
  'notice',
  '.']]

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample data
agreements = [
    "This Employment Agreement is made as of the 1st day of January 2023, between John Doe (\"Employee\") and ABC Corporation (\"Employer\"). The Employee agrees to perform the duties of Software Engineer. The employment is at-will and can be terminated by either party at any time.",
    "This Employment Agreement is entered into on February 15, 2023, by Jane Smith (\"Employee\") and XYZ Inc. (\"Employer\"). The Employee will serve as a Data Analyst. This agreement shall remain in effect until terminated by either party with a two weeks' notice."
]

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(agreements)

# Feature names
feature_names = vectorizer.get_feature_names_out()

# Convert to DataFrame for better readability
import pandas as pd
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=feature_names)
df_tfidf


Unnamed: 0,15,1st,2023,abc,agreement,agrees,analyst,and,any,as,...,the,this,time,to,two,until,weeks,will,with,xyz
0,0.0,0.137022,0.097492,0.137022,0.097492,0.137022,0.0,0.194985,0.137022,0.097492,...,0.389969,0.097492,0.137022,0.137022,0.0,0.0,0.0,0.097492,0.0,0.0
1,0.169837,0.0,0.12084,0.0,0.241681,0.0,0.169837,0.12084,0.0,0.12084,...,0.12084,0.241681,0.0,0.0,0.169837,0.169837,0.169837,0.12084,0.169837,0.169837


In [4]:
import spacy

# Load spaCy's pre-trained NER model
nlp = spacy.load('en_core_web_sm')

# Sample data
agreements = [
    "This Employment Agreement is made as of the 1st day of January 2023, between John Doe (\"Employee\") and ABC Corporation (\"Employer\"). The Employee agrees to perform the duties of Software Engineer. The employment is at-will and can be terminated by either party at any time.",
    "This Employment Agreement is entered into on February 15, 2023, by Jane Smith (\"Employee\") and XYZ Inc. (\"Employer\"). The Employee will serve as a Data Analyst. This agreement shall remain in effect until terminated by either party with a two weeks' notice."
]

# Extract entities
for doc in nlp.pipe(agreements):
    print([(ent.text, ent.label_) for ent in doc.ents])


[('This Employment Agreement', 'ORG'), ('the 1st day of January 2023', 'DATE'), ('John Doe', 'PERSON'), ('ABC Corporation', 'ORG'), ('Software Engineer', 'ORG')]
[('This Employment Agreement', 'ORG'), ('February 15, 2023', 'DATE'), ('Jane Smith', 'PERSON'), ('XYZ Inc.', 'ORG'), ('Employee', 'ORG'), ("two weeks'", 'DATE')]


In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

# Sample Data Preparation
# Replace this with loading your actual data
data = {
    'feature1': np.random.rand(100),
    'feature2': np.random.rand(100),
    'feature3': np.random.rand(100),
    'target': np.random.randint(0, 2, 100)
}

df = pd.DataFrame(data)
X = df[['feature1', 'feature2', 'feature3']]
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the trained model
model_file_name = 'employment_model.pkl'
with open('employment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print(f"Model saved as {model_file_name}")


              precision    recall  f1-score   support

           0       0.50      0.60      0.55        10
           1       0.50      0.40      0.44        10

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.49        20
weighted avg       0.50      0.50      0.49        20

Model saved as employment_model.pkl


In [8]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.34.139-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore<1.35.0,>=1.34.139 (from boto3)
  Downloading botocore-1.34.139-py3-none-any.whl (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3)
  Downloading s3transfer-0.10.2-py3-none-any.whl (82 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.139 botocore-1.34.139 jmespath-1.0.1 s3transfer-0.10.2


In [22]:
!pip install google-cloud-storage




In [24]:
!pip install azureml-core

Collecting azureml-core
  Downloading azureml_core-1.56.0-py3-none-any.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting backports.tempfile (from azureml-core)
  Downloading backports.tempfile-1.0-py2.py3-none-any.whl (4.4 kB)
Collecting pathspec<1.0.0 (from azureml-core)
  Downloading pathspec-0.12.1-py3-none-any.whl (31 kB)
Collecting msal<2.0.0,>=1.15.0 (from azureml-core)
  Downloading msal-1.29.0-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.9/110.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting msal-extensions<=2.0.0,>=0.3.0 (from azureml-core)
  Downloading msal_extensions-1.2.0-py3-none-any.whl (19 kB)
Collecting knack<0.12.0 (from azureml-core)
  Downloading knack-0.11.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCo

In [None]:
from azureml.core import Workspace

# Create a workspace using your subscription details
ws = Workspace.create(name='your_workspace_name',
                      subscription_id='your_subscription_id',
                      resource_group='your_resource_group',
                      create_resource_group=True,
                      location='your_preferred_location')

# Alternatively, if you have already created the workspace
# ws = Workspace.from_config()


In [None]:
import pickle
from azureml.core import Workspace, Model

# Load your workspace
ws = Workspace.from_config()

# Load the trained model
model_filename = 'employment_model.pkl'
with open(model_filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Register the model
model = Model.register(workspace=ws,
                       model_path=model_filename,  # Local file to upload and register as a model
                       model_name='employment_model')  # Name of the model in Azure ML


In [25]:
import json
import numpy as np
import pickle
from azureml.core.model import Model

def init():
    global model
    model_path = Model.get_model_path('employment_model')
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

def run(raw_data):
    data = np.array(json.loads(raw_data)['data'])
    predictions = model.predict(data)
    return json.dumps(predictions.tolist())


In [26]:
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig

# Create environment
env = Environment.from_conda_specification(name='project_environment', file_path='environment.yml')

# Create inference configuration
inference_config = InferenceConfig(entry_script='score.py', environment=env)


FileNotFoundError: [Errno 2] No such file or directory: 'environment.yml'

In [28]:
from azureml.core.webservice import AciWebservice, Webservice

# Define deployment configuration
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

# Deploy the model
service = Model.deploy(workspace=ws,
                       name='employment-model-service',
                       models=[model],
                       inference_config=inference_config,
                       deployment_config=deployment_config)
service.wait_for_deployment(show_output=True)


NameError: name 'ws' is not defined

In [29]:
print(service.scoring_uri)


NameError: name 'service' is not defined

In [31]:
!pip install Flask




In [32]:
from flask import Flask, request, jsonify
import requests

app = Flask(__name__)

# Replace with your service's scoring URI
scoring_uri = 'your_scoring_uri'

@app.route('/predict', methods=['POST'])
def predict():
    input_data = request.json
    headers = {'Content-Type': 'application/json'}
    response = requests.post(scoring_uri, json=input_data, headers=headers)
    return jsonify(response.json())

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [34]:
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split


# Load your test data (replace with your own data loading process)
data = pd.DataFrame(data)
X = df[['feature1', 'feature2', 'feature3']]
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the trained model
model_file_name = 'employment_model.pkl'
with open('employment_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)




              precision    recall  f1-score   support

           0       0.50      0.60      0.55        10
           1       0.50      0.40      0.44        10

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.49        20
weighted avg       0.50      0.50      0.49        20



In [35]:
from sklearn.metrics import classification_report

# Generate a classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.50      0.60      0.55        10
           1       0.50      0.40      0.44        10

    accuracy                           0.50        20
   macro avg       0.50      0.50      0.49        20
weighted avg       0.50      0.50      0.49        20



In [36]:
from sklearn.model_selection import GridSearchCV

# Define a parameter grid to search
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best:.2f}')

# Print the best parameters found by GridSearchCV
print(f'Best Parameters: {grid_search.best_params_}')


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Model Accuracy: 0.50
Best Parameters: {'max_depth': None, 'n_estimators': 200}


In [37]:
# Retrain the best model on the entire dataset
best_model.fit(X, y)

# Optionally, save the model for deployment
import pickle

model_filename = 'best_employment_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

print(f'Best model saved as {model_filename}')


Best model saved as best_employment_model.pkl
