Load and prepare the data

In [13]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence-embedding checkpoint used to vectorise the descriptions.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

# Build the encoder once; a later cell calls model.encode() on the text column.
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [14]:
import pandas as pd

# Labelled records are read from the "Heat pumps" sheet of the review workbook.
# NOTE(review): absolute, user-specific path — as written this only runs on the
# author's machine.
path = "/Users/tom.willcocks/Downloads/Copy of ISS_technologies_to_review_August_10.xlsx"
df = pd.read_excel(path, sheet_name="Heat pumps")

# Count how many rows carry each of the "hit" label values 0, 0.5 and 1.
for label in (0, 0.5, 1):
    print(f"{label}:", (df["hit"] == label).sum())

0: 28
0.5: 35
1: 99


In [15]:
# Promote every 0.5 label to 1 (modifies the "hit" column of df in place).
is_half_hit = df["hit"] == 0.5
df.loc[is_half_hit, "hit"] = 1

# Re-count the three label values to confirm that no 0.5 rows remain.
for label in (0, 0.5, 1):
    print(f"{label}:", (df["hit"] == label).sum())

0: 28
0.5: 0
1: 134


In [16]:
import json

# JSON file holding the sample of GtR project descriptions that are labelled 0
# further down.
# NOTE(review): absolute, user-specific path.
new_path = "/Users/tom.willcocks/Downloads/gtrprojects_sample.txt"

# Decode explicitly as UTF-8. Without an encoding argument open() uses the
# platform's locale encoding, so non-ASCII characters in the descriptions could
# be mis-decoded (or raise) on a machine whose locale is not UTF-8.
with open(new_path, 'r', encoding="utf-8") as file:
    neg_descriptions = json.load(file)

# Sanity check: number of descriptions loaded, followed by the first one.
print(len(neg_descriptions), neg_descriptions[0])

106 Abstracts are not currently available in GtR for all funded research. This is normally because the abstract was not required at the time of proposal submission, but may be because it included sensitive information such as personal details.


In [17]:
# Put the loaded descriptions in their own frame and label every one of them 0,
# using the same "description" / "hit" column names as the labelled frame.
# NOTE(review): the first entry printed when the file was loaded is GtR's
# "Abstracts are not currently available" placeholder rather than a real
# abstract; such rows may be worth filtering out — TODO confirm.
zero_labels = [0 for _ in neg_descriptions]
df_negs = pd.DataFrame({'description': neg_descriptions, 'hit': zero_labels})

# Preview the first few rows.
print(df_negs.head())

                                         description  hit
0  Abstracts are not currently available in GtR f...    0
1  With Covid-19 causing significant limitations ...    0
2  Laser peening (LP) is a relatively new surface...    0
3  Doctoral Training Partnerships: a range of pos...    0
4  Applications are invited for a PhD position in...    0


In [18]:
# Stack the zero-labelled descriptions underneath the labelled rows and
# renumber the index from 0.
# NOTE(review): this rebinds df, so running the cell a second time appends
# df_negs again.
frames = [df, df_negs]
df = pd.concat(frames, ignore_index=True)

# Label counts after the merge.
for label in (0, 0.5, 1):
    print(f"{label}:", (df["hit"] == label).sum())

0: 134
0.5: 0
1: 134


In [19]:
# Sanity check before embedding: report every "description" value that is not a
# Python string (for example a missing cell). Nothing is printed when every
# value is a string.
# Only the one column is walked; iterrows() would build a full Series for each
# row just to read a single field.
for index, description in df["description"].items():
    if not isinstance(description, str):
        print(f"Record at index {index} is not a string: {description}")


In [20]:
# Sentences are encoded by calling model.encode().
# Pass a plain list of strings rather than the pandas Series: encode() is
# documented for a string or a list of strings, and a Series is looked up by
# index label, so handing it over directly only behaves while the index is the
# default 0..n-1 (e.g. it would break after filtering rows without a reset).
sentences = df["description"].tolist()
embeddings = model.encode(sentences)

# Show the container type, the number of vectors and the length of one vector.
print(type(embeddings),len(embeddings),len(embeddings[0]))

<class 'numpy.ndarray'> 268 384


Split the data

In [21]:
from sklearn.model_selection import train_test_split

# Features are the sentence embeddings; targets are the "hit" labels as an array.
X, y = embeddings, df["hit"].values

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified — the recorded test set holds 21
# zeros against 33 ones. Consider stratify=y — TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train the model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Create the logistic-regression classifier and fit it on the training split in
# one step (fit() returns the estimator itself).
# max_iter caps the number of solver iterations; random_state fixes the seed.
clf = LogisticRegression(max_iter=100, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = clf.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8888888888888888


Evaluate the model

In [23]:
from sklearn.metrics import accuracy_score, classification_report

# Recompute the held-out predictions so this cell does not rely on y_pred from
# the training cell.
y_pred = clf.predict(X_test)

# Headline metric first...
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# ...then the per-class precision, recall, f1-score and support table.
report = classification_report(y_test, y_pred)
print(report)

Accuracy: 0.8888888888888888
              precision    recall  f1-score   support

         0.0       0.89      0.81      0.85        21
         1.0       0.89      0.94      0.91        33

    accuracy                           0.89        54
   macro avg       0.89      0.87      0.88        54
weighted avg       0.89      0.89      0.89        54



In [24]:
from joblib import dump

# Destination file for the fitted classifier.
# NOTE(review): absolute, user-specific path — the save fails on any machine
# without this directory.
model_path = "/Users/tom.willcocks/Documents/code/discovery_utils/tests/ml_experimentation/heat_pump_model_logreg.joblib"

# Serialise clf with joblib; as the cell's last expression, the list of written
# file names returned by dump() is displayed as the cell output.
dump(value=clf, filename=model_path)

['/Users/tom.willcocks/Documents/code/discovery_utils/tests/ml_experimentation/heat_pump_model_logreg.joblib']