# Imports

In [14]:
!pip install pandas
!pip install scikit-learn



In [15]:
# Necessary imports and dependencies
import json
import pandas as pd
import re
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Pre-processing

In [16]:
""" Removes unnecessary symbols from the text """


def clean_text(s: str) -> str:
    """Remove unnecessary symbols from the text and normalise it.

    Every character that is not an ASCII letter, a digit or whitespace is
    dropped, the result is lowercased, and runs of whitespace are collapsed
    into a single space.

    Args:
        s: Raw input text.

    Returns:
        The cleaned, lowercased text without leading or trailing whitespace.
    """
    # Only retain alphanumeric and whitespace characters.
    # The pattern deliberately has no leading "|" (an empty alternative) and
    # no re.IGNORECASE: the class already lists both cases, and with
    # IGNORECASE a few non-ASCII letters (e.g. the Kelvin sign U+212A) would
    # be treated as matching a-z and survive the filter.
    s = re.sub(pattern=r"[^a-zA-Z0-9\s]", repl="", string=s)

    # Convert to lowercase
    s = s.lower()

    # Remove extra whitespaces
    s = re.sub(pattern=r"\s+", repl=" ", string=s).strip()

    return s


""" Implements pipeline of pre-processing techniques """


def preprocess(text: str):
    """Implement the pipeline of pre-processing techniques.

    The pipeline currently consists of a single step: `clean_text`.
    """
    cleaned = clean_text(text)
    return cleaned

# Vectorizer

In [17]:
# Load the training dataset
file_path = "data/train.json1"

data = []
# Read explicitly as UTF-8: the texts contain non-ASCII characters and the
# platform default encoding is not UTF-8 everywhere.
with open(file_path, "r", encoding="utf-8") as f:
    # Each line in `train.json1` corresponds to a record
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line), which are not valid JSON
        if line:
            data.append(json.loads(line))

# Flatten the list of records into a DataFrame
df = pd.json_normalize(data)

# Keep the raw text and store the pre-processed version alongside it
df["cleaned"] = df["text"].apply(preprocess)

df

Unnamed: 0,text,label,cleaned
0,− Scope 3: Optional scope that includes indire...,1,scope 3 optional scope that includes indirect ...
1,The Group is not aware of any noise pollution ...,0,the group is not aware of any noise pollution ...
2,Global climate change could exacerbate certain...,0,global climate change could exacerbate certain...
3,Setting an investment horizon is part and parc...,0,setting an investment horizon is part and parc...
4,Climate change the physical impacts of climate...,0,climate change the physical impacts of climate...
...,...,...,...
995,Greenhouse gas Mitigation Measures Our five ye...,1,greenhouse gas mitigation measures our five ye...
996,We have updated our external sector statements...,1,we have updated our external sector statements...
997,STOREBRAND'S USE Task Force on Climate-related...,0,storebrands use task force on climaterelated f...
998,Estimations of nanced emissions indicate the i...,1,estimations of nanced emissions indicate the i...


In [18]:
# Initialize the vectorizer: unigrams and bigrams, ignoring terms that appear
# in fewer than 5% or in more than 85% of the documents
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=0.05, max_df=0.85)

# Learn the vocabulary / IDF weights and vectorize the cleaned training text
X = vectorizer.fit_transform(df["cleaned"])

# Persist the fitted vectorizer; the context manager guarantees the file
# handle is flushed and closed
with open("vectorizer.sav", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Model Training

In [19]:
# Train on the full vectorized dataset; no hold-out split is made here
X_train, y_train = X, df["label"]

# Initialize MultinomialNB
nb_model = MultinomialNB()

# Train the model
nb_model.fit(X_train, y_train)

# Save the model (the context manager closes the file handle)
filename = "trad_model.sav"
with open(filename, "wb") as model_file:
    pickle.dump(nb_model, model_file)

# Load the model back to check that the saved file is usable.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(filename, "rb") as model_file:
    load_model = pickle.load(model_file)

# Smoke test: the input must go through the same pre-processing and the same
# fitted vectorizer as the training data
test_input = list(map(preprocess, ["Climate change is a global issue."]))

print(load_model.predict(vectorizer.transform(test_input)))

[0]


In [20]:
# Load the dev set and add the pre-processed text as a new column
df_test = pd.read_csv("data/dev.csv").assign(
    cleaned=lambda frame: frame["text"].apply(preprocess)
)

# Vectorize with the already-fitted vectorizer (transform only, no re-fit)
X_test = vectorizer.transform(df_test["cleaned"])
y_test = df_test["label"]

# Predict the dev-set labels with the trained model
y_pred = nb_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.74375

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.87      0.84        53
           1       0.75      0.80      0.77        81
           2       0.50      0.31      0.38        26

    accuracy                           0.74       160
   macro avg       0.68      0.66      0.66       160
weighted avg       0.73      0.74      0.73       160



# Testing New Data

In [21]:
# New data to test the model
new_data = [
    "Climate change is a global issue.",
    "Green initiatives combat climate change.",
    "Climate change affects everyone.",
    "Global warming is alarming.",
]

new_data = list(map(lambda x: preprocess(x), new_data))

# Preprocess and transform the new data
new_X = vectorizer.transform(new_data)

predictions = nb_model.predict(new_X)

# Map the predictions to labels (if applicable)
class_names = {0: "Risk", 1: "Neutral", 2: "Opportunity"}
predicted_labels = [class_names[label] for label in predictions]

# Display predictions
for text, label in zip(new_data, predicted_labels):
    print(f"Text: {text}\nPredicted Label: {label}\n")

Text: climate change is a global issue
Predicted Label: Risk

Text: green initiatives combat climate change
Predicted Label: Opportunity

Text: climate change affects everyone
Predicted Label: Risk

Text: global warming is alarming
Predicted Label: Neutral

