### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import json
import random
from sklearn.model_selection import train_test_split
import pickle


nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hamza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Reading Data

In [2]:
df_testset1 = pd.read_excel("testset1.xlsx")
df_train = pd.read_excel("train.xlsx")
df_testset2 = pd.read_excel("testset2.xlsx")
df_testset4 = pd.read_excel("testset4.xlsx")

df_testset1 = df_testset1.rename(columns={"Definition": "Learning Message"})
df_testset1 = df_testset1[['Learning Message', 'L8 code']]
df_testset2 = df_testset2.rename(columns={"Definition": "Learning Message"})
df_testset2 = df_testset2[['Learning Message', 'L8 code']]
df_testset4 = df_testset4.rename(columns={"Definition": "Learning Message"})
df_testset4 = df_testset4[['Learning Message', 'L8 code']]
df_train = df_train[['Learning Message', 'L8 code']]

df = pd.concat([df_train, df_testset1, df_testset2, df_testset4])
df = df.sample(frac=1).reset_index(drop=True)

df

Unnamed: 0,Learning Message,L8 code
0,Develop partnerships with local organizations ...,SettingGoals
1,"Unfortunately, I have to acknowledge that my p...",PastDFWork
2,Directions\nRead the story below and then work...,EffectsCB
3,Ensure AI and machine learning applications do...,DigitalEntrepreneurMinimizeRisk
4,DirectionsWrite two things you learned about h...,PrioritizeLife
...,...,...
14615,Think twice about sharing your real name on so...,AnonyPI
14616,Not all online actions have the same impact.,DefDF
14617,Evaluate the impact on affected individuals an...,StrategyData
14618,Enable two-factor authentication on accounts f...,PurchaseOnline


### Split data into train-test

In [3]:
df_train = df[:int(0.9*len(df))].copy()
df_testset1 =  df[int(0.9*len(df)):].copy()

In [4]:
common_values = df_train['L8 code'].isin(df_testset1['L8 code'])
df_train = df_train[common_values]
testset1 = df_testset1[df_testset1['L8 code'].isin(df_train['L8 code'])]

In [5]:
df_train

Unnamed: 0,Learning Message,L8 code
0,Develop partnerships with local organizations ...,SettingGoals
1,"Unfortunately, I have to acknowledge that my p...",PastDFWork
2,Directions\nRead the story below and then work...,EffectsCB
3,Ensure AI and machine learning applications do...,DigitalEntrepreneurMinimizeRisk
4,DirectionsWrite two things you learned about h...,PrioritizeLife
...,...,...
13153,"Before the video, ask: Have you ever noticed a...",MediaInfKnow
13154,Share how meeting an online friend impacted yo...,StrangerRisk
13155,“THE 3-2-1 RULE”!3That means playing games LES...,STEnt
13156,• Collect ads that use these techniques. Make...,DigMarketing


In [6]:
df_testset1

Unnamed: 0,Learning Message,L8 code
13158,Find an article on the Internet about shark fi...,ComposingIPstate
13159,• What about the market position? Have these c...,KnowBiz
13160,Prioritize researching privacy concerns before...,KnowMyPISafe
13161,Identify whom you should talk back to about it...,FollowChainDF
13162,Turn off your webcam when it's not needed to p...,SkillsLimitPI
...,...,...
14615,Think twice about sharing your real name on so...,AnonyPI
14616,Not all online actions have the same impact.,DefDF
14617,Evaluate the impact on affected individuals an...,StrategyData
14618,Enable two-factor authentication on accounts f...,PurchaseOnline


###  Data Preprocessing

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return tokens

In [8]:
df_train['preprocessed_text'] = df_train['Learning Message'].apply(preprocess_text)
df_train

Unnamed: 0,Learning Message,L8 code,preprocessed_text
0,Develop partnerships with local organizations ...,SettingGoals,"[develop, partnership, local, organization, bu..."
1,"Unfortunately, I have to acknowledge that my p...",PastDFWork,"[unfortunately, acknowledge, past, online, act..."
2,Directions\nRead the story below and then work...,EffectsCB,"[direction, read, story, work, partner, answer..."
3,Ensure AI and machine learning applications do...,DigitalEntrepreneurMinimizeRisk,"[ensure, ai, machine, learning, application, p..."
4,DirectionsWrite two things you learned about h...,PrioritizeLife,"[directionswrite, two, thing, learned, social,..."
...,...,...,...
13153,"Before the video, ask: Have you ever noticed a...",MediaInfKnow,"[video, ask, ever, noticed, product, recognize..."
13154,Share how meeting an online friend impacted yo...,StrangerRisk,"[share, meeting, online, friend, impacted, per..."
13155,“THE 3-2-1 RULE”!3That means playing games LES...,STEnt,"[rulethat, mean, playing, game, le, time, week..."
13156,• Collect ads that use these techniques. Make...,DigMarketing,"[collect, ad, use, technique, make, display, a..."


In [9]:
df_testset1['preprocessed_text'] = df_testset1['Learning Message'].apply(preprocess_text)
df_testset1

Unnamed: 0,Learning Message,L8 code,preprocessed_text
13158,Find an article on the Internet about shark fi...,ComposingIPstate,"[find, article, internet, shark, finning, url,..."
13159,• What about the market position? Have these c...,KnowBiz,"[market, position, company, managed, develop, ..."
13160,Prioritize researching privacy concerns before...,KnowMyPISafe,"[prioritize, researching, privacy, concern, pu..."
13161,Identify whom you should talk back to about it...,FollowChainDF,"[identify, talk, back, advertiser, company, ma..."
13162,Turn off your webcam when it's not needed to p...,SkillsLimitPI,"[turn, webcam, needed, prevent, unauthorized, ..."
...,...,...,...
14615,Think twice about sharing your real name on so...,AnonyPI,"[think, twice, sharing, real, name, social, me..."
14616,Not all online actions have the same impact.,DefDF,"[online, action, impact]"
14617,Evaluate the impact on affected individuals an...,StrategyData,"[evaluate, impact, affected, individual, offer..."
14618,Enable two-factor authentication on accounts f...,PurchaseOnline,"[enable, twofactor, authentication, account, e..."


### Labels count

In [10]:
print(df_train["L8 code"].value_counts())

L8 code
FactCheckingInfo      186
BasicKnowContRisk     177
KnowHardSoftWare      174
MediaInfKnow          165
BasicContentSkills    139
                     ... 
RecognizeAIinTools     41
DealCon                41
AttTechPI              37
FamilyTechAware        33
MutualApproval         29
Name: count, Length: 208, dtype: int64


### TF-IDF (Term Frequency-Inverse Document Frequency):

In [40]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(df_train['preprocessed_text'].apply(lambda x: ' '.join(x)))
tfidf_test = tfidf_vectorizer.transform(df_testset1["preprocessed_text"].apply(lambda x: ' '.join(x)))

In [41]:
tfidf_vectors

<13158x14636 sparse matrix of type '<class 'numpy.float64'>'
	with 232368 stored elements in Compressed Sparse Row format>

In [42]:
tfidf_test

<1462x14636 sparse matrix of type '<class 'numpy.float64'>'
	with 25135 stored elements in Compressed Sparse Row format>

### Label Encode L8,L9

In [43]:
train_8=list(df_train["L8 code"].tolist())
test_8=list(df_testset1["L8 code"].tolist())

In [44]:
eight=list(set(train_8+test_8))
l8={value: index for index, value in enumerate(eight)}

In [45]:
df_train["L8"]=df_train["L8 code"].map(l8)
df_testset1["L8"]=df_testset1["L8 code"].map(l8)

### Logistic Regression

In [17]:
logreg_model = LogisticRegression()

logreg_model.fit(tfidf_vectors, df_train["L8 code"])

y_pred_logreg = logreg_model.predict(tfidf_test)

accuracy_logreg = accuracy_score(df_testset1["L8 code"], y_pred_logreg)
accuracy = accuracy_score(df_testset1["L8 code"], y_pred_logreg)
precision = precision_score(df_testset1["L8 code"], y_pred_logreg, average='weighted')
recall = recall_score(df_testset1["L8 code"], y_pred_logreg, average='weighted')
f1 = f1_score(df_testset1["L8 code"], y_pred_logreg, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.38645690834473323
Precision: 0.4661062652177826
Recall: 0.38645690834473323
F1-score: 0.37221063881171385


  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [18]:
decision_tree_model = DecisionTreeClassifier()

decision_tree_model.fit(tfidf_vectors, df_train["L8 code"])

y_pred_dt = decision_tree_model.predict(tfidf_test)

accuracy_logreg = accuracy_score(df_testset1["L8 code"], y_pred_dt)
accuracy = accuracy_score(df_testset1["L8 code"], y_pred_dt)
precision = precision_score(df_testset1["L8 code"], y_pred_dt, average='weighted')
recall = recall_score(df_testset1["L8 code"], y_pred_dt, average='weighted')
f1 = f1_score(df_testset1["L8 code"], y_pred_dt, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.2694938440492476
Precision: 0.28898994871977085
Recall: 0.2694938440492476
F1-score: 0.2655522401945359


### Random Forest

In [19]:
random_forest_model = RandomForestClassifier()

random_forest_model.fit(tfidf_vectors, df_train["L8 code"])

y_pred_rf = random_forest_model.predict(tfidf_test)

accuracy_logreg = accuracy_score(df_testset1["L8 code"], y_pred_rf)
accuracy = accuracy_score(df_testset1["L8 code"], y_pred_rf)
precision = precision_score(df_testset1["L8 code"], y_pred_rf, average='weighted')
recall = recall_score(df_testset1["L8 code"], y_pred_rf, average='weighted')
f1 = f1_score(df_testset1["L8 code"], y_pred_rf, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.38919288645690836
Precision: 0.4358984420104955
Recall: 0.38919288645690836
F1-score: 0.37159902897167657


### SVM

In [46]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer

svm_model = SVC(kernel='linear')

svm_model.fit(tfidf_vectors, df_train["L8 code"])
y_pred_svm = svm_model.predict(tfidf_test)

accuracy_svm = accuracy_score(df_testset1["L8 code"], y_pred_svm)
precision_svm = precision_score(df_testset1["L8 code"], y_pred_svm, average='weighted')
recall_svm = recall_score(df_testset1["L8 code"], y_pred_svm, average='weighted')
f1_svm = f1_score(df_testset1["L8 code"], y_pred_svm, average='weighted')

print("SVM Classifier Metrics:")
print("Accuracy:", accuracy_svm)
print("Precision:", precision_svm)
print("Recall:", recall_svm)
print("F1-score:", f1_svm)

SVM Classifier Metrics:
Accuracy: 0.4418604651162791
Precision: 0.49438273600798954
Recall: 0.4418604651162791
F1-score: 0.43899869076598363


  _warn_prf(average, modifier, msg_start, len(result))


# Using another embedding technique

### Glove Vectors

In [21]:
import gensim.downloader as api

glove_vectors = api.load("glove-wiki-gigaword-300")


def embed(x):
    tokens = x
    embedded_tokens = []
    for token in tokens:
        if token in glove_vectors:
            embedded_tokens.append(glove_vectors[token])
    return np.mean(embedded_tokens, axis=0)

In [22]:
train = df_train["preprocessed_text"].apply(embed)
test = df_testset1["preprocessed_text"].apply(embed)

In [23]:
train

0        [-0.18963581, 0.008763462, -0.065585226, -0.33...
1        [-0.09119807, 0.15561, -0.028068157, -0.101118...
2        [-0.050087072, 0.048103854, -0.16219346, -0.08...
3        [-0.03390626, 0.02402198, -0.04094587, -0.0901...
4        [-0.03811245, 0.05907425, 0.099199995, -0.1213...
                               ...                        
13153    [-0.07161873, 0.3926502, 0.005231486, -0.02639...
13154    [-0.32081452, 0.025867004, -0.07973113, -0.151...
13155    [-0.21206024, 0.24310327, -0.0117567135, -0.08...
13156    [0.024368001, 0.28551495, 0.045916714, -0.1914...
13157    [-0.15403785, 0.053178158, 0.008383612, -0.018...
Name: preprocessed_text, Length: 13158, dtype: object

In [24]:
y_train = df_train["L8 code"]
y_test = df_testset1["L8 code"]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train.tolist())
X_test_scaled = scaler.transform(test.tolist())

### Logistic Regression

In [25]:
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
y_pred = logreg.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Accuracy: 0.32694938440492477
Precision: 0.35859009779505474
Recall: 0.32694938440492477
F1-score: 0.3295519215044941


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


### Decision Tree

In [26]:
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train_scaled, y_train)
dt_y_pred = dt_classifier.predict(X_test_scaled)

dt_accuracy = accuracy_score(y_test, dt_y_pred)
dt_precision = precision_score(y_test, dt_y_pred, average='weighted')
dt_recall = recall_score(y_test, dt_y_pred, average='weighted')
dt_f1 = f1_score(y_test, dt_y_pred, average='weighted')

print("Decision Tree Classifier:")
print("Accuracy:", dt_accuracy)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1-score:", dt_f1)

Decision Tree Classifier:
Accuracy: 0.0677154582763338
Precision: 0.08131331273026139
Recall: 0.0677154582763338
F1-score: 0.06739887131004903


  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [27]:
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_scaled, y_train)
rf_y_pred = rf_classifier.predict(X_test_scaled)

rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_precision = precision_score(y_test, rf_y_pred, average='weighted')
rf_recall = recall_score(y_test, rf_y_pred, average='weighted')
rf_f1 = f1_score(y_test, rf_y_pred, average='weighted')

print("\nRandom Forest Classifier:")
print("Accuracy:", rf_accuracy)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1-score:", rf_f1)

  _warn_prf(average, modifier, msg_start, len(result))



Random Forest Classifier:
Accuracy: 0.2708618331053352
Precision: 0.28514842757177156
Recall: 0.2708618331053352
F1-score: 0.24873928848703306


### SVM

In [28]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

svm_classifier = SVC()
svm_classifier.fit(X_train_scaled, y_train)
svm_y_pred = svm_classifier.predict(X_test_scaled)

svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_precision = precision_score(y_test, svm_y_pred, average='weighted')
svm_recall = recall_score(y_test, svm_y_pred, average='weighted')
svm_f1 = f1_score(y_test, svm_y_pred, average='weighted')

print("\nSupport Vector Machine Classifier:")
print("Accuracy:", svm_accuracy)
print("Precision:", svm_precision)
print("Recall:", svm_recall)
print("F1-score:", svm_f1)

  _warn_prf(average, modifier, msg_start, len(result))



Support Vector Machine Classifier:
Accuracy: 0.41381668946648426
Precision: 0.4801125955038228
Recall: 0.41381668946648426
F1-score: 0.4134383154442754


### Best Model is SVM

### Testing for testset1, 2 and 4

In [73]:
def test(filee):
    t1=pd.read_excel(filee)
    t1['preprocessed_text'] = t1['Definition'].apply(preprocess_text)
    t1_vector = TfidfVectorizer()
    t1_vectors = tfidf_vectorizer.transform(t1['preprocessed_text'].apply(lambda x: ' '.join(x)))
    t1["L8"]=t1["L8 code"].map(l8)

    y_pred_svm = svm_model.predict(t1_vectors)

    accuracy_svm = accuracy_score(t1["L8 code"], y_pred_svm)
    precision_svm = precision_score(t1["L8 code"], y_pred_svm, average='weighted')
    recall_svm = recall_score(t1["L8 code"], y_pred_svm, average='weighted')
    f1_svm = f1_score(t1["L8 code"], y_pred_svm, average='weighted')

    print("SVM Classifier Metrics:")
    print("Accuracy:", accuracy_svm)
    print("Precision:", precision_svm)
    print("Recall:", recall_svm)
    print("F1-score:", f1_svm)

    return y_pred_svm,t1["L8 code"] 

In [79]:
pred,true=test("testset1.xlsx")
pd.concat([pd.DataFrame(pred),true],axis=1).to_csv("testset1_result.csv")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Classifier Metrics:
Accuracy: 0.8533333333333334
Precision: 0.8625185185185185
Recall: 0.8533333333333334
F1-score: 0.8439906629318394


In [80]:
pred,true=test("testset2.xlsx")
pd.concat([pd.DataFrame(pred),true],axis=1).to_csv("testset2_result.csv")

SVM Classifier Metrics:
Accuracy: 0.8249427917620137
Precision: 0.8290219696566576
Recall: 0.8249427917620137
F1-score: 0.8157534326721272


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
pred,true=test("testset4.xlsx")
pd.concat([pd.DataFrame(pred),true],axis=1).to_csv("testset4_result.csv")

SVM Classifier Metrics:
Accuracy: 0.8473592571096924
Precision: 0.8601832655662801
Recall: 0.8473592571096924
F1-score: 0.845834193298301


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
