In [1]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BertConfig

# Load tokenizer from original model (critical for compatibility)
tokenizer = AutoTokenizer.from_pretrained("sadickam/sdgBERT")

# Load config with 17 labels
config = BertConfig.from_pretrained("/Users/mass/Documents/Masters/Courses/Connected Politics/Model_params_backup/config.json")

# Load model weights
model = AutoModelForSequenceClassification.from_pretrained(
    "/Users/mass/Documents/Masters/Courses/Connected Politics/Model_params_backup/model.safetensors",
    config = config
)

In [3]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("/Users/mass/Documents/Masters/Courses/Connected Politics/Model_params_backup")
print("Label Mappings:", config.id2label)
# Should output: {0: 'SDG_0', 1: 'SDG_1', ..., 16: 'SDG_16'}

Label Mappings: {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16'}


In [4]:
def process_discourse(file_path, subfolder):
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Tokenize and encode the text
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
    
    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get probabilities
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Create a dictionary with results
    results = {
        'Unique_ID': os.path.basename(file_path),
        'Subfolder': subfolder
    }
    
    # Add probabilities for all available SDGs
    for i in range(probs.shape[1]):
        results[f'SDG_{i}'] = probs[0][i].item()
    
    return results




In [5]:
# Specify the folder containing discourse files
main_folder = "/Users/mass/Documents/Masters/Courses/Connected Politics/Github repository/Content/UN Corpus"
results = []

In [6]:
# Process all discourse files
for subfolder in os.listdir(main_folder):
    subfolder_path = os.path.join(main_folder, subfolder)
    if os.path.isdir(subfolder_path):
        for file_name in os.listdir(subfolder_path):
            if file_name.endswith('.txt'):
                file_path = os.path.join(subfolder_path, file_name)
                result = process_discourse(file_path, subfolder)
                results.append(result)


KeyboardInterrupt: 

In [7]:
# Create a DataFrame from the results
if results:
    df_results = pd.DataFrame(results)
else:
    df_results = pd.DataFrame(columns=['Unique_ID', 'Subfolder'])  # Create empty DataFrame if no results

In [8]:
# Reorder columns to ensure SDGs are in order
sdg_columns = [col for col in df_results.columns if col.startswith('SDG_')]
column_order = ['Unique_ID', 'Subfolder'] + sorted(sdg_columns, key=lambda x: int(x.split('_')[1]))
df_results = df_results[column_order]



In [9]:
# Display the first few rows of the results
print(df_results.head())


         Unique_ID          Subfolder     SDG_0     SDG_1     SDG_2     SDG_3  \
0  BRB_73_2018.txt  Session 73 - 2018  0.001588  0.007611  0.011333  0.002075   
1  IND_73_2018.txt  Session 73 - 2018  0.001663  0.538183  0.020612  0.013848   
2  ARG_73_2018.txt  Session 73 - 2018  0.003995  0.022468  0.023860  0.005524   
3  JOR_73_2018.txt  Session 73 - 2018  0.000695  0.000713  0.000554  0.000385   
4  SWE_73_2018.txt  Session 73 - 2018  0.003677  0.013726  0.012967  0.008263   

      SDG_4     SDG_5     SDG_6     SDG_7     SDG_8     SDG_9    SDG_10  \
0  0.004593  0.014043  0.005275  0.016655  0.006817  0.002211  0.003724   
1  0.014172  0.282126  0.004114  0.007417  0.050169  0.002652  0.035352   
2  0.014603  0.119669  0.004548  0.007284  0.004918  0.003729  0.011026   
3  0.000816  0.001283  0.000313  0.000434  0.000326  0.000527  0.000842   
4  0.013773  0.386075  0.002997  0.009276  0.007336  0.003656  0.013995   

     SDG_11    SDG_12    SDG_13    SDG_14    SDG_15    SDG_16 

In [10]:
len(df_results)

3096

In [11]:
df_results["Subfolder"].str[-4:]


0       2018
1       2018
2       2018
3       2018
4       2018
        ... 
3091    2014
3092    2014
3093    2014
3094    2014
3095    2014
Name: Subfolder, Length: 3096, dtype: object

In [12]:
df_results["Unique_ID"] = df_results["Unique_ID"].str.replace(".txt", "")
df_results["Subfolder"] = df_results["Subfolder"].str[-4:]


In [13]:
df_results.rename(columns={"Unique_ID": "Id", "Subfolder": "Year"}, inplace=True)

In [14]:
df_results.head()

Unnamed: 0,Id,Year,SDG_0,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16
0,BRB_73_2018,2018,0.001588,0.007611,0.011333,0.002075,0.004593,0.014043,0.005275,0.016655,0.006817,0.002211,0.003724,0.00762,0.00595,0.895107,0.003156,0.00824,0.004
1,IND_73_2018,2018,0.001663,0.538183,0.020612,0.013848,0.014172,0.282126,0.004114,0.007417,0.050169,0.002652,0.035352,0.002824,0.005582,0.007948,0.004038,0.005454,0.003846
2,ARG_73_2018,2018,0.003995,0.022468,0.02386,0.005524,0.014603,0.119669,0.004548,0.007284,0.004918,0.003729,0.011026,0.004664,0.009199,0.005282,0.007612,0.006535,0.745083
3,JOR_73_2018,2018,0.000695,0.000713,0.000554,0.000385,0.000816,0.001283,0.000313,0.000434,0.000326,0.000527,0.000842,0.000647,0.000828,0.000406,0.000526,0.000866,0.989839
4,SWE_73_2018,2018,0.003677,0.013726,0.012967,0.008263,0.013773,0.386075,0.002997,0.009276,0.007336,0.003656,0.013995,0.005404,0.008823,0.008216,0.006958,0.011167,0.483693


In [15]:
#Import speakers by session

speakers = pd.read_excel("/Users/mass/Documents/Masters/Courses/Connected Politics/Github repository/Content/UN Corpus/Speakers_by_session.xlsx")
speakers.head()

Unnamed: 0,Year,Session,ISO Code,Country,Name of Person Speaking,Post,ID
0,2023,78,BRA,Brazil,Luiz Inacio Lula da Silva,President,BRA_78_2023
1,2023,78,USA,United States of America,Joseph R. Biden,President,USA_78_2023
2,2023,78,COL,Colombia,Gustavo Petro Urrego,President,COL_78_2023
3,2023,78,JOR,Jordan,Abdullah II ibn Al Hussein,King,JOR_78_2023
4,2023,78,POL,Poland,Andrzej Duda,President,POL_78_2023


In [16]:
df_results = df_results.merge(speakers[["ID", "Post"]], left_on = "Id", right_on = "ID", how = "left")

In [17]:
df_results = df_results[['Id', 'Year', 'Post', 'SDG_0', 'SDG_1', 'SDG_2', 'SDG_3', 'SDG_4', 'SDG_5', 'SDG_6',
                         'SDG_7', 'SDG_8', 'SDG_9', 'SDG_10', 'SDG_11', 'SDG_12', 'SDG_13',
                         'SDG_14', 'SDG_15', 'SDG_16']]

In [18]:
df_results.head()

Unnamed: 0,Id,Year,Post,SDG_0,SDG_1,SDG_2,SDG_3,SDG_4,SDG_5,SDG_6,SDG_7,SDG_8,SDG_9,SDG_10,SDG_11,SDG_12,SDG_13,SDG_14,SDG_15,SDG_16
0,BRB_73_2018,2018,"Prime Minister, Minister for National Security...",0.001588,0.007611,0.011333,0.002075,0.004593,0.014043,0.005275,0.016655,0.006817,0.002211,0.003724,0.00762,0.00595,0.895107,0.003156,0.00824,0.004
1,IND_73_2018,2018,Minister for External Affairs,0.001663,0.538183,0.020612,0.013848,0.014172,0.282126,0.004114,0.007417,0.050169,0.002652,0.035352,0.002824,0.005582,0.007948,0.004038,0.005454,0.003846
2,ARG_73_2018,2018,President,0.003995,0.022468,0.02386,0.005524,0.014603,0.119669,0.004548,0.007284,0.004918,0.003729,0.011026,0.004664,0.009199,0.005282,0.007612,0.006535,0.745083
3,JOR_73_2018,2018,King,0.000695,0.000713,0.000554,0.000385,0.000816,0.001283,0.000313,0.000434,0.000326,0.000527,0.000842,0.000647,0.000828,0.000406,0.000526,0.000866,0.989839
4,SWE_73_2018,2018,Chair of the Delegation,0.003677,0.013726,0.012967,0.008263,0.013773,0.386075,0.002997,0.009276,0.007336,0.003656,0.013995,0.005404,0.008823,0.008216,0.006958,0.011167,0.483693


In [19]:
len(df_results)

3096

In [20]:

# Save the results to a CSV file
df_results.to_csv('/Users/mass/Documents/Masters/Courses/Connected Politics/Github repository/Content/Datasets and code/Datasets/Aux datasets/sdg_analysis_results_v2.csv', index = False)

In [22]:
df_results["SDG_0"].mean()

0.0022042499526351354