In [1]:
import AI_Frameworks_LDA
from tqdm import tqdm

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/osama.nasir@ebryx.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import nltk
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# sklearn
from sklearn.preprocessing import MinMaxScaler

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
#%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Load dataset

In [5]:
# Load the SDG project workbook into a DataFrame (a later cell reads its 'Project Name' column).
df1 = pd.read_excel('data/SDG_Project_text.xlsx')

In [6]:
# Number of rows read from the workbook.
len(df1)

108

In [7]:
# Project-local preprocessing. The three results are used below as: a bag-of-words corpus
# (iterated as (token id, count) pairs), a dictionary exposing doc2bow() and len(), and
# per-document word lists.
# NOTE(review): exact semantics live in AI_Frameworks_LDA.preprocessData — confirm there.
corpus, id2word, data_lemmatized = AI_Frameworks_LDA.preprocessData(df1)

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Linking successful
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/en_core_web_sm
-->
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [8]:
# Sanity check: print the size of each preprocessing output, one per line.
for artifact in (corpus, id2word, data_lemmatized):
    print(len(artifact))

108
1471
108


# SDG Model

In [7]:
# Pretrained word vectors fetched by name through gensim's downloader API; `model` is what
# the scoring loop below queries with model.similarity(word, keyword).
# NOTE(review): the first call presumably downloads the vectors, so this cell may need
# network access — confirm for offline runs.
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")

In [9]:
# Keyword vocabularies for the 17 SDGs: sdgN holds the keywords for goal N, and sdg_list
# collects them in goal order (index 0 -> sdg1, index 16 -> sdg17).
# Every keyword must be spelled as it would appear in the text/embedding vocabulary and
# appear only once per list: a misspelled keyword can never match, and a repeated keyword
# is counted twice by both scoring methods (similarity sum and bag-of-words counts).
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [9]:
# Embedding-similarity scoring. For every (document, SDG) pair, add up the similarities of
# all (document word, SDG keyword) pairs whose similarity reaches the threshold, then divide
# the sum by the number of keywords in that SDG. Result: score_vector1[d][s].
documents = data_lemmatized
sdg_l = sdg_list

SIM_THRESHOLD = 0.75  # minimum similarity for a word/keyword pair to contribute

score_vector1 = np.zeros((len(documents), len(sdg_l)))

# tqdm wraps the document list itself (not the enumerate object) so the bar knows the total.
for d, doc in enumerate(tqdm(documents)):
    for s, sdg in enumerate(sdg_l):
        score = 0
        for keyword in sdg:
            for word in doc:
                try:
                    sim = model.similarity(word, keyword)
                except KeyError:
                    # Word or keyword is missing from the embedding vocabulary: skip this
                    # pair. Only KeyError is caught so that interrupts and genuine errors
                    # are no longer silently swallowed.
                    continue
                if sim >= SIM_THRESHOLD:
                    score += sim
        score_vector1[d][s] = score / len(sdg)

108it [03:31,  1.96s/it]


In [None]:
# NOTE(review): the visible cells use the `np` alias; this plain `numpy` import is kept
# only in case later cells rely on it.
import numpy

# Wrap the score matrix in a DataFrame ...
project_sdg_df1 = pd.DataFrame(data=score_vector1)

# ... and label its columns 'sdg1' through 'sdg17'.
columns = ['sdg%d' % goal for goal in range(1, 18)]
project_sdg_df1.columns = columns

### Keyword matching (matched keyword count / SDG keyword total)

In [10]:
# Keyword vocabularies for the 17 SDGs: sdgN holds the keywords for goal N, and sdg_list
# collects them in goal order (index 0 -> sdg1, index 16 -> sdg17).
# NOTE(review): this cell re-declares the same names as the keyword cell in the
# "SDG Model" section above; consider defining the lists once and reusing them.
# Every keyword must be spelled as it would appear in the text and appear only once per
# list: a misspelled keyword can never match, and a repeated keyword gets a bag-of-words
# count of 2.
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [11]:
# Bag-of-words form of each SDG keyword list, produced by the corpus dictionary's doc2bow.
sdg_bow = list(map(id2word.doc2bow, sdg_list))

In [12]:
# Dense count matrix for the SDG keyword lists: one row per SDG, one column per dictionary
# token id; cell [i, token_id] holds the bag-of-words count reported for that token.
sdg_vectorized = np.zeros((len(sdg_list), len(id2word)))
for i, sdg_bow_i in enumerate(sdg_bow):
    # `token_id` rather than `id`: a module-level loop variable named `id` would shadow the
    # builtin id() for every cell executed afterwards.
    for token_id, count in sdg_bow_i:
        sdg_vectorized[i, token_id] = count

In [14]:
# Dense count matrix for the documents: one row per document, one column per dictionary
# token id; cell [i, token_id] holds that token's bag-of-words count in document i.
corpus_vectorized = np.zeros((len(corpus), len(id2word)))
for i, corpus_i in enumerate(corpus):
    # `token_id` rather than `id`: a module-level loop variable named `id` would shadow the
    # builtin id() for every cell executed afterwards.
    for token_id, count in corpus_i:
        corpus_vectorized[i, token_id] = count

In [25]:
# SDG keyword occurrences per document:
# (documents x vocabulary) . (vocabulary x SDGs) -> (documents x SDGs)
sdg_word_match = np.dot(corpus_vectorized, sdg_vectorized.transpose())

# Normaliser: the summed keyword counts of each SDG's vector, i.e. one value per SDG
# (not per document).
total_words = sdg_vectorized.sum(axis=1)

# Divide every SDG column by that SDG's total. Broadcasting keeps the result in
# (documents x SDGs) orientation, which is what the following cells require: they assign
# 17 column labels and index one row per project. Appending one row per SDG instead yields
# the transpose (SDGs x documents) and breaks those cells.
# An SDG whose total is 0 gets a score of 0 instead of 0/0 = NaN.
score_vector1 = np.divide(
    sdg_word_match,
    total_words,
    out=np.zeros_like(sdg_word_match),
    where=total_words > 0,
)



# Score Normalization

In [31]:
# Wrap the scores in a DataFrame ...
project_sdg_df1 = pd.DataFrame(data=score_vector1)

# ... and name its columns 'sdg1' through 'sdg17'.
columns = ['sdg%d' % goal for goal in range(1, 18)]
project_sdg_df1.columns = columns

In [32]:
from sklearn import preprocessing

# Min-max scale each row of project_sdg_df1: the frame is transposed before scaling
# (MinMaxScaler works column-wise) and transposed back afterwards.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_transposed = min_max_scaler.fit_transform(project_sdg_df1.T)
norm_sdg_df1 = pd.DataFrame(scaled_transposed.T)

In [33]:
# Threshold the normalised scores: a row is flagged for an SDG when its score is >= cutoff.
cutoff = 0.8

# Row count is taken from the data instead of being hard-coded, so the cell also works for
# inputs that do not have exactly 108 rows.
doc_num = len(norm_sdg_df1)

# One list of booleans per row, in column (SDG) order.
sdg_labels = [list(norm_sdg_df1.iloc[row] >= cutoff) for row in range(doc_num)]

In [34]:
# Turn the boolean flags into a labelled table with the project name as first column.
sdg_labels_df1 = pd.DataFrame(data=sdg_labels)

# Encode False/True as the strings '0'/'1' (False first, then True).
for flag, text in ((False, '0'), (True, '1')):
    sdg_labels_df1.replace(flag, text, inplace=True)

sdg_labels_df1.columns = columns
sdg_labels_df1['Project Name'] = df1['Project Name']

# NOTE(review): `columns` is rebound to the 18-name list here, so re-running this cell
# without first re-running the cell that defines the 17 SDG names fails on the
# `.columns = columns` assignment above.
columns = ['Project Name'] + ['sdg%d' % goal for goal in range(1, 18)]
sdg_labels_df1 = sdg_labels_df1[columns]

sdg_labels_df1

Unnamed: 0,Project Name,sdg1,sdg2,sdg3,sdg4,sdg5,sdg6,sdg7,sdg8,sdg9,sdg10,sdg11,sdg12,sdg13,sdg14,sdg15,sdg16,sdg17
0,DYMAXION LABS,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
1,Layers against inequality,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,ParaEmpleo,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,Support to Workers and Migrants,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,MERON,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,De-Enigma Project,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0
104,DiDi Smart Transportation Brain / DiDi AI Labs,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
105,Spark,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
106,Dynamic and Robust Wildfire Risk Prediction Sy...,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


# Save results to file

In [35]:
# Write the label table to an Excel workbook using the xlsxwriter engine.
sdg_labels_df1.to_excel(excel_writer="SDG_projects_KM_80.xlsx", engine='xlsxwriter')

# Load curricula

In [3]:
# Load the curricula workbook into a DataFrame.
df2 = pd.read_excel('data/all_curricula.xlsx')

In [4]:
# Number of rows read from the workbook.
len(df2)

166

In [5]:
# Same project-local preprocessing as for df1, now applied to the curricula frame.
# NOTE(review): this rebinds corpus, id2word and data_lemmatized, so any cell run after
# this one sees the curricula data under those names, not the project data.
corpus, id2word, data_lemmatized = AI_Frameworks_LDA.preprocessData(df2)

✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')
✔ Linking successful
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/en_core_web_sm
-->
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [6]:
# Sanity check: print the size of each preprocessing output, one per line.
for artifact in (corpus, id2word, data_lemmatized):
    print(len(artifact))

166
5240
166


# SDG Model

In [7]:
# Pretrained word vectors fetched by name through gensim's downloader API.
# NOTE(review): the same vectors are loaded by an earlier cell in the "SDG Model" section
# of the project analysis; if both sections run in one kernel this load is repeated.
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")

In [8]:
#defining all 17 SDGs
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
# SDG 10 keyword list. The misspelled entry 'racisim' is corrected to 'racism' so that
# the keyword can actually match document tokens and embedding vocabulary.
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
# SDG 16 keyword list. 'societies' used to appear twice; the duplicate is removed so the
# keyword is not double-counted in the bag of words and does not inflate the list length
# used to normalise scores.
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [None]:
# Embedding-based SDG scoring.
# For every (document, SDG) pair, add up the similarity of each (document word, SDG keyword)
# pair whose similarity reaches the threshold, then divide by the SDG keyword-list length.
documents = data_lemmatized
sdg_l = sdg_list

# Only word/keyword pairs at least this similar contribute to the score.
sim_threshold = 0.75

# NOTE(review): this array is (documents x SDGs), whereas the keyword-matching cell builds
# score_vector2 with one entry per SDG — confirm which orientation the cells below expect.
score_vector2 = np.zeros((len(documents), len(sdg_l)))

# `total` lets tqdm show a real progress bar and ETA; enumerate() alone hides the length.
for d, doc in tqdm(enumerate(documents), total=len(documents)):
    for s, sdg in enumerate(sdg_l):
        score = 0
        for keyword in sdg:
            for word in doc:
                try:
                    sim = model.similarity(word, keyword)
                except KeyError:
                    # The word or the keyword is missing from the embedding vocabulary.
                    # Only this case is skipped; any other error (including an interrupt
                    # of this long-running loop) now propagates instead of being swallowed.
                    continue
                if sim >= sim_threshold:
                    score += sim
        score_vector2[d][s] = score / len(sdg)

26it [06:43, 11.82s/it]

### Keyword matching (matched word count / SDG keyword count)

In [9]:
#defining all 17 SDGs
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
# SDG 10 keyword list. The misspelled entry 'racisim' is corrected to 'racism' so that
# the keyword can actually match document tokens and embedding vocabulary.
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
# SDG 16 keyword list. 'societies' used to appear twice; the duplicate is removed so the
# keyword is not double-counted in the bag of words and does not inflate the keyword total
# used to normalise scores.
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [10]:
# Bag-of-words form of every SDG keyword list, built with the dictionary's doc2bow().
# Each entry is consumed below as a sequence of (token id, count) pairs.
# NOTE(review): keywords absent from id2word are presumably dropped by doc2bow — confirm.
sdg_bow = [id2word.doc2bow(text) for text in sdg_list]

In [11]:
# Expand the SDG bags of words into a dense (number of SDGs x dictionary size) count matrix:
# row i holds, for every token id, how often that token occurs in SDG i's keyword list.
sdg_vectorized = np.zeros((len(sdg_list), len(id2word)))
for i, sdg_bow_i in enumerate(sdg_bow):
    # `token_id` rather than `id`: at notebook scope `id` would shadow the builtin
    # for every cell executed afterwards.
    for token_id, count in sdg_bow_i:
        sdg_vectorized[i, token_id] = count

In [12]:
# Expand every document's bag of words into a dense (number of documents x dictionary size)
# count matrix: row i holds the per-token counts of document i.
corpus_vectorized = np.zeros((len(corpus), len(id2word)))
for i, corpus_i in enumerate(corpus):
    # `token_id` rather than `id`: at notebook scope `id` would shadow the builtin
    # for every cell executed afterwards.
    for token_id, count in corpus_i:
        corpus_vectorized[i, token_id] = count

In [16]:
# Weighted count of SDG keyword hits per document:
# (documents x vocabulary) . (vocabulary x SDGs) -> (documents x SDGs).
sdg_word_match = np.dot(corpus_vectorized, sdg_vectorized.transpose())

# Total keyword count of each SDG as represented in sdg_vectorized
# (one value per SDG, not per document).
total_words = sdg_vectorized.sum(axis=1)

# Score = matched count / SDG keyword total. The result is a list with one array per SDG,
# each holding that SDG's score for every document.
# NOTE(review): an SDG whose keyword total is 0 produces NaN scores (0 / 0) — confirm that
# this is acceptable for the normalisation step.
score_vector2 = list(sdg_word_match.T / total_words[:, np.newaxis])



# Score Normalization

In [17]:
import numpy

# SDG column labels, in the same order as the entries of score_vector2.
columns = ['sdg1', 'sdg2', 'sdg3', 'sdg4', 'sdg5', 'sdg6', 'sdg7', 'sdg8', 'sdg9', 'sdg10', 'sdg11', 'sdg12', 'sdg13', 'sdg14', 'sdg15', 'sdg16', 'sdg17']

# Build the frame from score_vector2 and transpose it, so that the 17 labels above
# name its columns.
project_sdg_df2 = pd.DataFrame(data=score_vector2).transpose()
project_sdg_df2.columns = columns

In [18]:
from sklearn import preprocessing

# Rescale every row of project_sdg_df2 to the [0, 1] range independently.
# MinMaxScaler scales column by column, so the frame is transposed before scaling
# and the result is transposed back afterwards.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_transposed = min_max_scaler.fit_transform(project_sdg_df2.T)
norm_sdg_df2 = pd.DataFrame(scaled_transposed.T)

In [19]:
# Minimum normalised score for an SDG to be assigned to a document.
cutoff = 0.7
doc_num = len(corpus)

# One list of booleans per document: True where the normalised score reaches the cutoff.
sdg_labels = [list(norm_sdg_df2.iloc[row] >= cutoff) for row in range(doc_num)]

In [20]:
# Names of the 17 SDG label columns, built locally: this cell rebinds the global `columns`
# at the end, so reading it here would break a second run of the cell (18 names, 17 columns).
sdg_columns = ['sdg%d' % n for n in range(1, 18)]

# One row per document, one column per SDG. The boolean labels are stored as the
# strings '0' / '1', the format written to the spreadsheet afterwards.
sdg_labels_df2 = pd.DataFrame(data=sdg_labels, columns=sdg_columns).astype(int).astype(str)

# Put the source file name in the first column (pandas aligns the values on the row index).
sdg_labels_df2.insert(0, 'filename', df2['filename'])

# Kept for backward compatibility: after this cell `columns` holds the full column order.
columns = ['filename'] + sdg_columns

sdg_labels_df2

sdg_labels_df2

Unnamed: 0,filename,sdg1,sdg2,sdg3,sdg4,sdg5,sdg6,sdg7,sdg8,sdg9,sdg10,sdg11,sdg12,sdg13,sdg14,sdg15,sdg16,sdg17
0,LGST-242-642.pdf,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,CodeAndPower2017.pdf,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0
2,Time-Machine-syllabus.pdf,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,CMS_332D_Digital_Ethics_Syllabus_Undergr.pdf,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,i453-syllabus.pdf,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,"AI Safety, Ethics, and Policy.txt",0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
162,"Peter Suber, _Consent & Coercion_.txt",1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
163,ICCS_ Course Schedule for Intelligent Control ...,1,0,0,0,1,0,0,1,0,1,0,0,1,0,0,0,1
164,ECS 188 - Ethics in an Age of Technology - Phi...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
# Write the curricula label table to an Excel workbook (relative path, xlsxwriter engine).
sdg_labels_df2.to_excel("SDG_curricula_KMS_70.xlsx", engine='xlsxwriter')

# Load Frameworks

In [22]:
# Read the frameworks spreadsheet.
df3 = pd.read_excel('data/all_frameworks.xlsx')

In [23]:
# Number of rows loaded from the frameworks spreadsheet.
len(df3)

108

In [24]:
# Preprocess the frameworks with the project helper.
# NOTE: this rebinds corpus / id2word / data_lemmatized, so the curricula versions of
# these variables are no longer available after this cell.
corpus, id2word, data_lemmatized = AI_Frameworks_LDA.preprocessData(df3)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/en_core_web_sm
-->
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [25]:
# Sanity check, one value per line: documents in the corpus, dictionary entries,
# and lemmatized documents.
print(len(corpus), len(id2word), len(data_lemmatized), sep='\n')

108
4811
108


# SDG Model

In [83]:
# Load pretrained word vectors through gensim's downloader API.
# NOTE(review): the same model is loaded by an earlier cell of this notebook; if it is
# still in memory this reload is presumably redundant — confirm before re-running.
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")

In [84]:
#defining all 17 SDGs
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
# SDG 10 keyword list. Every entry has to be a correctly spelled token,
# otherwise it can never match a document word or an embedding-vocabulary entry.
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
# SDG 16 keyword list. Each keyword appears once: a repeated entry would be
# counted twice in the scores and would inflate the list length used as the
# normalising denominator.
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [85]:
# Embedding-based SDG scoring.
#
# For every (document, SDG) pair, add up model.similarity(word, keyword) over
# all (document word, SDG keyword) pairs whose similarity reaches the
# threshold, then divide the sum by the number of keywords in that SDG list.
#
# NOTE(review): score_vector3 is built here as a (documents, SDGs) array,
# whereas the keyword-matching cell builds it as one array per SDG; the
# normalisation cell transposes its input, which fits the latter layout —
# confirm which one is meant to feed the cells below.
documents = data_lemmatized
sdg_l = sdg_list

sim_threshold = 0.75  # word/keyword pairs below this similarity add nothing

score_vector3 = np.zeros((len(documents), len(sdg_l)))

# total= lets tqdm render a real progress bar with an ETA for this slow loop
for d, doc in tqdm(enumerate(documents), total=len(documents)):
    for s, sdg in enumerate(sdg_l):
        score = 0
        for keyword in sdg:
            for word in doc:
                try:
                    sim = model.similarity(word, keyword)
                except KeyError:
                    # word or keyword is not in the embedding vocabulary;
                    # any other exception is a real error and must surface
                    continue
                if sim >= sim_threshold:
                    score += sim
        score_vector3[d][s] = score / len(sdg)

108it [37:47, 20.99s/it]


### KEYWORD matching (word_count/sdg_len)

In [26]:
#defining all 17 SDGs
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
# SDG 10 keyword list. Every entry has to be a correctly spelled token,
# otherwise it can never match a document word or a dictionary entry.
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
# SDG 16 keyword list. Each keyword appears once: a repeated entry gets a
# bag-of-words count of 2, which double-weights its matches and inflates the
# per-SDG keyword total used as the denominator.
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [28]:
# Keyword-matching SDG scores.
#
# Each SDG keyword list and each document is expanded into a dense count
# vector over the dictionary `id2word`. Their dot product gives the weighted
# number of keyword occurrences per (document, SDG), which is then divided by
# the SDG's keyword total.

def _bow_to_dense(bows, vocab_size):
    """Expand bag-of-words rows into a dense count matrix.

    bows: sized sequence of rows, each an iterable of (token_id, count) pairs.
    vocab_size: number of columns in the result.
    Returns a float array of shape (len(bows), vocab_size).
    """
    dense = np.zeros((len(bows), vocab_size))
    for row, bow in enumerate(bows):
        for token_id, count in bow:
            dense[row, token_id] = count
    return dense


# SDG keyword lists as bags of words over the corpus dictionary
sdg_bow = [id2word.doc2bow(text) for text in sdg_list]

# dense count vectors: one row per SDG / one row per document
sdg_vectorized = _bow_to_dense(sdg_bow, len(id2word))
corpus_vectorized = _bow_to_dense(corpus, len(id2word))

# total sdg words matched in a document -> rows are documents, columns are SDGs
sdg_word_match = np.dot(corpus_vectorized, sdg_vectorized.transpose())

# per-SDG sum of keyword counts in the SDG's bag of words
# (this is a property of the keyword list, not the length of any document)
total_words = sdg_vectorized.sum(axis=1)

# one array per SDG holding that SDG's score for every document
# NOTE(review): an SDG whose total is 0 divides 0 by 0 and yields NaN scores —
# confirm whether that can happen for a given dictionary.
score_vector3 = [
    sdg_word_match.T[sdg_idx] / sdg_total
    for sdg_idx, sdg_total in enumerate(total_words)
]

# Score Normalization

In [29]:
import numpy

# Column names sdg1 .. sdg17, one per goal.
columns = ['sdg{}'.format(goal) for goal in range(1, 18)]

# Turn the raw scores into a table and flip it so the 17 SDGs are the columns
# (this expects score_vector3 to hold one row per SDG).
project_sdg_df3 = pd.DataFrame(data=score_vector3).transpose()
project_sdg_df3.columns = columns

In [30]:
from sklearn import preprocessing

# Min-max scale the scores within each document: the scaler operates per
# column, so the table is transposed on the way in and again on the way out.
min_max_scaler = preprocessing.MinMaxScaler()

scores_sdg_by_doc = project_sdg_df3.T
scaled_sdg_by_doc = min_max_scaler.fit_transform(scores_sdg_by_doc)
norm_sdg_df3 = pd.DataFrame(scaled_sdg_by_doc.T)

In [31]:
# Threshold the normalised scores into per-document boolean SDG flags.
# The document count is taken from the data instead of being hard-coded, so the
# cell keeps working when the input spreadsheet gains or loses rows.
doc_num = len(norm_sdg_df3)
cutoff = 0.7  # minimum normalised score for an SDG to be assigned to a document

# one list of 17 booleans per document, in row order
sdg_labels = [list(norm_sdg_df3.iloc[row] >= cutoff) for row in range(doc_num)]

In [32]:
# Build the label table: one row per document, a 'filename' column followed by
# one '0'/'1' string flag per SDG.
sdg_columns = ['sdg{}'.format(goal) for goal in range(1, 18)]

sdg_labels_df3 = pd.DataFrame(data=sdg_labels)
# flags are stored as the strings '0' / '1' rather than booleans
sdg_labels_df3 = sdg_labels_df3.replace(False, '0').replace(True, '1')

# The 17 SDG names come from a list built in this cell, not from the shared
# `columns` variable: `columns` is rebound to an 18-entry list just below, so
# using it here would make a second run of this cell fail on a length mismatch.
sdg_labels_df3.columns = sdg_columns
sdg_labels_df3['filename'] = df3['filename']

# put 'filename' first, followed by sdg1 .. sdg17
columns = ['filename'] + sdg_columns
sdg_labels_df3 = sdg_labels_df3[columns]

sdg_labels_df3

Unnamed: 0,filename,sdg1,sdg2,sdg3,sdg4,sdg5,sdg6,sdg7,sdg8,sdg9,sdg10,sdg11,sdg12,sdg13,sdg14,sdg15,sdg16,sdg17
0,Model-AI-Framework-First-Edition.pdf,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0
1,Data_Ethics_Framework_2020.pdf,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1
2,ai-in-health-ethical-social-political-challeng...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,SAP’s Guiding Principles for Artificial Intell...,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0
4,Privacy-and-Freedom-of-Expression-In-the-Age-o...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,Our Principles – Google AI.html,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
104,Principles for Accountable Algorithms and a So...,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
105,Responsible AI principles from Microsoft.html,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0
106,Ethics & Society _ DeepMind.txt,1,0,0,0,1,0,1,1,1,0,0,0,0,0,0,1,0


# Save results to file

In [33]:
# Export the frameworks SDG label table (filename + sdg1..sdg17 flags) to an
# Excel workbook, written through the xlsxwriter engine.
# NOTE(review): the "_70" suffix presumably mirrors cutoff = 0.7 — confirm.
sdg_labels_df3.to_excel("SDG_frameworks_KMS_70.xlsx", engine='xlsxwriter')

# Save Score Results

In [88]:
# Write each corpus' score table to its own workbook via xlsxwriter.
score_exports = (
    (project_sdg_df1, "Score_KM_project.xlsx"),
    (project_sdg_df2, "Score_KM_curricula.xlsx"),
    (project_sdg_df3, "Score_KM_framework.xlsx"),
)
for score_frame, workbook_name in score_exports:
    score_frame.to_excel(workbook_name, engine='xlsxwriter')

# Load Research Papers

In [35]:
# Read the Scopus export and keep two columns per paper: its title, and its
# abstract exposed under the column name 'text'.
scopus_df = pd.read_csv('data/scopus.csv')
df4 = pd.DataFrame({
    'Title': scopus_df['Title'],
    'text': scopus_df['Abstract'],
})

In [36]:
# Row count of the research-paper table built from the Scopus export.
len(df4)

200

In [37]:
# Preprocess the research-paper table with the project helper. This rebinds the
# notebook-wide names corpus / id2word / data_lemmatized, so the values
# produced for the previously processed dataset are replaced from here on.
# NOTE(review): the helper presumably reads the 'text' column set up above —
# confirm against AI_Frameworks_LDA.preprocessData.
corpus, id2word, data_lemmatized = AI_Frameworks_LDA.preprocessData(df4)

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/en_core_web_sm
-->
/home/osama.nasir@ebryx.com/anaconda3/envs/myenv/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [38]:
# Show the size of each preprocessing result, one value per line.
for artefact in (corpus, id2word, data_lemmatized):
    print(len(artefact))

200
2009
200


# SDG Model

In [17]:
# Load the pretrained "glove-wiki-gigaword-100" word vectors through gensim's
# downloader and bind them to `model`.
# NOTE(review): earlier cells of this notebook perform the same load;
# repeating it looks redundant — confirm before removing.
import gensim.downloader as api
model = api.load("glove-wiki-gigaword-100")

In [18]:
# Keyword vocabulary for each of the 17 Sustainable Development Goals (SDGs).
# Each sdgN list holds the lowercase keywords associated with goal N.
# Keywords must be unique within a list: the scoring cells divide by the list
# length / keyword total, so a repeated word would be weighted twice.
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']

# All 17 keyword lists in goal order: sdg_list[k] holds the keywords of SDG k + 1.
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [19]:
# Embedding-based SDG scoring.
# For every (document, SDG) pair: sum the similarity of each
# (document word, SDG keyword) pair whose similarity reaches the threshold,
# then divide by the number of keywords in that SDG.
# NOTE(review): the keyword-matching cell later in this notebook rebinds
# score_vector4, so only one of the two scoring variants survives a full run.
documents = data_lemmatized
sdg_l = sdg_list

# Word/keyword pairs less similar than this contribute nothing to the score.
SIMILARITY_THRESHOLD = 0.75

# score_vector4[d][s] is the score of document d against SDG s.
score_vector4 = np.zeros((len(documents), len(sdg_l)))

# `total` lets tqdm show a real progress bar instead of a bare iteration count.
for d, doc in tqdm(enumerate(documents), total=len(documents)):
    for s, sdg in enumerate(sdg_l):
        score = 0
        for keyword in sdg:
            for word in doc:
                try:
                    sim = model.similarity(word, keyword)
                except KeyError:
                    # One of the two words is not in the embedding vocabulary,
                    # so the pair cannot be scored. Only this case is skipped;
                    # any other error (or a KeyboardInterrupt) now propagates.
                    continue
                if sim >= SIMILARITY_THRESHOLD:
                    score += sim
        # An empty keyword list scores 0 instead of dividing by zero.
        score_vector4[d][s] = score / len(sdg) if sdg else 0.0

200it [07:43,  2.32s/it]


### Keyword matching (matched keyword count / SDG keyword count)

In [93]:
# Keyword vocabulary for each of the 17 Sustainable Development Goals (SDGs).
# Each sdgN list holds the lowercase keywords associated with goal N.
# Keywords must be unique within a list: the scoring cells divide by the list
# length / keyword total, so a repeated word would be weighted twice.
sdg1 = ['africa', 'basic', 'class', 'countries', 'developing', 'disadvantaged', 'discrimination', 'distribution', 'economic', 'end', 'environment', 'equality', 'eradication', 'extreme', 'financial', 'inclusion', 'income', 'life', 'line', 'microfinance', 'poor', 'poverty', 'protection', 'quality', 'resources', 'services', 'social', 'sustainable', 'systems', 'third', 'vulnerable', 'wealth', 'world']
sdg2 = ['agriculture', 'diversity', 'food', 'genetic', 'genetics', 'growth', 'health', 'hunger', 'hungry', 'improved', 'infrastructure', 'innovations', 'legumes', 'life', 'maize', 'malnourished', 'malnutrition', 'needs', 'nourished', 'nutrition', 'nutritional', 'nutritious', 'people', 'poverty', 'produce', 'producers', 'production', 'productivity', 'quality', 'resilient', 'restrictions', 'rural', 'scale', 'seeds', 'small', 'stunted', 'stunting', 'sufficient', 'sustainable', 'trade', 'under', 'undernourished', 'wasting', 'world']
sdg3 = ['abuse', 'access', 'accidents', 'affordable', 'aids', 'air', 'alcohol', 'antenatal', 'antiretroviral', 'autonomy', 'biomedical', 'bodily', 'borne', 'care', 'child', 'clean', 'constrained', 'contamination', 'contraceptive', 'control', 'countries', 'coverage', 'death', 'deaths', 'density', 'dental', 'developing', 'disability', 'disease', 'diseases', 'drug', 'expectancy', 'family', 'health', 'healthcare', 'healthy', 'hepatitis', 'hiv', 'hygiene', 'improving', 'inclusion', 'increasing', 'indigenous', 'infected', 'international', 'life', 'lives', 'location', 'malaria', 'maternal', 'measles', 'medical', 'medicines', 'mental', 'mortality', 'narcotic', 'neonatal', 'organisation', 'planning', 'policy', 'polio', 'politics', 'pollution', 'premature', 'preventable', 'reducing', 'refugees', 'regulations', 'reproductive', 'resource', 'road', 'sanitation', 'services', 'sexual', 'soil', 'substance', 'support', 'therapy', 'tobacco', 'traffic', 'treatment', 'tuberculosis', 'universal', 'use', 'vaccines', 'violence', 'wash', 'water', 'well', 'wellbeing', 'worker', 'world']
sdg4 = ['access', 'basic', 'childhood', 'citizenship', 'cooperation', 'cultural', 'developing', 'development', 'disability', 'disparities', 'disparity', 'diversity', 'education', 'enrolment', 'equal', 'equality', 'equitable', 'equity', 'gender', 'global', 'inclusion', 'inclusive', 'innovation', 'international', 'learning', 'lifelong', 'literacy', 'numeracy', 'opportunities', 'preprimary', 'primary', 'qualified', 'refugees', 'rights', 'scholarships', 'school', 'secondary', 'sensitive', 'skills', 'sustainability', 'teacher', 'teachers', 'training', 'universal', 'vocational', 'vulnerable', 'women']
sdg5 = ['access', 'against', 'basic', 'coverage', 'dignity', 'disadvantaged', 'discrimination', 'employment', 'empower', 'empowerment', 'equal', 'equality', 'exploitation', 'female', 'feminism', 'forced', 'gender', 'genital', 'girls', 'governance', 'health', 'human', 'humanitarian', 'inclusion', 'living', 'marginalised', 'marriage', 'mutilation', 'opportunities', 'parity', 'pay', 'reproductive', 'rights', 'sexual', 'social', 'standards', 'trafficking', 'universal', 'violence', 'women', 'work', 'workplace']
sdg6 = ['access', 'accessible', 'affordable', 'aquifer', 'basins', 'cities', 'clean', 'contaminated', 'defecation', 'desalination', 'diarrhoeal', 'disasters', 'diseases', 'drinking', 'drought', 'dumping', 'ecosystem', 'ecosystems', 'efficiency', 'equitable', 'floods', 'harvesting', 'hydropower', 'hygiene', 'improving', 'inadequate', 'infrastructure', 'irrigation', 'lakes', 'latrines', 'management', 'pollution', 'protection', 'quality', 'recycled', 'resources', 'restoration', 'reuse', 'river', 'rivers', 'safe', 'sanitation', 'scarcity', 'sewerage', 'supply', 'sustainable', 'third', 'toilets', 'treatment', 'untreated', 'urban', 'waste', 'wastewater', 'water', 'water_related', 'water_use', 'withdrawals', 'world']
sdg7 = ['affordable', 'alternative', 'animal', 'battery', 'carbon', 'charcoal', 'clean', 'cleaner', 'climate', 'coal', 'economy', 'efficiency', 'electricity', 'emissions', 'energy', 'fossil', 'fossil_fuel', 'fuel', 'fuels', 'gas', 'goal', 'green', 'greenhouse', 'hydroelectric', 'infrastructure', 'modern', 'power', 'reliable', 'renewable', 'research', 'services', 'solar', 'sustainable', 'technology', 'turbine', 'vehicles', 'waste', 'wave', 'wind', 'wood']
sdg8 = ['aid', 'banking', 'child', 'consumption', 'creation', 'creativity', 'culture', 'development', 'domestic', 'economic', 'economy', 'efficiency', 'employment', 'enterprises', 'entrepreneurship', 'equal', 'eradication', 'finance', 'financial', 'forced', 'gdp', 'global', 'gross', 'growth', 'human', 'inclusive', 'innovation', 'insurance', 'job', 'jobs', 'labour', 'life', 'market', 'micro', 'migrant', 'migrants', 'modern', 'opportunities', 'oriented', 'pay', 'policies', 'policy', 'poverty', 'product', 'production', 'productive', 'productivity', 'public', 'quality', 'resource', 'rights', 'safe', 'secure', 'services', 'slavery', 'social', 'society', 'soldiers', 'stable', 'sustainable', 'tourism', 'trade', 'trafficking', 'unemployment', 'paid', 'women', 'work', 'workers', 'world', 'youth']
sdg9 = ['access', 'affordable', 'banks', 'capabilities', 'chains', 'clean', 'communication', 'cooperation', 'countries', 'credit', 'data', 'developing', 'development', 'diversification', 'economic', 'education', 'efficiency', 'electrical', 'energy', 'enterprises', 'environmentally', 'financial', 'ict', 'industrial', 'industrialisation', 'information', 'infrastructure', 'innovation', 'internet', 'irrigation', 'knowledge', 'markets', 'mobile', 'national', 'network', 'networks', 'phone', 'policy', 'power', 'public', 'quality', 'regional', 'research', 'resilient', 'resource', 'resources', 'roads', 'sanitation', 'scientific', 'security', 'service', 'services', 'society', 'sustainable', 'technological', 'technologies', 'technology', 'trade', 'transborder', 'transport', 'value', 'water']
sdg10 = ['affordable', 'age', 'ageism', 'aid', 'assistance', 'business', 'children', 'countries', 'culture', 'developing', 'development', 'disabilities', 'discrimination', 'discriminatory', 'economy', 'education', 'empower', 'equal', 'equality', 'equity', 'ethnicity', 'financial', 'foreign', 'gender', 'global', 'growth', 'health', 'homelessness', 'homophobia', 'housing', 'human', 'inclusion', 'income', 'indigenous', 'inequalities', 'inequality', 'investment', 'markets', 'migrant', 'migration', 'nations', 'opportunity', 'policy', 'population', 'poverty', 'protection', 'public', 'quality', 'race', 'racism', 'reduce', 'religion', 'remittance', 'rights', 'rural', 'sex', 'sexism', 'social', 'society', 'states', 'trade', 'vulnerable', 'world']
sdg11 = ['adaptable', 'adaptation', 'affordable', 'air', 'buildings', 'building', 'change', 'cities', 'city', 'climate', 'communities', 'community', 'consumption', 'crowding', 'cultural', 'decentralisation', 'development', 'disaster', 'disasters', 'efficiency', 'fine', 'generation', 'green', 'growth', 'heritage', 'housing', 'human', 'impact', 'inadequate', 'informal', 'infrastructure', 'land', 'local', 'management', 'materials', 'matter', 'mitigation', 'natural', 'particulate', 'planning', 'pollution', 'population', 'public', 'quality', 'reduction', 'resilient', 'resource', 'risk', 'road', 'safe', 'safety', 'settlements', 'shanty', 'slums', 'smart', 'solid', 'spaces', 'strategy', 'suburban', 'sustainability', 'sustainable', 'systems', 'town', 'transport', 'urban', 'urbanisation', 'waste', 'water', 'water_related']
sdg12 = ['capitalism', 'cars', 'chain', 'chains', 'circular', 'commercial', 'consumer', 'consumerism', 'consumption', 'cycle', 'decarbonisation', 'development', 'distortions', 'ecological', 'economy', 'efficiency', 'efficient', 'energy', 'enterprises', 'food', 'fossil', 'fuel', 'future', 'gasses', 'generation', 'global', 'goods', 'greenhouse', 'harvest', 'industry', 'levels', 'losses', 'management', 'market', 'materialism', 'materials', 'monitoring', 'natural', 'obsolescence', 'overconsumption', 'pollution', 'practices', 'procurement', 'production', 'proof', 'public', 'recycle', 'recycling', 'reduce', 'reduction', 'renewable', 'resource', 'resources', 'responsible', 'retail', 'reuse', 'subsidies', 'supply', 'sustainable', 'tourism', 'vehicles', 'waste', 'wasteful', 'water']
sdg13 = ['action', 'adaptation', 'agreement', 'average', 'capture', 'carbon', 'change', 'changing', 'climate', 'co2', 'conversion', 'cop', 'dioxide', 'disasters', 'disease', 'economy', 'ecosystems', 'emissions', 'events', 'extreme', 'gas', 'gases', 'gender', 'global', 'greenhouse', 'hazards', 'ice', 'impact', 'infectious', 'management', 'mitigation', 'natural', 'ocean', 'paris', 'patterns', 'planning', 'policy', 'politics', 'pollution', 'refugees', 'related', 'renewable', 'resilience', 'rise', 'rising', 'sea', 'systems', 'temperature', 'warming', 'warning', 'weather']
sdg14 = ['acidification', 'areas', 'artisanal', 'biodiversity', 'bleaching', 'carbon', 'coastal', 'coastlines', 'conserve', 'coral', 'dioxide', 'ecosystem', 'ecosystems', 'fish', 'fisheries', 'fishers', 'fishing', 'global', 'grasses', 'habitats', 'illegal', 'kelp', 'law', 'management', 'marine', 'ocean', 'oceanography', 'oceans', 'overfishing', 'parks', 'policy', 'pollution', 'practices', 'productive', 'protected', 'reef', 'resources', 'sea', 'seas', 'species', 'stocks', 'sustainable', 'temperature', 'unregulated', 'warming', 'water']
sdg15 = ['afforestation', 'agriculture', 'alien', 'animals', 'arable', 'bees', 'biodiversity', 'conservation', 'deforestation', 'degradation', 'desertification', 'drought', 'drylands', 'ecosystem', 'ecosystems', 'extinct', 'extinction', 'fauna', 'flora', 'forest', 'forests', 'genetic', 'illegal', 'illicit', 'indigenous', 'invasive', 'land', 'manage', 'managed', 'management', 'microorganisms', 'permaculture', 'plan', 'plants', 'poaching', 'populations', 'poverty', 'products', 'protected', 'reforestation', 'resources', 'restoration', 'soil', 'species', 'strategic', 'sustainability', 'terrestrial', 'threatened', 'trafficking', 'tree', 'wetlands', 'wildlife']
sdg16 = ['abuse', 'access', 'accountability', 'accountable', 'against', 'arbitrary', 'arms', 'assets', 'birth', 'bribery', 'children', 'combat', 'conflict', 'conflicts', 'corruption', 'crime', 'decision_making', 'detainees', 'detention', 'disappearance', 'discrimination', 'displaced', 'education', 'enforced', 'equal', 'equity', 'evasion', 'exploitation', 'financial', 'freedom', 'geography', 'governance', 'hate', 'human', 'identity', 'illegal', 'illicit', 'inclusion', 'inclusive', 'institutions', 'internally', 'judiciary', 'justice', 'law', 'legal', 'life', 'national', 'nonviolence', 'organized', 'paris', 'peace', 'peaceful', 'physical', 'police', 'policy', 'poverty', 'prevent', 'principles', 'psychological', 'public', 'quality', 'rates', 'registration', 'representative', 'resolution', 'rights', 'rule', 'security', 'seizures', 'sexual', 'societies', 'society', 'stolen', 'tax', 'terrorism', 'theft', 'threats', 'torture', 'trafficking', 'transparency', 'sentenced', 'unstable', 'victims', 'violence', 'weapon', 'women']
sdg17 = ['agenda', 'agreements', 'aid', 'assistance', 'average', 'building', 'capacity', 'census', 'civil', 'communication', 'cooperation', 'countries', 'data', 'debt', 'developing', 'development', 'disaggregated', 'doha', 'entrepreneurs', 'entrepreneurship', 'environmentally', 'eradication', 'foreign', 'fostering', 'fundamental', 'global', 'housing', 'innovation', 'international', 'investments', 'knowledge', 'stakeholder', 'official', 'organization', 'partnership', 'partnerships', 'population', 'poverty', 'principles', 'public', 'private', 'science', 'sharing', 'society', 'sound', 'stability', 'statistics', 'support', 'sustainability', 'sustainable', 'tariff', 'technologies', 'technology', 'trade', 'transfer', 'weighted', 'women', 'world']

# All 17 keyword lists in goal order: sdg_list[k] holds the keywords of SDG k + 1.
sdg_list = [sdg1, sdg2, sdg3, sdg4, sdg5, sdg6, sdg7, sdg8, sdg9, sdg10, sdg11, sdg12, sdg13, sdg14, sdg15, sdg16, sdg17]

In [39]:
# Keyword-matching SDG scoring.
# SDG keyword lists and documents are both turned into count vectors over the
# corpus dictionary; their dot product is divided, per SDG, by that SDG's
# keyword total.

# SDG keyword lists as bag-of-words over the corpus dictionary
# (doc2bow only returns ids for tokens the dictionary already knows).
sdg_bow = [id2word.doc2bow(text) for text in sdg_list]

# dense (number of SDGs, dictionary size) matrix of SDG keyword counts
sdg_vectorized = np.zeros((len(sdg_list), len(id2word)))
for i, sdg_bow_i in enumerate(sdg_bow):
    # `token_id` rather than `id`, so the builtin id() is not shadowed
    for token_id, count in sdg_bow_i:
        sdg_vectorized[i][token_id] = count

# dense (number of documents, dictionary size) matrix built from the
# (token id, count) pairs of each corpus entry
corpus_vectorized = np.zeros((len(corpus), len(id2word)))
for i, corpus_i in enumerate(corpus):
    for token_id, count in corpus_i:
        corpus_vectorized[i][token_id] = count

# (number of documents, number of SDGs): for each pair, the sum over the
# dictionary of document count * SDG keyword count
sdg_word_match = np.dot(corpus_vectorized, sdg_vectorized.transpose())

# per-SDG total of keyword counts found in the dictionary
# (this is NOT the document length)
total_words = sdg_vectorized.sum(axis=1)

# score_vector4[k] is the array of per-document scores for SDG k + 1:
# the match counts divided by that SDG's keyword total.
score_vector4 = list()
for sdg_idx, sdg_total in enumerate(total_words):
    sdg_matches = sdg_word_match.T[sdg_idx]
    if sdg_total > 0:
        score_vector4.append(sdg_matches / sdg_total)
    else:
        # None of this SDG's keywords are in the dictionary, so nothing can
        # match: score 0 instead of 0/0 (NaN plus a RuntimeWarning).
        score_vector4.append(np.zeros_like(sdg_matches))

# Score Normalization

In [40]:
import numpy

# Names of the 17 SDG score columns, in goal order: 'sdg1' ... 'sdg17'.
columns = ['sdg{}'.format(goal) for goal in range(1, 18)]

# Assumes score_vector4 is the per-SDG list built by the keyword-matching
# cell (one entry per SDG) -- TODO confirm. Transposing then yields one row
# per document and one column per SDG.
project_sdg_df4 = pd.DataFrame(data=score_vector4).transpose()
project_sdg_df4.columns = columns

In [41]:
from sklearn import preprocessing

# Min-max scale the SDG scores within each document.
# The scaler is fitted on the transposed frame (documents as columns), and the
# result is transposed back so rows are documents again.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_by_document = min_max_scaler.fit_transform(project_sdg_df4.T)
norm_sdg_df4 = pd.DataFrame(scaled_by_document.T)

In [42]:
# Flag, for every document, which SDGs have a normalised score at or above
# the cutoff. sdg_labels ends up with one list of booleans per document.
cutoff = 0.7
doc_num = len(corpus)

sdg_labels = []
i = 0
for i in range(doc_num):
    meets_cutoff = norm_sdg_df4.iloc[i] >= cutoff
    sdg_labels.append(list(meets_cutoff))

In [43]:
# Build the final label table: one row per paper, 'Title' first, then one
# column per SDG. The boolean flags are stored as the strings '0' and '1'.
sdg_labels_df4 = (
    pd.DataFrame(data=sdg_labels)
    .replace(False, '0')
    .replace(True, '1')
)

# `columns` is rebound at the end of this cell to include 'Title'. Dropping
# 'Title' here first keeps the cell re-runnable: without it, a second run
# would try to assign 18 names to 17 columns and fail.
sdg_columns = [name for name in columns if name != 'Title']
sdg_labels_df4.columns = sdg_columns

# Titles are attached by index alignment with df4.
sdg_labels_df4['Title'] = df4['Title']

# Reorder so that 'Title' is the first column.
columns = ['Title'] + sdg_columns
sdg_labels_df4 = sdg_labels_df4[columns]

sdg_labels_df4

Unnamed: 0,Title,sdg1,sdg2,sdg3,sdg4,sdg5,sdg6,sdg7,sdg8,sdg9,sdg10,sdg11,sdg12,sdg13,sdg14,sdg15,sdg16,sdg17
0,Empirical analysis on manufacturing energy eff...,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0
1,Informing action for United Nations SDG target...,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0
2,Perspectives on eco-water security and sustain...,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,The carbon footprint of dietary guidelines aro...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Optimal disassembly sequence generation and di...,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Machine learning approach to predict susceptib...,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
196,A solution to the dilemma `limiting similarity...,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0
197,Plastic (PET) vs bioplastic (PLA) or refillabl...,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0
198,Enhancing the sustainability of KsdD as a bioc...,0,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,1


In [44]:
# Export the research-paper SDG label table to an Excel workbook,
# using the xlsxwriter engine.
sdg_labels_df4.to_excel(
    excel_writer="SDG_papers_KMS_70.xlsx",
    engine='xlsxwriter',
)

In [110]:
# Export the research-paper score table (scores before min-max scaling)
# to an Excel workbook, using the xlsxwriter engine.
project_sdg_df4.to_excel(
    excel_writer="Score_KM_papers.xlsx",
    engine='xlsxwriter',
)