Notebook to show how a model (SVC) can be traine dto classify keywords into one of 21 
parent DW categories

In [5]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import collections
import math
import os, sys, pickle, urllib.request
import os.path as op
from collections import Counter
from tqdm.notebook import tqdm
import gensim

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_extraction
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from nltk.corpus import stopwords

In [6]:
def evaluate_metrics(yt, yp):
    results_pos = {}
    results_pos['accuracy'] = accuracy_score(yt, yp)
    precision, recall, f_beta, _ = precision_recall_fscore_support(yt, yp, average='weighted')
    results_pos['recall'] = recall
    results_pos['precision'] = precision
    results_pos['f1score'] = f_beta
    return results_pos

In [7]:
# Opening JSON file
data_dir = '/home/marios/data/dw-project-data/CMS_2010_to_June_2022_ENGLISH.json'
f = open(data_dir)
# returns JSON object as
# a dictionary
df = json.load(f)
df = pd.DataFrame.from_dict(df)

In [8]:
#remove columns that are nnot needed
df = df[['id', 'keywordStrings', 'thematicFocusCategory', 'lastModifiedDate']]
#drop missing values
df = df.dropna()
#clean Focus category
df['thematicFocusCategory'] = df['thematicFocusCategory'].apply(lambda x:x['name'] if x is not None else x)
df = df[df['thematicFocusCategory']!=None]
df['thematicFocusCategory'] = df['thematicFocusCategory'].astype(str)

#replacee secondary categories with primary
children_dict = {'Architecture':'Culture', 'Design':'Culture', 'Film':'Culture', 'Arts':'Culture', 
                 'Literature':'Culture', 'Music':'Culture', 'Dance':'Culture', 'Theater':'Culture',
                   'Climate':'Nature and Environment',
                  'Conflicts':'Politics', 'Terrorism':'Politics', 
                  'Corruption':'Law and Justice', 'Crime':'Law and Justice', 'Rule of Law':'Law and Justice',
                    'Press Freedom':'Law and Justice', 
                  'Diversity':'Human Rights', 'Freedom of Speech':'Human Rights', 'Equality':'Human Rights', 
                'Soccer': 'Sports',
                    'Trade':'Business', 'Globalization':'Business', 'Food Security':'Business'
}

secondary_cts = [val for val in children_dict.keys()]

df['thematicFocusCategory'] = df['thematicFocusCategory'].apply(lambda x: children_dict[x] if x in secondary_cts else x)


In [10]:
# let's load a pre-trained word2vec model from google- you may need to download this first
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g
GoogleModel = gensim.models.KeyedVectors.load_word2vec_format('/home/marios/local_data_s2ds/GoogleNews-vectors-negative300.bin',
                                                               binary=True,)

KeyboardInterrupt: 

In [11]:
#drop categories which are rare 
elements_count = collections.Counter(df.thematicFocusCategory)
# printing the element and the frequency
for key, value in elements_count.items():
    if value <100:
        print(f"{key}: {value}")
        df = df[df.thematicFocusCategory != key]


Learning German: 75
Offbeat: 38
Innovation: 5


In [13]:
# creeate corpus which will be passed on for vectorization
# heree one token will bee a string containing all the keywors associateed with one article. 
# you can change this accordingly. 
corpus = [l for l in df['keywordStrings'].apply(lambda x: ', '.join(x))]
corpus[:5]

['Africalink, Top Story, Africa on the Move, Making a Difference, Behind the Headlines, Your Say, Crossroads Generation',
 'Commerzbank, job cuts, administration, retail banking',
 "Moody's, Turkey, ratings agency, junk status",
 'Syria, Aleppo, war crimes, water, UNICEF',
 'Conflict Zone, Talk, link']

In [15]:
# Count vectorization of text
# Creating the vectorizer
vectorizer = CountVectorizer(stop_words='english')
# Converting the text to numeric data
X = vectorizer.fit_transform(corpus) 
# Preparing Data frame For machine learning
# Priority column acts as a target variable and other columns as predictors
CountVectorizedData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
CountVectorizedData['Category']=df['thematicFocusCategory'].values
print(CountVectorizedData.shape)
CountVectorizedData.head()

(79795, 38605)




Unnamed: 0,000,007,01,03,04,05,08,0rg,10,100,...,øystein,út,überall,ünal,ünker,ľudmila,şehriban,štefániková,żurek,Category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,History
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Business
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Business
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Politics
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Politics


In [18]:
WordsVocab=CountVectorizedData.columns[:-1]
print(f'Number of words after count vectorization: {len(WordsVocab)}')

NUmber of words aftere count vectorization: 38604


In [19]:
# this function makes use thee pretrained google model to get word embeddings onn the vectorizeed input 
def FunctionText2Vec(inpTextData):
    # Converting the text to numeric data
    X = vectorizer.transform(inpTextData)
    CountVecData=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    # Creating empty dataframe to hold sentences
    W2Vec_Data=pd.DataFrame()
    # Looping through each row for the data
    for i in range(CountVecData.shape[0]):
        # initiating a sentence with all zeros
        Sentence = np.zeros(300)
        # Looping thru each word in the sentence and if its present in 
        # the Word2Vec model then storing its vector
        for word in WordsVocab[CountVecData.iloc[i , :]>=1]:
            #print(word)
            if word in GoogleModel.key_to_index.keys():    
                Sentence=Sentence+GoogleModel[word]
        # Appending the sentence to the dataframe
        W2Vec_Data=W2Vec_Data.append(pd.DataFrame([Sentence]))
    return(W2Vec_Data)

In [13]:
#this takes aboout an hour to run on my local machine, onnly rerun if you want to overwrite
W2Vec_Data=FunctionText2Vec(corpus)
output_dir = '/home/marios/local_data_s2ds/'
file_name = 'w2v_data_ALL_dirty.npy'
np.save(op.join(output_dir, file_name), W2Vec_Data)



In [35]:
#load saved embeddings if you have them 
word_embedding_dir = '/home/marios/local_data_s2ds/w2v_data_ALL_dirty.npy'
W2Vec_Data = pd.DataFrame(np.load(word_embedding_dir))

In [36]:

# Adding the target variable
W2Vec_Data.reset_index(inplace=True, drop=True)
W2Vec_Data['Category']=CountVectorizedData['Category']
 
# Assigning to DataForML variable
DataForML=W2Vec_Data.copy()
DataForML.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,Category
0,0.299662,0.283447,-0.123535,0.349365,-0.561035,-0.381592,0.598633,-1.20459,1.055176,1.244751,...,0.004211,-0.700806,-0.354126,-0.878662,-0.90564,-0.141602,-0.482178,0.404663,0.433105,History
1,0.200317,0.228638,-0.69474,0.358398,-0.019562,-0.473206,-0.455322,-0.468185,0.35257,0.208008,...,-0.106201,-0.200439,-0.055298,0.016968,0.594482,0.263367,-0.12085,0.468994,-0.409668,Business
2,0.11026,-0.127075,-0.20105,0.694519,-0.411407,0.086578,0.196777,-0.756042,0.694824,0.215088,...,1.164688,-0.711182,0.07959,-0.189392,0.060669,0.70467,0.462891,-0.134277,0.358215,Business
3,0.166748,0.611023,0.495667,0.155273,-0.469604,0.094238,0.381165,-0.810547,0.07019,0.612061,...,-0.231689,-0.383667,0.481567,-0.483643,0.057495,-0.515137,-0.443268,0.560913,0.461304,Politics
4,0.175186,-0.080078,0.26825,0.485107,-0.498779,0.637207,0.168701,-0.880371,0.427612,0.146729,...,-0.191406,-0.310791,-0.050293,0.201172,-0.698975,-0.377197,-1.09668,0.198547,-0.334229,Politics


In [37]:
# let's code the y columnns
# let's changee names to numbers 
primary_categories = np.unique(DataForML.Category).tolist()

primary_categories = sorted(primary_categories)
prim_cat_dict = {}
for i, cat in enumerate(primary_categories):
    prim_cat_dict[cat] = i+1

DataForML.Category = DataForML.Category.apply(lambda x: prim_cat_dict[x] if x in prim_cat_dict else x)
DataForML.Category  = DataForML.Category.astype(int)
np.unique(DataForML.Category)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21])

In [38]:

# Separate Target Variable and Predictor Variables
TargetVariable=DataForML.columns[-1]
Predictors=DataForML.columns[:-1]
X=DataForML[Predictors].values
y=DataForML[TargetVariable].values

PredictorScaler=MinMaxScaler()
# Storing the fit object for later reference
PredictorScalerFit=PredictorScaler.fit(X)
# Generating the standardized values of X
X=PredictorScalerFit.transform(X)
 
# Split the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0, stratify=y)

In [41]:
#let's balance classes- if you like 
smote_sampler = SMOTE(random_state = 5)
X_smo_train, y_smo_train = smote_sampler.fit_resample(X_train, y_train)
X_smo_test, y_smo_test = smote_sampler.fit_resample(X_test, y_test)
elements_count = collections.Counter(y_smo_train)
for key, value in elements_count.items():
    print(f"{key}: {value}")

4: 20314
19: 20314
15: 20314
10: 20314
14: 20314
1: 20314
18: 20314
16: 20314
21: 20314
3: 20314
11: 20314
2: 20314
12: 20314
7: 20314
17: 20314
9: 20314
6: 20314
8: 20314
20: 20314
5: 20314
13: 20314


In [43]:
#let's try random forest on this data - not the best model

model = RandomForestClassifier(max_depth= 10, max_features = 'auto', n_estimators= 20)
model.fit(X_smo_train, y_smo_train)
preds = model.predict(X_smo_test)
evaluate_metrics(y_smo_test, preds)

{'accuracy': 0.5990548528026042,
 'recall': 0.5990548528026042,
 'precision': 0.5937706185645542,
 'f1score': 0.5848436441831409}

In [44]:
# try a SVC- performs better- need to try with balanced classes (maybe balance both train and teest data?)
model = SVC(C=20, kernel='rbf')
model.fit(X_smo_train, y_smo_train)
preds = model.predict(X_smo_test)
evaluate_metrics(y_smo_test, preds)

In [None]:
# if you'd like to optimize hyperparameters- run this over the weekend maybe
params_grid = {
    'C': [25, 50, 150],
    'kernel': ['poly', 'rbf', 'sigmoid']
}
model = SVC()
# Define a GridSearchCV to search the best parameters
grid_search_balanced = GridSearchCV(estimator = model, 
                           param_grid = params_grid, 
                           scoring='f1',
                           cv = 3, verbose = 1)
# Search the best parameters with training data
model_fit_balanced = grid_search_balanced.fit(X_train, y_train)
best_params_balanced = grid_search_balanced.best_params_

In [None]:
#finally let's try to train only on one keyword
# we need to reload the dataset, and remake emeddings only with one keyword at a time
