## SGD classifier

In [1]:
# imports
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import re
import math
import numpy as np
from numpy import nan
import os
from pprint import pprint
import pickle

import gensim as ge
import nltk
import matplotlib.pyplot as plt

from tqdm import tqdm
# Register tqdm's progress-bar integration with pandas
tqdm.pandas()
from pandarallel import pandarallel
# Configure pandarallel: progress bars on, 16 worker processes, verbosity 0
# NOTE(review): nb_workers is hard-coded to 16 -- confirm this matches the machine's core count
pandarallel.initialize(progress_bar=True, nb_workers= 16, verbose = 0)

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt

## Preprocess the training dataset

In [3]:
# Define the connection to the database.
# The connection URL (which contains the password) is read from the
# DATABASE_URL environment variable so that credentials are never stored in
# the notebook or committed to version control.
# Expected format: postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
connect_string = os.environ.get('DATABASE_URL')
if not connect_string:
    raise RuntimeError(
        'DATABASE_URL is not set. Export it before starting the kernel, e.g. '
        'postgresql+psycopg2://<user>:<password>@localhost:5432/postgres'
    )

# SQL queries: the two keyword tables and the class label of every record
sql_query_agro = 'SELECT * FROM publ.corpus_keywords_agro'
sql_query_mesh = 'SELECT * FROM publ.corpus_keywords_mesh'
sql_query_class = 'SELECT dbrecordid, class FROM ke_stage.corpus_small'

# Create the engine
engine = create_engine(connect_string)

# Read each query result into its own DataFrame
df_agro = pd.read_sql(sql_query_agro, engine)
df_mesh = pd.read_sql(sql_query_mesh, engine)
df_class = pd.read_sql(sql_query_class, engine)

In [4]:
#merge the ids
def join_id(df):
    """Coalesce the ``id1`` and ``id2`` columns into a single ``id1`` column.

    Missing values in ``id1`` are filled with the value of ``id2`` from the
    same row, after which ``id2`` is dropped.

    The input frame is NOT modified; a new DataFrame is returned, so the cell
    that calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``id1`` and ``id2``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``id1`` filled from ``id2`` and without ``id2``.

    Raises
    ------
    KeyError
        If ``id1`` or ``id2`` is missing from ``df``.
    """
    merged_id = df['id1'].fillna(df['id2'])
    # assign() keeps id1 at its original column position and returns a new frame
    return df.assign(id1=merged_id).drop(columns=['id2'])

In [5]:
# Collapse id1/id2 into a single id column for both keyword tables
df_agro, df_mesh = join_id(df_agro), join_id(df_mesh)

In [6]:
# Outer-join the agrovoc and mesh keyword tables on the shared record id
result = df_agro.merge(df_mesh, how='outer', on=['id1'])

# For every document, gather all list-valued cells of its row into one list of lists
result['keywords'] = result.apply(
    lambda record: [cell for cell in record if isinstance(cell, list)],
    axis=1,
)
result = result.rename(columns={'id1': 'dbrecordid'})

# Flatten the per-column keyword lists into one lower-cased list per document
result['keywords_all'] = [
    [keyword.lower() for group in groups for keyword in group]
    for groups in result['keywords']
]

In [16]:
def combine_tokens(df):
    """Add a ``combined`` column holding each row's keywords as one comma-separated string.

    Rows whose ``keywords_all`` value is not a list get an empty string.
    The column is written onto ``df`` itself, and the same frame is returned.
    """
    def _as_text(tokens):
        if isinstance(tokens, list):
            return ','.join(tokens)
        return ''

    df['combined'] = df['keywords_all'].apply(_as_text)
    return df

In [17]:
# Adds the 'combined' column to `result` in place; the returned frame is shown as the cell output
combine_tokens(result)

Unnamed: 0,dbrecordid,agro_title,agro_abs,mesh_title,mesh_abs,keywords,keywords_all,combined
0,1066854,,"[history, game]","[Life, Life]","[Books, Books, Saskatchewan, Saskatchewan, Sas...","[[history, game], [Life, Life], [Books, Books,...","[history, game, life, life, books, books, sask...","history,game,life,life,books,books,saskatchewa..."
1,1066982,[Caribbean],"[Netherlands (Kingdom of the), Caribbean, Cari...","[Caribbean Region, Caribbean Region]","[Netherlands Antilles, Suriname, Caribbean Reg...","[[Caribbean], [Netherlands (Kingdom of the), C...","[caribbean, netherlands (kingdom of the), cari...","caribbean,netherlands (kingdom of the),caribbe..."
2,1067820,[growth],"[China, China]","[Growth, Growth]","[China, China, China, China]","[[growth], [China, China], [Growth, Growth], [...","[growth, china, china, growth, growth, china, ...","growth,china,china,growth,growth,china,china,c..."
3,1067917,,"[history, physicians]",,"[Hemorrhage, Hemorrhage, Drawing, Risk, Risk, ...","[[history, physicians], [Hemorrhage, Hemorrhag...","[history, physicians, hemorrhage, hemorrhage, ...","history,physicians,hemorrhage,hemorrhage,drawi..."
4,1068740,,"[living standards, towns, Syrian Arab Republic...",,"[Ribs, Commerce, Commerce, Ribs, Economics, Sy...","[[living standards, towns, Syrian Arab Republi...","[living standards, towns, syrian arab republic...","living standards,towns,syrian arab republic,sy..."
...,...,...,...,...,...,...,...,...
478532,AGRICOLACAT11102635,,,,"[Counseling, Counseling]","[[Counseling, Counseling]]","[counseling, counseling]","counseling,counseling"
478533,AGRICOLACAT11121845,,,,"[Literacy, Literacy, Weights and Measures, Wei...","[[Literacy, Literacy, Weights and Measures, We...","[literacy, literacy, weights and measures, wei...","literacy,literacy,weights and measures,weights..."
478534,AGRICOLACAT31152803,,,,"[Poster, Poster]","[[Poster, Poster]]","[poster, poster]","poster,poster"
478535,AGRICOLACAT90934190,,,,"[Conservation of Natural Resources, Conservati...","[[Conservation of Natural Resources, Conservat...","[conservation of natural resources, conservati...","conservation of natural resources,conservation..."


In [18]:
# Keep only the documents that have both keywords and a class label
final_df = result.merge(df_class, how='inner', on=['dbrecordid'])

In [26]:
# Model input (comma-joined keyword string) and target (class label)
x, y = final_df['combined'], final_df['class']

In [27]:
# Split into train (75%) and test (25%) data.
# A fixed random_state makes the split, and therefore every metric computed
# from it, reproducible after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

## Train the SGD model

In [28]:
# Three-stage text classifier: token counts -> TF-IDF weights -> linear model fitted with SGD
sgd_steps = [
    # lowercase=False: the keywords were already lower-cased during preprocessing
    ('vect', CountVectorizer(lowercase=False, stop_words=None, tokenizer=None)),
    ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
    # hinge loss with L2 penalty; exactly 5 passes over the data (tol=None disables early stopping)
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
]
sgd_model = Pipeline(steps=sgd_steps)

In [29]:
# Fit all pipeline stages on the training split
sgd_model.fit(x_train, y_train)

In [31]:
# Predict the class of every document in the held-out test split
y_pred = sgd_model.predict(x_test)

In [32]:
# Rows are the true classes, columns the predicted classes
matrix = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix", matrix, sep="\n")

Confusion Matrix
[[13823  2386  5278  1250  2035]
 [ 1581 16588  1655   626  3174]
 [ 1640  2359 18806   794   517]
 [ 3016  2449  4927  8122  4548]
 [  880  1458   629  1074 20020]]


In [35]:
# Per-class precision / recall / F1 / support, first as a dict and then as a
# DataFrame with one row per class (plus the accuracy and average rows).
# NOTE(review): target_names are matched by position to the class labels --
# confirm that this order corresponds to the sorted values of the 'class' column.
report_dict = metrics.classification_report(
    y_test,
    y_pred,
    target_names=['Medizin', 'Landwirtschaft', 'Umweltwissenschaften', 'Ernährung', 'Rest'],
    output_dict=True,
)
f1_matrix = pd.DataFrame(report_dict).transpose()
#f1_matrix.to_csv("sgd_score_cvs\F1_matrix_"+str(size)+".csv")

In [36]:
# Display the per-class report table
f1_matrix

Unnamed: 0,precision,recall,f1-score,support
Medizin,0.660124,0.558009,0.604786,24772.0
Landwirtschaft,0.657211,0.702167,0.678946,23624.0
Umweltwissenschaften,0.600927,0.779814,0.678782,24116.0
Ernährung,0.684477,0.352181,0.465071,23062.0
Rest,0.660857,0.832052,0.736639,24061.0
accuracy,0.646625,0.646625,0.646625,0.646625
macro avg,0.652719,0.644845,0.632845,119635.0
weighted avg,0.652458,0.646625,0.633932,119635.0


In [37]:
# Overall share of correctly classified test documents, reported as a percentage
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'SGD Classifier Accuracy of the model: {accuracy * 100:.2f}%')

SGD Classifier Accuracy of the model: 64.66%
