## SGD classifier

In [2]:
# imports
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import re
import math
import numpy as np
from numpy import nan
import os
from pprint import pprint
import pickle
from sklearn.metrics import hinge_loss

import gensim as ge
import nltk
import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers= 16, verbose = 0)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Directory for files written by this notebook. It can be overridden with the
# ULLRICH_DATA_DIR environment variable; the original server location stays
# the default. Joining with '' guarantees a trailing separator, so building
# file names by plain concatenation (path + "name.csv") keeps working even
# when the override is given without one.
path = os.path.join(os.environ.get('ULLRICH_DATA_DIR', '/home/ubuntu/ullrich/data/'), '')

## Preprocess the training dataset

In [5]:
# Database connection URL. It is read from the ULLRICH_DB_URL environment
# variable so that credentials do not have to live in the notebook.
# NOTE(review): the fallback still embeds the original credentials to stay
# backward compatible -- remove it once the variable is set on every machine.
connect_string = os.environ.get(
    'ULLRICH_DB_URL',
    'postgresql+psycopg2://postgres:5050@localhost:5432/postgres',
)

# One query per keyword table, plus the class label of every training record.
sql_query_agro = 'SELECT * FROM publ.corpus_keywords_agro'
sql_query_mesh = 'SELECT * FROM publ.corpus_keywords_mesh'
sql_query_class = 'SELECT dbrecordid, class FROM ke_stage.corpus_small_train'

# A single engine is shared by all three reads.
engine = create_engine(connect_string)

# Load each query result into its own DataFrame.
df_agro = pd.read_sql(sql_query_agro, engine)
df_mesh = pd.read_sql(sql_query_mesh, engine)
df_class = pd.read_sql(sql_query_class, engine)

In [6]:
# Merge the two id columns: fall back to id2 wherever id1 is missing.
def join_id(df):
    """Return a copy of ``df`` with ``id1`` completed from ``id2``.

    Missing values in ``id1`` are filled with the value of ``id2`` from the
    same row, after which the ``id2`` column is dropped. The input frame is
    not modified; the previous implementation overwrote ``id1`` on the
    caller's frame as a side effect.
    (Per the original note, ``id1`` presumably is the title id and ``id2``
    the abstract id -- confirm against the table schema.)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``id1`` and ``id2``.

    Returns
    -------
    pandas.DataFrame
        New frame with the completed ``id1`` column and without ``id2``.

    Raises
    ------
    KeyError
        If ``id1`` or ``id2`` is not a column of ``df``.
    """
    merged_id = df['id1'].fillna(df['id2'])
    # assign() builds a new frame, so the caller's data stays untouched.
    return df.assign(id1=merged_id).drop(columns=['id2'])

In [7]:
# Collapse id1/id2 into a single id column in both keyword tables.
df_agro, df_mesh = join_id(df_agro), join_id(df_mesh)

In [8]:
# Outer-join the AGROVOC and MeSH keyword tables on the document id, so that
# documents present in only one of the two tables are kept.
result = pd.merge(df_agro, df_mesh, on=['id1'], how='outer')

# Per document, collect every list-valued cell of the row into one list
# (a list of keyword lists); non-list cells such as ids or NaN are skipped.
result['keywords'] = result.apply(
    lambda row: [cell for cell in row if isinstance(cell, list)], axis=1
)
result = result.rename(columns={'id1': 'dbrecordid'})

# Flatten the collected lists into a single lower-cased keyword list per
# document. A comprehension keeps the loop temporaries out of the notebook
# namespace (the old loop left names such as `x` and `temp_list` behind).
result['keywords_all'] = [
    [keyword.lower() for keyword_group in keyword_groups for keyword in keyword_group]
    for keyword_groups in result['keywords']
]

In [9]:
# Combine the tokens into one string of tokens separated by commas.
def combine_tokens(df):
    """Return a copy of ``df`` with a comma-joined ``combined`` column.

    Each list in ``keywords_all`` is joined with ``','``; any value that is
    not a list (e.g. NaN/None) becomes the empty string. The input frame is
    not modified; the previous implementation added the column to the
    caller's frame in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``keywords_all``.

    Returns
    -------
    pandas.DataFrame
        New frame with the additional (or replaced) column ``combined``.

    Raises
    ------
    KeyError
        If ``keywords_all`` is not a column of ``df``.
    """
    joined = df['keywords_all'].apply(
        lambda tokens: ','.join(tokens) if isinstance(tokens, list) else ''
    )
    # assign() returns a new frame instead of writing into the argument.
    return df.assign(combined=joined)

In [10]:
# Add the comma-joined keyword string (column 'combined') to the merged table.
result = combine_tokens(result)

In [11]:
# Attach the Averbis class label to every document; the inner join keeps only
# documents that appear in both the keyword table and the class table.
final_df = result.merge(df_class, on='dbrecordid', how='inner')

In [12]:
# Keep only the documents whose class is something other than "Rest".
final_df = final_df.loc[final_df['class'].ne('Rest')]

In [14]:
# Features are the comma-joined keyword strings, targets are the class labels.
x = final_df['combined']
y = final_df['class']

# Hold out 25 % of the documents for evaluation. The split is seeded so that
# the confusion matrix and scores below can be reproduced after
# "Restart & Run All"; without a seed every run evaluates on different data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

## Train the SGD model

In [None]:
# Text classification pipeline:
#   vect  - token counts; case is left as-is (lowercase=False), no stop-word list
#   tfidf - TF-IDF weighting with L2-normalised rows
#   clf   - linear model with hinge loss and L2 penalty, trained by SGD with a
#           fixed seed; tol=None together with max_iter=5 leaves the epoch
#           limit as the only stopping rule
sgd_model = Pipeline(
    steps=[
        (
            'vect',
            CountVectorizer(lowercase=False, stop_words=None, tokenizer=None),
        ),
        (
            'tfidf',
            TfidfTransformer(use_idf=True, norm="l2"),
        ),
        (
            'clf',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                max_iter=5,
                tol=None,
                random_state=42,
            ),
        ),
    ]
)

In [39]:
# Fit the whole pipeline (all steps of sgd_model) on the training keyword
# strings and their class labels.
sgd_model.fit(x_train, y_train)

In [17]:
# Predict a class label for every held-out test document.
y_pred = sgd_model.predict(x_test)

In [18]:
# Confusion matrix of true (rows) versus predicted (columns) classes,
# printed beneath a short heading.
matrix = metrics.confusion_matrix(y_test, y_pred)
print("Confusion Matrix", matrix, sep="\n")

Confusion Matrix
[[14303  2450  5297  2475]
 [ 1863 16766  1791  3658]
 [ 1955  2334 19141   686]
 [ 1057  1440   659 20861]]


In [20]:
# Per-class precision / recall / F1 as a table, persisted as CSV.
#
# No target_names are passed on purpose: classification_report assigns display
# names positionally to the *sorted* labels, and the previously hard-coded
# list (Medizin, Landwirtschaft, Umweltwissenschaften, Ernaehrung) was not in
# sorted order, so rows could be attributed to the wrong class. Without
# target_names the rows are named after the labels actually found in the data.
# NOTE(review): this presumes the values of the 'class' column are already the
# human-readable names; if they are codes, pass labels= and target_names= in
# explicitly matching order instead.
report_dict = metrics.classification_report(y_test, y_pred, output_dict=True)
f1_matrix = pd.DataFrame(report_dict).transpose()
# os.path.join works whether or not `path` ends with a separator.
f1_matrix.to_csv(os.path.join(path, "F1_matrix.csv"))

In [22]:
# Share of test documents whose predicted class equals the true class,
# reported as a percentage with two decimals.
accuracy = metrics.accuracy_score(y_test, y_pred)
accuracy_message = 'SGD Classifier Accuracy of the model: {:.2f}%'
print(accuracy_message.format(accuracy * 100))

SGD Classifier Accuracy of the model: 73.47%
