## SGD classifier

In [1]:
# imports
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
import re
import math
import numpy as np
from numpy import nan
import os
from pprint import pprint
import pickle

import gensim as ge
import nltk
import matplotlib.pyplot as plt

from tqdm import tqdm
# Activate tqdm's pandas integration for this kernel.
tqdm.pandas()
from pandarallel import pandarallel
# Start pandarallel with its progress bar enabled and 16 workers.
# NOTE(review): the worker count is hard-coded; confirm it suits the machine running the notebook.
pandarallel.initialize(progress_bar=True, nb_workers= 16, verbose = 0)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn import metrics

import seaborn as sns
import matplotlib.pyplot as plt

## Preprocess the training dataset

In [2]:
#define connection to db
# Credentials are read from the standard libpq environment variables so that no
# password is stored in the notebook. The defaults mirror the local development
# database; the password is prompted for when PGPASSWORD is not set.
from getpass import getpass
from urllib.parse import quote_plus

db_user = os.environ.get('PGUSER', 'postgres')
db_password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'postgres')
# quote_plus escapes characters such as '@' or '/' that would otherwise break the URL
connect_string = (
    'postgresql+psycopg2://'
    f'{quote_plus(db_user)}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)
#define sql queries
sql_query_agro = 'SELECT * FROM publ.corpus_keywords_agro'
sql_query_mesh = 'SELECT * FROM publ.corpus_keywords_mesh'
sql_query_class = 'SELECT * FROM ke_stage.corpus_small'
#create engine
engine = create_engine(connect_string)
#read data as df
df_agro = pd.read_sql(sql_query_agro, engine)
df_mesh = pd.read_sql(sql_query_mesh, engine)
df_class = pd.read_sql(sql_query_class, engine)

In [4]:
#merge the ids
def join_id(df):
    """Collapse the ``id1``/``id2`` columns into a single ``id1`` column.

    Missing ``id1`` values are filled from ``id2`` and ``id2`` is then dropped.
    The input frame is left untouched; a new frame is returned, so calling the
    function has no side effect on the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``id1`` and ``id2``.

    Returns
    -------
    pandas.DataFrame
        New frame without ``id2`` and with the gaps in ``id1`` filled.

    Raises
    ------
    KeyError
        If ``id1`` or ``id2`` is not a column of ``df``.
    """
    merged_ids = df['id1'].fillna(df['id2'])
    # drop() and assign() both return new frames, so `df` itself is never modified;
    # assigning to the existing `id1` column keeps its original position.
    return df.drop(columns=['id2']).assign(id1=merged_ids)

In [5]:
# Collapse id1/id2 into a single id column in both keyword frames.
df_agro, df_mesh = join_id(df_agro), join_id(df_mesh)

In [6]:
#merge both df (agrovoc and mesh)
# The shared id column is renamed to 'dbrecordid'. rename() returns a new frame,
# so its result has to be kept for the new column name to take effect.
result = (
    pd.merge(df_agro, df_mesh, on=['id1'], how='outer')
      .rename(columns={'id1': 'dbrecordid'})
)
#create list of all keywords for each document
# Every list-valued cell of a row is collected as one keyword list.
result['keywords'] = result.apply(lambda row: [val for val in row if isinstance(val, list)], axis=1)

# Flatten the collected lists into one lower-cased keyword list per document.
result['keywords_all'] = [
    [keyword.lower() for keyword_list in keyword_lists for keyword in keyword_list]
    for keyword_lists in result['keywords']
]

In [None]:
# Attach the rows of df_class to the keyword frame via the shared document id.
# The outer join keeps documents that appear in only one of the two frames.
classes = result.merge(df_class, how='outer', on=['dbrecordid'])

In [7]:
#split into train and test data
# A fixed seed makes the shuffle reproducible across kernel restarts
# (same seed as the classifier's random_state).
# NOTE(review): the split runs on `result`, not on the merged `classes` frame,
# so labels have to be matched to these rows by `dbrecordid` before fitting.
df_train, df_test = train_test_split(result, test_size=0.25, random_state=42)

#get list of keywords
keywords_train = df_train['keywords_all'].tolist()
keywords_test = df_test['keywords_all'].tolist()

## Train the SGD model

In [None]:
def identity_analyzer(keywords):
    """Return an already tokenised document (a list of keywords) unchanged."""
    return keywords


# Each document is a list of lower-cased keywords rather than a raw string.
# The default word analyzer applies a regular expression to every document and
# fails on lists (`tokenizer=None` only selects the default tokenizer), so a
# pass-through analyzer is supplied: every keyword becomes one feature.
# A named function is used instead of a lambda so the pipeline stays picklable.
sgd_model = Pipeline([
    ('vect', CountVectorizer(analyzer=identity_analyzer)),
    ('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

In [None]:
# `keywords_train` holds only the shuffled training rows, whereas
# `classes['class']` covers every merged row, so passing both to fit() gives
# mismatched lengths and order. The label of each training document is
# therefore looked up through its `dbrecordid`.
class_by_id = (
    classes.dropna(subset=['dbrecordid', 'class'])
           .drop_duplicates(subset='dbrecordid')  # first class wins if an id repeats
           .set_index('dbrecordid')['class']
)
labels_train = df_train['dbrecordid'].map(class_by_id)
has_label = labels_train.notna().tolist()

# Documents without a known class cannot be used for supervised training.
sgd_model.fit(
    [keywords for keywords, keep in zip(keywords_train, has_label) if keep],
    labels_train[labels_train.notna()].tolist(),
)