In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the SMART-task DBpedia training questions into a DataFrame.
# The relative path uses a forward slash so it resolves on every OS; a
# backslash separator would only work on Windows and '\s' is not a valid
# string escape sequence (newer Python versions warn about it).
filename: str = 'data/smarttask_dbpedia_train.json'
df_questions = pd.read_json(filename)
# Bare last expression: render the (truncated) frame as the cell output.
df_questions


Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."
...,...,...,...,...
17566,dbpedia_7462,Is the flexural strain at break of the acrylon...,boolean,[boolean]
17567,dbpedia_17610,Where did Hilary Putnam receive their Ph.D.?,resource,"[dbo:University, dbo:EducationalInstitution, d..."
17568,dbpedia_505,Who replaced Charles Evans Hughes as the Chief...,resource,"[dbo:Person, dbo:Agent]"
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N..."


In [3]:
# List the distinct values found in the 'category' column.
df_questions.loc[:, 'category'].unique()

array(['boolean', 'resource', 'literal'], dtype=object)

In [4]:
# Discard every row that is missing the question text or its category label;
# both columns are needed to build the training features and targets.
required_columns = ['question', 'category']
df_questions = df_questions.dropna(subset=required_columns, how='any')

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training questions and turn them into the
# feature matrix in one step; the fitted `count_vect` is kept so other
# text can later be transformed with the same vocabulary.
count_vect = CountVectorizer()
train_questions = df_questions['question'].tolist()
x_train = count_vect.fit_transform(train_questions)
# Show (number of questions, number of features).
x_train.shape

(17528, 20992)

In [6]:
# Target labels: the category of each training question as a plain list.
category_labels = df_questions['category']
y_train = category_labels.tolist()
# Should equal the number of rows in x_train.
len(y_train)

17528

In [7]:
# Train the category classifier. random_state is fixed so repeated runs of
# the 'saga' solver give the same model; max_iter is set explicitly to 1000.
clf = LogisticRegression(random_state=0, solver='saga', max_iter=1000)
clf.fit(x_train, y_train)
# Sanity check: predictions for the first ten training questions.
clf.predict(x_train[:10]).tolist()

['boolean',
 'resource',
 'literal',
 'boolean',
 'resource',
 'literal',
 'resource',
 'boolean',
 'literal',
 'resource']

In [8]:
# True labels of the first ten training questions, for comparison with the
# classifier's predictions on the same rows.
y_train[:10]

['boolean',
 'resource',
 'literal',
 'boolean',
 'resource',
 'literal',
 'resource',
 'boolean',
 'literal',
 'resource']

In [9]:
# A few hand-written questions to spot-check the category classifier.
docs_new = [
    'Is Stavanger in Norway?',
    'How many cats on the roof?',
    'Who is the king of England?',
    'When did WWII end?',
    'Are there nuts in cookies?',
]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# features line up with what the classifier was trained on.
x_toy_ex = count_vect.transform(docs_new)
predicted = clf.predict(x_toy_ex)

predicted

array(['boolean', 'literal', 'resource', 'literal', 'boolean'],
      dtype='<U8')

In [10]:
# Load the held-out test questions and prepare features/labels for scoring.
# The relative path uses a forward slash so it resolves on every OS; a
# backslash separator would only work on Windows and '\s' is not a valid
# string escape sequence (newer Python versions warn about it).
filename_test: str = 'data/smarttask_dbpedia_test.json'
df_questions_test = pd.read_json(filename_test)
# Keep only rows that have both the question text and a category label.
df_questions_test = df_questions_test.dropna(subset=['question', 'category'])
# transform (not fit_transform): encode the test questions with the
# vectorizer that was fitted earlier in the notebook.
x_test = count_vect.transform(df_questions_test['question'])
y_test = df_questions_test['category']

In [11]:
# Predict a category for every test question, then report the classifier's
# score on the test set as the cell output.
predicted = clf.predict(x_test)
clf.score(X=x_test, y=y_test)

0.9390550102716275

### Classify the literal type (date, number, string) of questions labeled `literal`: train on the training set, evaluate on the test set

In [12]:
# Select the training questions whose category is 'literal', drop rows with
# no type, and explode the list-valued 'type' column so each row holds one
# type value.
is_literal = df_questions['category'] == 'literal'
x_lit_df = (
    df_questions[is_literal]
    .dropna(subset=['type'])
    .explode('type')
)
x_lit_df

Unnamed: 0,id,question,category,type
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,date
5,dbpedia_14897,Which is the hierarchical BrainInfo ID of the ...,literal,string
8,dbpedia_3712,what is the musical composer id of bedrish sme...,literal,string
10,dbpedia_14847,How often are the Paralympic games held?,literal,number
18,dbpedia_6806,In what year did Tim Hunt give a Croonian Lect...,literal,date
...,...,...,...,...
17553,dbpedia_141,What is Bandysidan player ID for Sergey Lomanov ?,literal,string
17558,dbpedia_3605,What is the total fertility rate for operators...,literal,number
17560,dbpedia_6823,What is long lives in John Keats ?,literal,number
17561,dbpedia_10069,Who is the employer of the {Hans Krebs} ?,literal,number


In [13]:
# Features and targets for the literal-type classifier. The questions are
# encoded with the existing fitted vectorizer; the target is the 'type'
# column.
literal_questions = x_lit_df['question']
x_lit_train = count_vect.transform(literal_questions)
y_lit_train = x_lit_df['type'].tolist()

In [14]:
# Train a second classifier that predicts the literal type, using the same
# hyper-parameters as the category classifier.
clf_literals = LogisticRegression(
    random_state=0,
    solver='saga',
    max_iter=1000,
).fit(x_lit_train, y_lit_train)

In [15]:
# Build the literal-only subset of the test data the same way as for
# training: filter on category, drop rows without a type, explode 'type'.
literal_rows_test = df_questions_test['category'] == 'literal'
df_lit_test = (
    df_questions_test[literal_rows_test]
    .dropna(subset=['type'])
    .explode('type')
)
# Show which literal types occur in the test subset.
df_lit_test['type'].unique()


array(['number', 'string', 'date'], dtype=object)

In [16]:
# Targets and features for evaluating the literal-type classifier; the
# questions are encoded with the existing fitted vectorizer.
y_lit_test = df_lit_test['type']
literal_test_questions = df_lit_test['question']
x_lit_test = count_vect.transform(literal_test_questions)

In [17]:
# Predict the literal type of every literal test question, then report the
# classifier's score on that subset as the cell output.
predicted = clf_literals.predict(x_lit_test)
clf_literals.score(X=x_lit_test, y=y_lit_test)

0.9318910256410257