#### Importaciones necesarias

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import time
import re

#### Convertimos el JSON para poder trabajar con Pandas

In [2]:
df = pd.read_json("SMART2022-AT-dbpedia-train.json")
df.to_csv(index = None, encoding='latin-1')
df.head()

Unnamed: 0,id,question,category,type
0,0,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,1,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,2,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,3,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,5,Which is the hierarchical BrainInfo ID of the ...,literal,[string]


#### Eliminamos las columnas que no necesitamos

In [3]:
unrelevant_features = ["id","type"]
df.drop(unrelevant_features,inplace=True,axis=1)
df.head()

Unnamed: 0,question,category
0,Was Jacqueline Kennedy Onassis a follower of M...,boolean
1,What is the name of the opera based on Twelfth...,resource
2,When did Lena Horne receive the Grammy Award f...,literal
3,Do Prince Harry and Prince William have the sa...,boolean
4,Which is the hierarchical BrainInfo ID of the ...,literal


#### Cantidad por categoria

In [4]:
df["category"].value_counts()

resource    30226
literal      4217
boolean      2227
Name: category, dtype: int64

#### Cambiamos las categorías por valores enteros

In [5]:
import warnings as wrn
wrn.filterwarnings('ignore')

boolean = df[(df["category"] == "boolean")]
boolean["category"] = 0 

literal = df[(df["category"] == "literal")]
literal["category"] = 1

resource = df[(df["category"] == "resource")]
resource["category"] = 2

data = pd.concat([boolean, literal, resource],axis=0)

data.reset_index(inplace=True)
data.drop('index', axis=1, inplace=True)
data.head()

Unnamed: 0,question,category
0,Was Jacqueline Kennedy Onassis a follower of M...,0
1,Do Prince Harry and Prince William have the sa...,0
2,Did Buddhism was named after the immigration o...,0
3,Did Steve Sampson manage a club of Santa Clara...,0
4,Is the number of injured in the Newhall massac...,0


#### Verificamos nulos

In [6]:
# número de nulos por columa
nulls = data.isnull().sum()

# verificamos la cantidad
nulls[0:2]

question    0
category    0
dtype: int64

## Data Preprocessing

In [7]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

cleanedData = []

lemma = WordNetLemmatizer()
swords = stopwords.words("english")
for text in data["question"]:
       
    # Tokenizing and lemmatizing
    text = nltk.word_tokenize(text.lower())
    text = [lemma.lemmatize(word) for word in text]
    
    # Removing stopwords
    text = [word for word in text if word not in swords]
    
    # Joining
    text = " ".join(text)
    
    cleanedData.append(text)

[nltk_data] Downloading package punkt to /home/mariano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mariano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mariano/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### Bolsa de palabras

In [8]:
vectorizer = CountVectorizer(max_features=10000)
BOW = vectorizer.fit_transform(cleanedData)

#### Dividimos en entrenamiento y testeo (70-30)

In [9]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(BOW,np.asarray(data["category"]), test_size = 0.30)

#### Support Vector Machine Classifier Modeling

In [10]:
from sklearn.svm import SVC
start_time = time.time()

model = SVC()
model.fit(x_train,y_train)

end_time = time.time()
process_time = round(end_time-start_time,2)
print("Fitting SVC took {} seconds".format(process_time))

Fitting SVC took 45.08 seconds


In [11]:
predictions = model.predict(x_test)

In [12]:
from sklearn.metrics import accuracy_score
print("Accuracy of model is {}%".format(accuracy_score(y_test,predictions) * 100))

Accuracy of model is 92.05526770293609%
