Text classification dataset DBPedia. The dataset consists of text descriptions of 14 different classes. The training set contains 560,000 reviews and the test contains 70,000. Idea is to build a model that can predict the class from description.


Download dataset from [here](https://drive.google.com/drive/folders/0Bz8a_Dbh9Qhbfll6bVpmNUtUcFdjYmF2SEpmZUZUcVNiMUw1TWN6RDV3a0JHT3kxLVhVR2M). 



In [2]:
import os,sys  
import pandas as pd
import numpy as np
import fasttext

In [9]:
# Set dataset path

data_path = './dbpedia_csv/'

#Load train set
train_file = data_path + 'dbpedia_train.csv'
df = pd.read_csv(train_file, header=None, names=['class','name','description'])

#Load test set
test_file = data_path + 'dbpedia_test.csv'
df_test = pd.read_csv(test_file, header=None, names=['class','name','description'])

#Mapping from class number to class name
class_dict={
1:'Company',
2:'EducationalInstitution',
3:'Artist',
4:'Athlete',
5:'OfficeHolder',
6:'MeanOfTransportation',
7:'Building',
8:'NaturalPlace',
9:'Village',
10:'Animal',
11:'Plant',
12:'Album',
13:'Film',
14:'WrittenWork'
}
df['class_name'] = df['class'].map(class_dict)
df.head()
df.tail()

Unnamed: 0,class,name,description,class_name
559995,14,Barking in Essex,Barking in Essex is a Black comedy play direc...,WrittenWork
559996,14,Science & Spirit,Science & Spirit is a discontinued American b...,WrittenWork
559997,14,The Blithedale Romance,The Blithedale Romance (1852) is Nathaniel Ha...,WrittenWork
559998,14,Razadarit Ayedawbon,Razadarit Ayedawbon (Burmese: ရာဇာဓိရာဇ် အရေး...,WrittenWork
559999,14,The Vinyl Cafe Notebooks,Vinyl Cafe Notebooks: a collection of essays ...,WrittenWork


In [10]:
df.head()


Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...,Company
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...,Company
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...,Company


In [11]:

df.tail()

Unnamed: 0,class,name,description,class_name
559995,14,Barking in Essex,Barking in Essex is a Black comedy play direc...,WrittenWork
559996,14,Science & Spirit,Science & Spirit is a discontinued American b...,WrittenWork
559997,14,The Blithedale Romance,The Blithedale Romance (1852) is Nathaniel Ha...,WrittenWork
559998,14,Razadarit Ayedawbon,Razadarit Ayedawbon (Burmese: ရာဇာဓိရာဇ် အရေး...,WrittenWork
559999,14,The Vinyl Cafe Notebooks,Vinyl Cafe Notebooks: a collection of essays ...,WrittenWork


In [20]:
#df.describe().transpose()
desc = df.groupby('class')
desc.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,class_name,description,name
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,count,40000,40000,40000
1,unique,1,39996,40000
1,top,Company,MegaPath Corporation—headquartered in Pleasan...,JMY Records
1,freq,40000,2,1
2,count,40000,40000,40000
2,unique,1,39992,40000
2,top,EducationalInstitution,Dr. Meghnad Saha College is a college in Itah...,RGS Springfield
2,freq,40000,2,1
3,count,40000,40000,40000
3,unique,1,40000,40000


In [5]:
def clean_dataset(dataframe, shuffle=False, encode_ascii=False, clean_strings = False, label_prefix='__label__'):
    # Transform train file
    df = dataframe[['name','description']].apply(lambda x: x.str.replace(',',' '))
    df['class'] = label_prefix + dataframe['class'].astype(str) + ' '
    if clean_strings:
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('"',''))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('\'',' \' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('.',' . '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('(',' ( '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(')',' ) '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('!',' ! '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace('?',' ? '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(':',' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.replace(';',' '))
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.lower())
    if shuffle:
        df.sample(frac=1).reset_index(drop=True)
    if encode_ascii :
        df[['name','description']] = df[['name','description']].apply(lambda x: x.str.normalize('NFKD').str.encode('ascii','ignore').str.decode('utf-8'))
    df['name'] = ' ' + df['name'] + ' '
    df['description'] = ' ' + df['description'] + ' '
    return df

In [6]:
%%time
# Transform datasets
df_train_clean = clean_dataset(df, True, False)
df_test_clean = clean_dataset(df_test, False, False)

# Write files to disk
train_file_clean = data_path + 'dbpedia.train'
df_train_clean.to_csv(train_file_clean, header=None, index=False, columns=['class','name','description'] )

test_file_clean = data_path + 'dbpedia.test'
df_test_clean.to_csv(test_file_clean, header=None, index=False, columns=['class','name','description'] )

CPU times: user 6.89 s, sys: 1.94 s, total: 8.84 s
Wall time: 11.1 s


In [13]:
df_train_clean.head()

Unnamed: 0,name,description,class
0,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a B...,__label__1
1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for...,__label__1
2,Q-workshop,Q-workshop is a Polish company located in Po...,__label__1
3,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as R...,__label__1
4,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital lo...,__label__1


In [16]:
df_train_clean.tail()

Unnamed: 0,name,description,class
559995,Barking in Essex,Barking in Essex is a Black comedy play dire...,__label__14
559996,Science & Spirit,Science & Spirit is a discontinued American ...,__label__14
559997,The Blithedale Romance,The Blithedale Romance (1852) is Nathaniel H...,__label__14
559998,Razadarit Ayedawbon,Razadarit Ayedawbon (Burmese: ရာဇာဓိရာဇ် အရေ...,__label__14
559999,The Vinyl Cafe Notebooks,Vinyl Cafe Notebooks: a collection of essays...,__label__14


In [21]:
df['description'][10]

' Angstrem Group (Russian: \xd0\x9e\xd0\x90\xd0\x9e \xc2\xab\xd0\x90\xd0\xbd\xd0\xb3\xd1\x81\xd1\x82\xd1\x80\xd0\xb5\xd0\xbc\xc2\xbb named after angstrom) is a group of Russian companies one of the largest manufacturers of integrated circuits in Eastern Europe.The group includes: OAO Angstrem (the parent company design and manufacturing of electronic products and semiconductors); OAO Angstrem-M (custom design of integrated circuits staff training); OAO Angstrem-T (under-construction plant with 130-90 nm topology); OAO Angstrem-2M NGO Angstrem OAO Antek\xe2\x86\x91'

In [22]:
df_train_clean['description'][10]

'  Angstrem Group (Russian: \xd0\x9e\xd0\x90\xd0\x9e \xc2\xab\xd0\x90\xd0\xbd\xd0\xb3\xd1\x81\xd1\x82\xd1\x80\xd0\xb5\xd0\xbc\xc2\xbb named after angstrom) is a group of Russian companies one of the largest manufacturers of integrated circuits in Eastern Europe.The group includes: OAO Angstrem (the parent company design and manufacturing of electronic products and semiconductors); OAO Angstrem-M (custom design of integrated circuits staff training); OAO Angstrem-T (under-construction plant with 130-90 nm topology); OAO Angstrem-2M NGO Angstrem OAO Antek\xe2\x86\x91 '

In [17]:
%%time
# Train a classifier
output_file = data_path + 'dp_model'
classifier = fasttext.supervised(train_file_clean, output_file, label_prefix='__label__')

CPU times: user 2min 51s, sys: 8.27 s, total: 2min 59s
Wall time: 1min 8s


In [18]:
%%time
# Evaluate classifier
result = classifier.test(test_file_clean)
print('P@1:', result.precision)
print('R@1:', result.recall)
print ('Number of examples:', result.nexamples)

('P@1:', 0.9795142857142857)
('R@1:', 0.9795142857142857)
('Number of examples:', 70000)
CPU times: user 1.87 s, sys: 448 ms, total: 2.32 s
Wall time: 2.51 s
