In [37]:
import csv
import unicodedata
from time import time

import pandas as pd

from fasttext import supervised 

In [38]:
# The DBpedia dataset can be downloaded from:
# https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz
data_path = '/home/bangaru/Downloads/NLPBookTut/'

# Locations of the raw train / test CSV files inside the extracted archive
train_file = data_path + 'dbpedia_csv/train.csv'
test_file = data_path + 'dbpedia_csv/test.csv'

# The files are read without a header row, so the three column names are
# supplied here; the same names are used for both splits.
df, df_test = (
    pd.read_csv(csv_path, header=None, names=['class', 'name', 'description'])
    for csv_path in (train_file, test_file)
)

# Report how much data was loaded
print("Train:{} Test:{}".format(df.shape, df_test.shape))


Train:(560000, 3) Test:(70000, 3)


In [39]:
# The CSV only carries numeric class ids, so build a lookup from class number
# to class name. Ids are 1-based and follow the order of the list below.
class_dict = dict(enumerate([
    'Company',
    'EducationalInstitution',
    'Artist',
    'Athlete',
    'OfficeHolder',
    'MeanOfTransportation',
    'Building',
    'NaturalPlace',
    'Village',
    'Animal',
    'Plant',
    'Album',
    'Film',
    'WrittenWork',
], start=1))

# Attach the readable class name next to the numeric id (adds a column to df)
df['class_name'] = df['class'].map(class_dict)
df.head()

Unnamed: 0,class,name,description,class_name
0,1,E. D. Abbott Ltd,Abbott of Farnham E D Abbott Limited was a Br...,Company
1,1,Schwan-Stabilo,Schwan-STABILO is a German maker of pens for ...,Company
2,1,Q-workshop,Q-workshop is a Polish company located in Poz...,Company
3,1,Marvell Software Solutions Israel,Marvell Software Solutions Israel known as RA...,Company
4,1,Bergan Mercy Medical Center,Bergan Mercy Medical Center is a hospital loc...,Company


In [40]:
# Count how many training rows fall under each class name (class-balance check)
df["class_name"].value_counts()

NaturalPlace              40000
Athlete                   40000
MeanOfTransportation      40000
Company                   40000
Animal                    40000
Album                     40000
Village                   40000
Plant                     40000
WrittenWork               40000
Film                      40000
OfficeHolder              40000
Building                  40000
EducationalInstitution    40000
Artist                    40000
Name: class_name, dtype: int64

In [41]:
# Let's do some cleaning
def clean_it(text, normalize=True):
    """Return a lower-cased, punctuation-separated version of ``text``.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        values (e.g. a float NaN) are cleaned as their string representation.
    normalize : bool, default True
        When True, decompose the text with Unicode NFKD and drop every
        character that cannot be encoded as ASCII (accents are stripped,
        non-Latin characters are removed).

    Returns
    -------
    str
        The cleaned text.
    """
    # Replace characters that get in the way of whitespace tokenisation.
    # Replacements can be added to or removed from this chain as needed.
    s = (
        str(text)
        .replace(',', ' ')
        .replace('"', '')
        .replace('\'', ' \' ')
        .replace('.', ' . ')
        .replace('(', ' ( ')
        .replace(')', ' ) ')
        .replace('!', ' ! ')
        .replace('?', ' ? ')
        .replace(':', ' ')
        .replace(';', ' ')
        .lower()
    )

    # Normalizing / encoding the text. `s` is a plain Python str, so the
    # pandas-style `.normalize(...).str.encode(...)` chain cannot be used
    # here (it raises AttributeError); unicodedata does the same job.
    if normalize:
        s = unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('utf-8')

    return s

# Now let's define a small function that applies the cleaning above to a dataset
def clean_df(data, cleanit=False, shuffleit=False, encodeit=False, label_prefix='__class__',
             random_state=None):
    """Build a new frame with ``name``, ``description`` and a prefixed ``class`` label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``class``, ``name`` and ``description`` columns. It is not
        modified; the result is built on a deep copy.
    cleanit : bool, default False
        When True, run ``clean_it`` over the ``name`` and ``description`` columns.
    shuffleit : bool, default False
        When True, return the rows in random order with a fresh 0..n-1 index.
    encodeit : bool, default False
        Passed to ``clean_it`` as its ``normalize`` flag; only used when
        ``cleanit`` is True.
    label_prefix : str, default '__class__'
        Text placed in front of the class id to form the label token.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a shuffle can be reproduced.
        The default ``None`` gives a different order on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``name``, ``description`` and ``class`` (the label string,
        ``label_prefix`` + class id + a trailing space).
    """
    # Defining the new data
    df = data[['name', 'description']].copy(deep=True)
    df['class'] = label_prefix + data['class'].astype(str) + ' '

    # Cleaning it
    if cleanit:
        for column in ('name', 'description'):
            df[column] = df[column].apply(lambda x: clean_it(x, encodeit))

    # Shuffling it. `sample` returns a new frame rather than shuffling in
    # place, so the result has to be assigned back or the shuffle is lost.
    if shuffleit:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df

In [42]:
%%time
# Transform both splits: request cleaning and shuffling, leave encodeit at its default
df_train_cleaned = clean_df(df, cleanit=True, shuffleit=True)
df_test_cleaned = clean_df(df_test, cleanit=True, shuffleit=True)

CPU times: user 4.79 s, sys: 79.9 ms, total: 4.87 s
Wall time: 4.88 s


In [43]:
# Keep the prefixed label column of each cleaned split at hand
train_cats, test_cats = df_train_cleaned['class'], df_test_cleaned['class']


In [44]:
# Write the cleaned splits to disk, label column first, without header or index.
# Note: train_file / test_file are re-pointed from the raw CSVs to these new
# files; the cells below read the paths from these variables.
train_file = data_path + 'dbpedia_train.csv'
test_file = data_path + 'dbpedia_test.csv'

df_train_cleaned.to_csv(
    train_file,
    columns=['class', 'name', 'description'],
    header=None,
    index=False,
)
df_test_cleaned.to_csv(
    test_file,
    columns=['class', 'name', 'description'],
    header=None,
    index=False,
)


In [45]:
## Using fastText for feature extraction and training

In [48]:

# Train a classifier with fastText's `supervised` on the cleaned training file
# and time the call. label_prefix is the same '__class__' prefix that clean_df
# puts in front of every class id.
# NOTE(review): 'temp' is presumably the output model name - confirm against the fasttext docs.
%time model = supervised(train_file, 'temp', label_prefix="__class__")

# Also a small function to display the evaluation results.
def print_results(N, p, r, k=1):
    """Print the number of examples and the precision / recall at ``k``.

    Parameters
    ----------
    N : number of evaluated examples (printed via ``str``).
    p : precision, printed with three decimals.
    r : recall, printed with three decimals.
    k : label printed after "Precision" / "Recall"; defaults to 1, which
        reproduces the previously hard-coded output.
    """
    print("N\t" + str(N))
    print("Precision {}\t{:.3f}".format(k, p))
    print("Recall    {}\t{:.3f}".format(k, r))
    
# Evaluate the trained model on the cleaned test file; the returned object's
# nexamples / precision / recall attributes are printed in a later cell.
results = model.test(test_file)


CPU times: user 59.5 s, sys: 1.51 s, total: 1min
Wall time: 14.1 s


In [47]:
# Show example count, precision and recall from the test-set evaluation
print_results(results.nexamples, results.precision, results.recall)

N	70000
Precision 1	0.965
Recall    1	0.965
