# Text classification with FastText
## 1. Importing libraries and dataset

In [11]:
import fasttext as ft
import pandas as pd
import os, sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from lib.kinya_norm import normalize_text, strip_html_tags
from lib.helpers import getStopwordsFromFile

In [12]:
df  = pd.read_csv('../Data/amakuru.csv')

In [13]:
df.shape

(3182, 4)

## 2.  Clean this dataset
1. Remove Repeated Rows
2. Remove Null Rows
2. Remove articles with empty text

In [14]:
df = df.drop_duplicates(subset="Title", keep="first")
df = df.dropna(subset=['Category'])
df = df.sort_values('Title')
df.shape

(1888, 4)

In [16]:
df['Length'] = df.Body.str.len()

In [17]:
df.head()

Unnamed: 0,Title,Body,Url,Category,Length
2667,\n (Video) Abakobwa 20 babwiwe kwiyiz...,Abakobwa bitabiriye irushanwa rya Miss Rwanda ...,https://umuseke.rw/abakobwa-20-babwiwe-kwiyize...,Entertainment,1992
860,\n 2019-Q4: Ibiciro by’ibiribwa byazam...,Banki Nkuru y’u Rwanda (BNR) iratangaza ko mu ...,https://umuseke.rw/2019-q4-ibiciro-byibiribwa-...,Business,3997
1051,\n AMAFOTO: Seifu wavunikiye mu mukino...,"Umukinnyi wa APR FC, Niyonzima Olivier Seifu w...",https://umuseke.rw/amafoto-seifu-wavunikiye-mu...,Entertainment,1555
1056,\n AMAGARE: HABIMANA na Ingabire beguk...,Ku Cyumweru tariki 26 Mutarama 2020 Hakinwe Ir...,https://umuseke.rw/amagare-habimana-na-ingabir...,Entertainment,2187
995,\n AMAGARE: Ride Rwanda igiye gukinwa ...,Isiganwa ku magare ku bakinnyi batabigize umwu...,https://umuseke.rw/amagare-ride-rwanda-igiye-g...,Entertainment,1351


In [18]:
df = df[(df.Length > 50)]

In [19]:
strip_nl = lambda x: x.replace('\n', '')

In [20]:
df.shape
df.groupby(['Category']).count()

Unnamed: 0_level_0,Title,Body,Url,Length
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Business,276,276,276,276
Culture,84,84,84,84
Diaspora,48,48,48,48
Education,299,299,299,299
Entertainment,267,267,267,267
Environment,48,48,48,48
Gospel,63,63,63,63
Health,223,223,223,223
Opinion,6,6,6,6
Politics,248,248,248,248


## 3. Preprocess text
1. Getting body and category columns
2. Removing new lines in article body
3. Preprocessing article body and removing stopwords

In [21]:
stopwords = getStopwordsFromFile("stopwords_idfvalues.txt")

In [22]:
df_ft = df[['Body', 'Category']]
df_ft['Body'] = df_ft['Body'].apply(lambda x: x.replace('\n', ''))
df_ft['Body'] = df_ft['Body'].apply(lambda x: normalize_text(x, stopwords_removal=True, stopwords=stopwords))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
# df_ft['Body'][2]
df_ft.shape

(1881, 2)

## 4. Label and save articles to .txt file
1. Create dictionary of article body and category
2. Format the data per fasttext requirements
3. save each article to a text file

In [24]:
articles_list = list(zip(df_ft.Body, df_ft.Category))

In [25]:
labeled_articles = []
for ar in articles_list:
    text, category = ar
    line = '__label__' + str(category) + ' ' + str(strip_html_tags(text))
    labeled_articles.append(line)

In [26]:
# type(labeled_articles)
# labeled_articles = labeled_articles.sort(key=len)
labeled_articles.sort(key=len)

In [27]:
with open('ft_dst_0.txt', 'w') as f:
    for item in labeled_articles:
        f.write("%s\n" % item)

## 5 Split data into training and validation data

In [28]:
!wc ft_dst_0.txt

    1881  444353 3658241 ft_dst_0.txt


In [183]:
!head -n 1681 ft_dst_0.txt > ft_dst_0.train
!tail -n 200 ft_dst_0.txt > ft_dst_0.valid

## 6 Parameter tuning

In [30]:
# model = ft.train_supervised(input="ft_dst.train", autotuneValidationFile='ft_dst.valid')
# model = ft.train_supervised(input="ft_dst.train", autotuneValidationFile='ft_dst.valid')
# model = ft.train_supervised(input="ft_dst.train")
# model = ft.train_supervised(input="ft_dst_0.train", epoch=35, lr=1)
# model = ft.train_supervised(input='ft_dst_0.train', autotuneValidationFile='ft_dst_0.valid')
model = ft.train_supervised(input='ft_dst_0.train', epoch=25, lr=1)

## 7 Testing model on validation data

In [31]:
model.test("ft_dst_0.valid")

(200, 0.78, 0.78)

## 8 Predicting on some articles

In [36]:
model.predict("Umudepite akaba n’Umuyobozi w’ishyaka Envol muri Repubulika Iharanira Demokarasi ya Congo, Delly Sesanga, yamaganye igitekerezo cy’umunyapolitiki Adolphe Muzito uherutse gusaba ko igihugu cye gitera u Rwanda rukomekwa kuri RDC kugira ngo amahoro aboneke mu Burasirazuba.", k=3)

(('__label__Education', '__label__Tech', '__label__Politics'),
 array([0.60282046, 0.22759387, 0.13945426]))

In [37]:
test_article = """
Nibwo bwa mbere mu Rwanda habonetse abanduye Coronavirus barenga 20 mu bipimo by’umunsi umwe nyuma y’aho kuri uyu wa Gatanu tariki ya 24 Mata 2020 Minisante itangarije ko habonetse abantu 22 bashya banduye Coronavirus mu bipimo 1,046 byafashwe. Ibi bipimo kandi akaba atari byo byinshi bipimwe umunsi umwe kuva iyi ndwara yagera mu Rwanda.
"""
test_article = normalize_text(test_article)

In [38]:
model.predict(test_article.replace('\n', ''), k=3)

(('__label__Health', '__label__Sport', '__label__Gospel'),
 array([0.78252685, 0.20872971, 0.00677439]))

model.labels

In [39]:
model.labels

['__label__Education',
 '__label__Entertainment',
 '__label__Business',
 '__label__Tech',
 '__label__Politics',
 '__label__Health',
 '__label__Culture',
 '__label__Gospel',
 '__label__Diaspora',
 '__label__Tourism',
 '__label__Environment',
 '__label__Sport',
 '__label__Opinion']