# Tagging - TFIDF + NB

In [1]:
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [2]:
# Bring in data
with open("clean_trainig_data.pkl", "rb") as picklefile:
    df = pickle.load(picklefile)

In [3]:
# Assign doc id
df = df.assign(doc_id=[0 + i for i in xrange(len(df))])

In [4]:
df.head()

Unnamed: 0,authors,date,image,keywords,summary,text,url,valid,decode_text,error,Tag,language,doc_id
0,,,,,,United Nations S/2015/302 Security Council...,http://www.securitycouncilreport.org/atf/cf/%7...,,United Nations S/2015/302 Security Council...,all good,Conflict and violence,en,0
1,[Heather Saul],2013-08-05 12:33:51+01:00,https://static.independent.co.uk/s3fs-public/t...,"[pakistan, homes, remote, afghanistan, floodin...",Flash flooding across Afghanistan and Pakistan...,Flash flooding across Afghanistan and Pakistan...,http://www.independent.co.uk/news/world/asia/1...,True,Flash flooding across Afghanistan and Pakistan...,all good,Disasters,en,1
2,,,,,,www.unocha.org The mission of the United Natio...,http://reliefweb.int/sites/reliefweb.int/files...,,www.unocha.org The mission of the United Natio...,all good,Disasters,en,2
3,[],2015-05-11 09:15:05+00:00,http://floodlist.com/wp-content/uploads/2015/0...,"[baghlan, afghanistan, agency, province, distr...",Flash floods struck on 08 May 2015 in Faryab P...,"Afghanistan state news agency, Bakhtar News Ag...",http://floodlist.com/asia/afghanistan-flash-fl...,True,"Afghanistan state news agency, Bakhtar News Ag...",all good,Disasters,en,3
4,[],2015-07-27 11:42:21+00:00,http://floodlist.com/wp-content/uploads/2015/0...,"[province, official, hit, district, floods, fl...",According to reports from Chinese state news a...,Flash floods have struck once again in the Bad...,http://floodlist.com/asia/afghanistan-6-dead-f...,True,Flash floods have struck once again in the Bad...,all good,Disasters,en,4


In [5]:
df.shape

(494, 13)

## Dataframe for Tagging

In [6]:
documents = df[['doc_id', 'Tag', 'decode_text']]
documents = documents.rename(columns = {'Tag':'tag'})

In [9]:
documents.head()

Unnamed: 0,doc_id,tag,decode_text
0,0,Conflict and violence,United Nations S/2015/302 Security Council...
1,1,Disasters,Flash flooding across Afghanistan and Pakistan...
2,2,Disasters,www.unocha.org The mission of the United Natio...
3,3,Disasters,"Afghanistan state news agency, Bakhtar News Ag..."
4,4,Disasters,Flash floods have struck once again in the Bad...


In [10]:
# Split dataframe
d_train, d_test = train_test_split(documents, test_size = 0.2)

In [11]:
d_train.shape

(395, 3)

In [12]:
d_test.shape

(99, 3)

In [15]:
# Prep train tags for TFIDF
train_tags = d_train.tag
train_tags.head()

587                Disasters
549                Disasters
440    Conflict and violence
151                Disasters
431                Disasters
Name: tag, dtype: object

In [16]:
train_tags.shape

(395,)

In [17]:
train_tags.value_counts()

Disasters                339
Conflict and violence     56
Name: tag, dtype: int64

In [19]:
# Prep test tags
test_tags = d_test.tag
test_tags.head()

13                 Disasters
181    Conflict and violence
185                Disasters
157                Disasters
536                Disasters
Name: tag, dtype: object

In [20]:
test_tags.shape

(99,)

In [21]:
test_tags.value_counts()

Disasters                85
Conflict and violence    14
Name: tag, dtype: int64

## TFIDF

In [24]:
vectorizer = TfidfVectorizer(stop_words="english")
train_vectors = vectorizer.fit_transform(d_train.decode_text)
train_vectors.shape

(395, 40445)

In [26]:
test_vectors = vectorizer.transform(d_test.decode_text)
test_vectors.shape

(99, 40445)

## Bernoulli NB

In [28]:
document_model = BernoulliNB().fit(train_vectors, train_tags)
document_model.predict(test_vectors)

array(['Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disaste

In [31]:
test_pred = document_model.predict(test_vectors)
test_pred_series = pd.Series(test_pred)
test_pred_series.head()

0    Disasters
1    Disasters
2    Disasters
3    Disasters
4    Disasters
dtype: object

In [33]:
confusion_matrix(test_tags, test_pred, labels=["Conflict and violence", "Disasters"])

array([[ 3, 11],
       [ 1, 84]])

In [34]:
document_model.score(test_vectors, test_tags)

0.87878787878787878

In [36]:
target_names = ["Conflict and violence", "Disasters"]
print(classification_report(test_tags, test_pred, target_names=target_names))

                       precision    recall  f1-score   support

Conflict and violence       0.75      0.21      0.33        14
            Disasters       0.88      0.99      0.93        85

          avg / total       0.87      0.88      0.85        99



## Random Forest

In [39]:
document_RF = RandomForestClassifier().fit(train_vectors, train_tags)
document_RF.predict(test_vectors)

array(['Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Conflict and violence',
       'Disasters', 'Disasters', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disa

In [41]:
test_pred_RF = document_RF.predict(test_vectors)
document_RF.score(test_vectors, test_tags)

0.93939393939393945

In [42]:
confusion_matrix(test_tags, test_pred_RF, labels=["Conflict and violence", "Disasters"])

array([[ 8,  6],
       [ 0, 85]])

In [43]:
print(classification_report(test_tags, test_pred_RF, target_names=target_names))

                       precision    recall  f1-score   support

Conflict and violence       1.00      0.57      0.73        14
            Disasters       0.93      1.00      0.97        85

          avg / total       0.94      0.94      0.93        99



## Oversample

In [53]:
X_train = train_vectors.toarray()
Y_train = train_tags

sm = SMOTE(kind='regular')
X_train_over, Y_train_over = sm.fit_sample(X_train, Y_train)

In [55]:
document_RF_oversample = RandomForestClassifier().fit(X_train_over, Y_train_over)
document_RF_oversample.predict(test_vectors)

array(['Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Conflict and violence',
       'Disasters', 'Disasters', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',


In [57]:
test_pred_RF_oversample = document_RF_oversample.predict(test_vectors)
document_RF_oversample.score(test_vectors, test_tags)

0.90909090909090906

In [58]:
confusion_matrix(test_tags, test_pred_RF_oversample, labels=["Conflict and violence", "Disasters"])

array([[ 8,  6],
       [ 3, 82]])

In [59]:
print(classification_report(test_tags, test_pred_RF_oversample, target_names=target_names))

                       precision    recall  f1-score   support

Conflict and violence       0.73      0.57      0.64        14
            Disasters       0.93      0.96      0.95        85

          avg / total       0.90      0.91      0.90        99



## Undersample

In [60]:
nm3 = NearMiss(version=3)
X_train_under, Y_train_under = nm3.fit_sample(X_train, Y_train)

In [62]:
document_RF_undersample = RandomForestClassifier().fit(X_train_under, Y_train_under)
document_RF_undersample.predict(test_vectors)

array(['Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Conflict and violence',
       'Disasters', 'Disasters', 'Disasters', 'Conflict and violence',
       'Disasters', 'Disasters', 'Conflict and violence',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Conflict and violence', 'Disasters',
       'Conflict and violence', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Conflict and violence', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Disasters', 'Disasters',
       'Disasters', 'Disasters', 'Disasters', 'Conflict and violence',
       'Disasters', 'Disasters', 'Conflict and violence', 'Disasters',
       'Disasters', 'Disaster

In [64]:
test_pred_RF_undersample = document_RF_undersample.predict(test_vectors)
document_RF_undersample.score(test_vectors, test_tags)

0.84848484848484851

In [65]:
confusion_matrix(test_tags, test_pred_RF_undersample, labels=["Conflict and violence", "Disasters"])

array([[ 9,  5],
       [10, 75]])

In [66]:
print(classification_report(test_tags, test_pred_RF_undersample, target_names=target_names))

                       precision    recall  f1-score   support

Conflict and violence       0.47      0.64      0.55        14
            Disasters       0.94      0.88      0.91        85

          avg / total       0.87      0.85      0.86        99



In [89]:
savemodel1 = pickle.dumps(document_RF_oversample)

In [90]:
from sklearn.externals import joblib
joblib.dump(document_RF_oversample, 'RF_model.pkl') 

['RF_model.pkl']

In [101]:
from sklearn.externals import joblib
joblib.dump(document_RF, 'RF_model_secondtry.pkl') 

['RF_model_secondtry.pkl']