### Article Detection Model

In [425]:
import pandas as pd

import numpy as np

import nltk
nltk.download('stopwords') 
nltk.download('wordnet')
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import string

from sklearn import model_selection, preprocessing, metrics, svm, ensemble, linear_model, svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline, make_pipeline

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.model_selection import GridSearchCV


[nltk_data] Downloading package stopwords to /home/mazz76/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mazz76/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [301]:
#Importing data

news_articles = pd.read_csv('articles_with_targets.csv')
news_articles.head()

for x in news_articles['text']:
    print(x)
    break

Europe must resign itself to a long-term terror threat - Business Insider


In [302]:
#I want just the headline. I need to remove the publication info that comes after the hyphen

expanded_df = news_articles['text'].str.split(' - ', expand=True)

headline_only = expanded_df[expanded_df.columns[0]]

news_articles['text'] = headline_only


In [303]:
#The dataset is extremely imbalanced, with only 73 examples of articles in which a local/elected official resigned
#because of a threat. There are more than 4,000 articles that do not fall into this category. 

news_articles['target'].value_counts()

#I will address the imbalanced data later in the notebook. For now, I will start with preprocessing the text.

0    4259
1      73
Name: target, dtype: int64

In [223]:
news_articles.shape

(4332, 2)

In [224]:
#First I will make a training (65%) and a testing set (35%).

news_articles_negative = news_articles[news_articles['target']==0]
news_articles_positive = news_articles[news_articles['target']==1]

#Splitting each into 65%/35%
a, b = np.split(news_articles_negative, [int(.65*len(news_articles_negative))])
c, d = np.split(news_articles_positive, [int(.65*len(news_articles_positive))])

#Making the training and testing datasets
train = pd.concat([a, c], axis=0)
test = pd.concat([b, d], axis=0)

In [225]:
train['target'].value_counts()

0    2768
1      47
Name: target, dtype: int64

In [226]:
test['target'].value_counts()

0    1491
1      26
Name: target, dtype: int64

In [None]:
#In both the training and testing datasets, only about 2% of the samples are cases in which there was a threat that
#led to a resignation

In [304]:
#Functions to clean the text

def de_contraction(text):
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

lemmatizer = WordNetLemmatizer()

def clean_text(text):

    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    #Remove numbers
    text = re.sub(r'\d+', '', text)
    #Remove station call letters
    text = re.sub(r'[A-Z]{4}', '', text)
    text = re.sub(r'[A-Z]{3}', '', text)
    #Get rid of contractions
    text=de_contraction(text)
    #Lowercase
    text=text.lower()
    #Tokenize texts
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)

    texts_clean = []
    for word in tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation+'...'):  # remove punctuation
            # 
            stem_word = lemmatizer.lemmatize(word,"v")  # lemmatize word
            texts_clean.append(stem_word)

    return " ".join(texts_clean)

In [116]:
#Cleaning the text

train["text"]= train["text"].apply(clean_text)
train = train.sample(frac=1).reset_index(drop=True)
train.head()

Unnamed: 0,text,target
0,school report card entire school board resign ...,0
1,update big change turlock city council hold cl...,0
2,city hall shocker sonoma mayor resign sonoma s...,0
3,boise school board trustee resign cite profess...,0
4,ledyard school board member resign bring total...,0


In [150]:
for x in train['text']:
    print(x)

school report card entire school board resign hot mic incident l parent host zoom blackout slow reopen yahoo lifestyle
update big change turlock city council hold close door meet modesto bee
city hall shocker sonoma mayor resign sonoma sun sonoma ca sonoma valley sun
boise school board trustee resign cite professional responsibilites idaho press
ledyard school board member resign bring total vacant seat three theday com
op ed resign immigration judge los angeles time
andy dance resign school board seat hold since run county commission flaglerlive com
john rich call nashville mayor resign amid email controversy iheart
norristown school board president resign report send suggestive message teenage girl philadelphia inquirer
social media vitriol lead ignace mayor resign tbnewswatch com
commissioner quit panel amid news facebook group los angeles time
monk resign school board news corsicanadailysun com corsicana daily sun
embattle hackensack school board member resign new jersey globe new 

crestline mayor allen moore say plan resign end week racial slur video telegraph forum
crabtree resign visalia unify school board friday last day visalia time delta tulare advance register
caddo commissioner quit
snow hill mayor resign name interim town manager delmarva
mount pleasant mayor resign one open city commission vacancy central michigan life
long time caddo commissioner resign news radio
employee resign controversial social media post com
benjamin resign rep removal threats antigua observer
conejo valley unify school board president resign end school year vc star
pantego mayor resign years role dallas fort worth


In [117]:
X = train['text']
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [118]:
#word level tf-idf
vectorizer = TfidfVectorizer(min_df=3,analyzer='word',max_features=3000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)

In [119]:
rf=RandomForestClassifier(n_estimators=10)
rf.fit(X_train_vectorized,y_train)

RandomForestClassifier(n_estimators=10)

In [120]:
y_train_pred=rf.predict(X_train_vectorized)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[2215    0]
 [   5   32]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2215
           1       1.00      0.86      0.93        37

    accuracy                           1.00      2252
   macro avg       1.00      0.93      0.96      2252
weighted avg       1.00      1.00      1.00      2252

0.9977797513321492


In [121]:
y_pred=rf.predict(X_test_vectorized)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[553   0]
 [ 10   0]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       553
           1       0.00      0.00      0.00        10

    accuracy                           0.98       563
   macro avg       0.49      0.50      0.50       563
weighted avg       0.96      0.98      0.97       563

0.9822380106571936


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Precision and recall are not good:

#Precision: Out of all the headlines that the model predicted were threat articles with resignations, 0% actually were.
#Recall: Out of all the headlines that actually were articles with resignations, the model predicted 0% of them.

#Rebalancing will be very important

In [15]:
#I will try to augment the minority class by adding synthetic headlines that rely on synonyms of actual headlines

import sys
!{sys.executable} -m pip install nlpaug



In [16]:
import nlpaug.augmenter.word as naw

In [170]:
#Going back to the original dataset in which there were 73 1s and 4,259 0s. 

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles = news_articles.sample(frac=1).reset_index(drop=True)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#I want no more than 3 synonymns added in each original headline
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3, stopwords=stopwords.words('english'))

#I will add 40 new headlines for each one existing
augmented_sentences=[]
augmented_sentences_labels=[]
temps=[]
for i in X_train.index:
    if y_train[i]==1:
        temps=aug.augment(X_train[i],n=40)
    for sent in temps:
        augmented_sentences.append(sent)
        augmented_sentences_labels.append(1)

In [171]:
augmented_sentences

['gay oklahoma mayor resign 2 month cite safety device concern',
 'gay oklahoma mayor relinquish 2 months cite safety business organisation',
 'gay oklahoma mayor resign deuce calendar month cite safety concern',
 'gay sooner state mayor resign two months cite safety business concern',
 'gay ok mayor resign two months cite safety concern',
 'homosexual ok mayor resign two months cite safety business organisation',
 'gay sooner state mayor resign two month cite safety concern',
 'homophile sooner state mayor resign two months cite safety worry',
 'homosexual oklahoma mayor leave office two month cite safety concern',
 'homo sooner state mayor step down two months cite safety concern',
 'gay oklahoma mayor submit two month cite safety concern',
 'gay sooner state mayor resign two months cite safety concern',
 'gay oklahoma mayor resign 2 month cite safety device concern',
 'gay oklahoma mayor resign two month cite safety business concern',
 'gay oklahoma mayor resign deuce month cite saf

In [172]:
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [173]:
vectorizer = TfidfVectorizer(min_df=3,analyzer='word',max_features=3000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)
classifier=ensemble.ExtraTreesClassifier(n_estimators=300)

classifier.fit(X_train_vectorized,y_train)

ExtraTreesClassifier(n_estimators=300)

In [174]:
y_train_pred=classifier.predict(X_train_vectorized)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[  3410      0]
 [     0 136895]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3410
           1       1.00      1.00      1.00    136895

    accuracy                           1.00    140305
   macro avg       1.00      1.00      1.00    140305
weighted avg       1.00      1.00      1.00    140305

1.0


In [175]:
y_pred=classifier.predict(X_test_vectorized)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[842   7]
 [ 15   3]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       849
           1       0.30      0.17      0.21        18

    accuracy                           0.97       867
   macro avg       0.64      0.58      0.60       867
weighted avg       0.97      0.97      0.97       867

0.9746251441753172


In [438]:
#Trying an SVM model

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles = news_articles.sample(frac=1).reset_index(drop=True)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#I want no more than 2 synonymns added in each original headline
aug = naw.SynonymAug(aug_src='wordnet',aug_max=2, stopwords=stopwords.words('english'))

#I will add 20 new headlines for each one existing
augmented_sentences=[]
augmented_sentences_labels=[]
temps=[]
for i in X_train.index:
    if y_train[i]==1:
        temps=aug.augment(X_train[i],n=20)
    for sent in temps:
        augmented_sentences.append(sent)
        augmented_sentences_labels.append(1)
        
X_train=X_train.append(pd.Series(augmented_sentences),ignore_index=True)
y_train=y_train.append(pd.Series(augmented_sentences_labels),ignore_index=True)

vectorizer = TfidfVectorizer(min_df=3,analyzer='word',max_features=10000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)
classifier=svm.SVC()

classifier.fit(X_train_vectorized,y_train)


SVC()

In [439]:
y_train_pred=classifier.predict(X_train_vectorized)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[ 2989     1]
 [    0 59882]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2990
           1       1.00      1.00      1.00     59882

    accuracy                           1.00     62872
   macro avg       1.00      1.00      1.00     62872
weighted avg       1.00      1.00      1.00     62872

0.9999840946685329


In [440]:
y_pred=classifier.predict(X_test_vectorized)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1267    2]
 [  22    9]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1269
           1       0.82      0.29      0.43        31

    accuracy                           0.98      1300
   macro avg       0.90      0.64      0.71      1300
weighted avg       0.98      0.98      0.98      1300

0.9815384615384616


In [None]:
#Precision and recall are always tradeoffs. 
#In this model, precision is perfect but recall is really low. I actually want it to be the other way around 
#where recall is fairly high. 

In [179]:
#Trying Random Forest

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles = news_articles.sample(frac=1).reset_index(drop=True)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#I want no more than 3 synonymns added in each original headline
aug = naw.SynonymAug(aug_src='wordnet',aug_max=3, stopwords=stopwords.words('english'))

#I will add 20 new headlines for each one existing
augmented_sentences=[]
augmented_sentences_labels=[]
temps=[]
for i in X_train.index:
    if y_train[i]==1:
        temps=aug.augment(X_train[i],n=20)
    for sent in temps:
        augmented_sentences.append(sent)
        augmented_sentences_labels.append(1)
        

vectorizer = TfidfVectorizer(min_df=3,analyzer='word',max_features=10000)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized=vectorizer.transform(X_test)
classifier=RandomForestClassifier()

classifier.fit(X_train_vectorized,y_train)


RandomForestClassifier()

In [180]:
y_train_pred=classifier.predict(X_train_vectorized)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[2984    0]
 [   0   48]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2984
           1       1.00      1.00      1.00        48

    accuracy                           1.00      3032
   macro avg       1.00      1.00      1.00      3032
weighted avg       1.00      1.00      1.00      3032

1.0


In [181]:
y_pred=classifier.predict(X_test_vectorized)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1275    0]
 [  24    1]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1275
           1       1.00      0.04      0.08        25

    accuracy                           0.98      1300
   macro avg       0.99      0.52      0.53      1300
weighted avg       0.98      0.98      0.97      1300

0.9815384615384616


### Oversampling with SMOTE

In [433]:
#Attempting to oversample with SMOTE and Random Forest

#Caveat: ordinarily, I would only use the train subset from above to cross validate and then test on the test subset, but
#in this case, there is so little data at this moment that the models will perform better if I use the full 
#news_articles sample from above and just split that into train and test. 

#I will have to test on never before seen data as I continue to scrape news articles

#Using the training dataset from above

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('smote', SMOTE(random_state=12)),
    ('rf', RandomForestClassifier())
])

textclassifier.fit(X_train, y_train)


Pipeline(steps=[('vect', CountVectorizer()), ('smote', SMOTE(random_state=12)),
                ('rf', RandomForestClassifier())])

In [434]:
y_train_pred=textclassifier.predict(X_train)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[3829    2]
 [   0   67]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3831
           1       0.97      1.00      0.99        67

    accuracy                           1.00      3898
   macro avg       0.99      1.00      0.99      3898
weighted avg       1.00      1.00      1.00      3898

0.9994869163673679


In [435]:
y_pred=textclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[361  67]
 [  2   4]]
              precision    recall  f1-score   support

           0       0.99      0.84      0.91       428
           1       0.06      0.67      0.10         6

    accuracy                           0.84       434
   macro avg       0.53      0.76      0.51       434
weighted avg       0.98      0.84      0.90       434

0.8410138248847926


In [None]:
#The accuracy went down but the recall is at 67%, the highest it's been! In this problem I care more about recall 
#because recall is the share of all of the true threat/resignation articles that were actually classified as 
#as such. So of all of the true threat/resignation articles, the model predicted 67% of them correctly. 
#That's good if our goal is to be aware of as many of these articles as possible and not let too many of them 
#slip through the cracks.

#The downside is we have quite a few false positives (276) — articles that were classified as threat/resignation
#articles but actually weren't. But I would rather err on the side of having to check more articles.
#We have relatively few false negatives (9) — articles classified as not a threat/resignation articles but actually are
#ones — which is good.


In [347]:
#Oversampling with SMOTE plus SVC

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

textclassifier = Pipeline([
  ('vect', CountVectorizer()),
   ('smote', SMOTE(random_state=12)),
   ('svm', svm.SVC())
])


In [348]:
textclassifier.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('smote', SMOTE(random_state=12)),
                ('svm', SVC())])

In [349]:
y_train_pred=textclassifier.predict(X_train)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[2982    1]
 [  38   11]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2983
           1       0.92      0.22      0.36        49

    accuracy                           0.99      3032
   macro avg       0.95      0.61      0.68      3032
weighted avg       0.99      0.99      0.98      3032

0.9871372031662269


In [350]:
y_pred=textclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1214   62]
 [  22    2]]
              precision    recall  f1-score   support

           0       0.98      0.95      0.97      1276
           1       0.03      0.08      0.05        24

    accuracy                           0.94      1300
   macro avg       0.51      0.52      0.51      1300
weighted avg       0.96      0.94      0.95      1300

0.9353846153846154


In [None]:
#SVC did not help our performance at all

In [409]:
#Oversampling with SMOTE and Extremely Randomized Trees

news_articles["text"]= news_articles["text"].apply(clean_text)
news_articles.head()

X = news_articles['text']
y = news_articles['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

textclassifier = Pipeline([
  ('vect', CountVectorizer()),
   ('smote', SMOTE(random_state=12)),
   ('etc', ensemble.ExtraTreesClassifier())
])

In [410]:
textclassifier.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('smote', SMOTE(random_state=12)),
                ('etc', ExtraTreesClassifier())])

In [411]:
y_train_pred=textclassifier.predict(X_train)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[3829    2]
 [   0   67]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3831
           1       0.97      1.00      0.99        67

    accuracy                           1.00      3898
   macro avg       0.99      1.00      0.99      3898
weighted avg       1.00      1.00      1.00      3898

0.9994869163673679


In [412]:
y_pred=textclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[374  54]
 [  2   4]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.93       428
           1       0.07      0.67      0.12         6

    accuracy                           0.87       434
   macro avg       0.53      0.77      0.53       434
weighted avg       0.98      0.87      0.92       434

0.8709677419354839


In [421]:
#Oversampling with SMOTE and Multinomial NaiveBayes

X = train['text']
y = train['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

textclassifier = Pipeline([
  ('vect', CountVectorizer()),
   ('smote', SMOTE(random_state=3)),
   ('etc', MultinomialNB())
])

In [422]:
textclassifier.fit(X_train, y_train)

Pipeline(steps=[('vect', CountVectorizer()), ('smote', SMOTE(random_state=3)),
                ('etc', MultinomialNB())])

In [423]:
y_train_pred=textclassifier.predict(X_train)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[2081  135]
 [   2   34]]
              precision    recall  f1-score   support

           0       1.00      0.94      0.97      2216
           1       0.20      0.94      0.33        36

    accuracy                           0.94      2252
   macro avg       0.60      0.94      0.65      2252
weighted avg       0.99      0.94      0.96      2252

0.9391651865008881


In [424]:
y_pred=textclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[517  35]
 [  6   5]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       552
           1       0.12      0.45      0.20        11

    accuracy                           0.93       563
   macro avg       0.56      0.70      0.58       563
weighted avg       0.97      0.93      0.95       563

0.9271758436944938


### Combining the synonym method above and SMOTE?

In [479]:
train["text"]= train["text"].apply(clean_text)
train = train.sample(frac=1).reset_index(drop=True)

X = train['text']
y = train['target']

#I want no more than 2 synonymns added in each original headline
aug = naw.SynonymAug(aug_src='wordnet',aug_max=2, stopwords=stopwords.words('english'))

#I will add 20 new headlines for each one existing
augmented_sentences=[]
augmented_sentences_labels=[]
temps=[]
for i in X.index:
    if y[i]==1:
        temps=aug.augment(X[i],n=20)
    for sent in temps:
        augmented_sentences.append(sent)
        augmented_sentences_labels.append(1)
        
X=X.append(pd.Series(augmented_sentences),ignore_index=True)
y=y.append(pd.Series(augmented_sentences_labels),ignore_index=True)


In [480]:
#Now applying SMOTE to the augmented sample

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

textclassifier = Pipeline([
    ('vect', CountVectorizer()),
    ('smote', SMOTE(random_state=12)),
    ('nb', MultinomialNB())
])

textclassifier.fit(X_train, y_train)


Pipeline(steps=[('vect', CountVectorizer()), ('smote', SMOTE(random_state=12)),
                ('nb', RandomForestClassifier())])

In [481]:
y_train_pred=textclassifier.predict(X_train)

print(confusion_matrix(y_train,y_train_pred))
print(classification_report(y_train,y_train_pred))
print(accuracy_score(y_train, y_train_pred))

[[ 2497     0]
 [    0 47142]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2497
           1       1.00      1.00      1.00     47142

    accuracy                           1.00     49639
   macro avg       1.00      1.00      1.00     49639
weighted avg       1.00      1.00      1.00     49639

1.0


In [482]:
y_pred=textclassifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[ 271    0]
 [   0 5245]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00      5245

    accuracy                           1.00      5516
   macro avg       1.00      1.00      1.00      5516
weighted avg       1.00      1.00      1.00      5516

1.0


In [483]:
#Testing on not before seen data

test["text"]= test["text"].apply(clean_text)
test = test.sample(frac=1).reset_index(drop=True)

X = test['text']
y = test['target']

y_predicted_test = textclassifier.predict(X)

In [484]:
print(confusion_matrix(y,y_predicted_test))
print(classification_report(y,y_predicted_test))
print(accuracy_score(y,y_predicted_test))

[[1488    3]
 [  24    2]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1491
           1       0.40      0.08      0.13        26

    accuracy                           0.98      1517
   macro avg       0.69      0.54      0.56      1517
weighted avg       0.97      0.98      0.98      1517

0.982201713909031
