In [1]:
import os
import pandas as pd
from training.argparser import argparser
from training.config import Config
from training.trainer import Trainer
import json
# from data.preprocessing import preprocessor_fn

from data.stanford_sentiment import StanfordSentimentDataset
from data.news_category import NewsCategoryDataset
from data.fake_news import FakeNewsDataset
from data.emotion_affect import EmotionAffectDataset

# Lookup table: short dataset key -> dataset class imported above.
dataset_map = dict(
    stan_sent=StanfordSentimentDataset,
    news_cat=NewsCategoryDataset,
    fake_news=FakeNewsDataset,
    emo_aff=EmotionAffectDataset,
)

[nltk_data] Downloading package stopwords to /Users/udit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/udit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/udit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Instantiate the Emotion Affect dataset with the constructor's default arguments.
# This is the only dataset constructor left uncommented among these loading cells.
dataset = EmotionAffectDataset()

Training on Emotion Affect Dataset
Reading data...


Pandas Apply: 100%|██████████| 1207/1207 [00:00<00:00, 4731.66it/s]

1207
Done





In [None]:
# dataset = FakeNewsDataset()

Training on Fake News Dataset

Downloading data to data/datasets/fake_news_dataset using the command:
   kaggle competitions download -c fake-news
Not downloading. Data already downloaded

Reading data...


In [26]:
# dataset = StanfordSentimentDataset()

Training on Stanford Sentiment Analysis Dataset

Downloading data to data/datasets/stanford_sentiment using the command:
   kaggle competitions download -c sentiment-analysis-on-movie-reviews
Not downloading. Data already downloaded

Reading data...
Done


In [5]:
# dataset.train_data['Sentiment'].max()

In [30]:
# dataset = NewsCategoryDataset()

Training on News Category Dataset

Downloading data to data/datasets/news_category_dataset using the command:
   kaggle datasets download rmisra/news-category-dataset
Not downloading. Data already downloaded

Reading data...
Done


In [17]:
# Split the loaded data into train and test parts with a 0.2 test fraction.
# dataset_ratio=1.0 presumably means "use every row" — TODO confirm in split_data.
dataset.split_data(dataset_ratio=1.0, test_size=0.2)


Splitting data...
Done


In [18]:
# len(dataset.data), 
len(dataset.train_data), len(dataset.test_data)

(965, 242)

In [5]:
# total = 0
# for idx in range(dataset.data['X'].values.shape[0]):
#     total += len(dataset.data['X'].values[idx])
# print(total)

In [6]:
# from data.preprocessing import LemmaTokenizer, StemTokenizer

# tokenizer = LemmaTokenizer()

In [11]:
# import time

# start = time.time()
# out = tokenizer(' '.join(['styles' for i in range(1000000)]))
# print('{:.3f} seconds'.format(time.time() - start))

In [12]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.decomposition import PCA

# vect = CountVectorizer(min_df=5, max_features=5000)
# X_train = vect.fit_transform(dataset.train_data['X'])
# print(X_train.shape)

(965, 596)


In [13]:
# X_train = PCA(n_components=1000).fit_transform(X_train.todense())
# print(X_train.shape)

In [7]:
# trainer = Trainer(dataset=dataset, models=['mnb', 'svm', 'lr', 'ada', 'rf'], transforms=['bow', 'tfidf', 'ngram'])

# Reduced run: two models ('mnb', 'lr') crossed with two transforms ('bow', 'tfidf').
# grid=False presumably disables the hyper-parameter search — TODO confirm in Trainer.
trainer = Trainer(dataset=dataset, models=['mnb', 'lr'], transforms=['bow', 'tfidf'], grid=False)

In [8]:
# trainer2 = Trainer(dataset, models=['lr'], transforms=['bow'])

# trainer2.train()

In [9]:
# trainer.transformed['bow']['X_train'].shape

In [10]:
X_train, y_train = trainer.get_train_data()

In [11]:
X_train.shape, y_train.shape

((965,), (965,))

In [12]:
trainer.train()

[92m05-17 04:36:19[0m Training mnb with bow transformation
[92m05-17 04:36:19[0m Training mnb with tfidf transformation
[92m05-17 04:36:19[0m Training lr with bow transformation
[92m05-17 04:36:19[0m Training lr with tfidf transformation


Training mnb with bow transformation
[Pipeline] .......... (step 1 of 2) Processing tranform, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s
Training mnb with tfidf transformation
[Pipeline] .......... (step 1 of 2) Processing tranform, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s
Training lr with bow transformation
[Pipeline] .......... (step 1 of 2) Processing tranform, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s
Training lr with tfidf transformation
[Pipeline] .......... (step 1 of 2) Processing tranform, total=   0.0s
[Pipeline] ............. (step 2 of 2) Processing model, total=   0.0s


In [13]:
df = trainer.evaluate()

In [14]:
df

Unnamed: 0,model,transform,precision,accuracy
0,mnb,bow,0.812435,0.812435
1,mnb,tfidf,0.729534,0.729534
2,lr,bow,0.906736,0.906736
3,lr,tfidf,0.812435,0.812435


In [15]:
# trainer.gridsearch['svm']['bow'].best_params_

In [16]:
# Hand the evaluation table to the trainer so it can pick and save the best model.
# NOTE(review): the recorded log for this cell reports "dataset: news_cat" although this
# notebook instantiates EmotionAffectDataset — presumably a default carried by the saved
# configuration; verify before trusting the saved config.
trainer.save_best(df)

[92m05-17 04:36:22[0m ['The best model precision is 0.91 ']
[92m05-17 04:36:22[0m ['The best model is lr ']
[92m05-17 04:36:22[0m ['The best feature transformation is bow ']
[92m05-17 04:36:22[0m ['The best model configuration is ', 'dataset: news_cat, model: lr, feats: bow, save_path: bow_0, continue_train: False, load_path: None, test: None, params: None']
[92m05-17 04:36:22[0m   model transform  precision  accuracy
0   mnb       bow   0.812435  0.812435
1   mnb     tfidf   0.729534  0.729534
2    lr       bow   0.906736  0.906736
3    lr     tfidf   0.812435  0.812435
[92m05-17 04:36:22[0m Saving configuration:
[92m05-17 04:36:22[0m dataset: news_cat, model: lr, feats: bow, save_path: bow_0, continue_train: False, load_path: None, test: None, params: None
[92m05-17 04:36:22[0m Saving done


In [23]:
print(df.to_markdown())

|    | model   | transform   |   precision |   accuracy |
|---:|:--------|:------------|------------:|-----------:|
|  0 | mnb     | bow         |    0.801036 |   0.801036 |
|  1 | mnb     | tfidf       |    0.731606 |   0.731606 |
|  2 | mnb     | ngram       |    0.801036 |   0.801036 |
|  3 | svm     | bow         |    0.982383 |   0.982383 |
|  4 | svm     | tfidf       |    0.941969 |   0.941969 |
|  5 | svm     | ngram       |    0.864249 |   0.864249 |
|  6 | lr      | bow         |    0.906736 |   0.906736 |
|  7 | lr      | tfidf       |    0.801036 |   0.801036 |
|  8 | lr      | ngram       |    0.905699 |   0.905699 |
|  9 | xgb     | bow         |    0.772021 |   0.772021 |
| 10 | xgb     | tfidf       |    0.846632 |   0.846632 |
| 11 | xgb     | ngram       |    0.770984 |   0.770984 |
| 12 | ada     | bow         |    0.541969 |   0.541969 |
| 13 | ada     | tfidf       |    0.60829  |   0.60829  |
| 14 | ada     | ngram       |    0.541969 |   0.541969 |
| 15 | rf     

In [39]:
print(df.to_latex())

\begin{tabular}{lllrrr}
\toprule
{} & model & transform &  precision &    recall &  f1-score \\
\midrule
0 &   mnb &       bow &   0.804145 &  0.804145 &  0.804145 \\
1 &   svm &       bow &   0.872539 &  0.872539 &  0.872539 \\
2 &    lr &       bow &   0.912953 &  0.912953 &  0.912953 \\
\bottomrule
\end{tabular}



In [18]:
# NOTE(review): predictions here are made on X_train (assigned from
# trainer.get_train_data() in an earlier cell), not on a held-out set.
y_pred = trainer.gridsearch['lr']['bow'].predict(X_train)

In [19]:
y_pred

array([1, 0, 0, 0, 0, 1, 2, 2, 2, 1, 1, 1, 5, 2, 2, 2, 2, 2, 1, 1, 1, 4,
       0, 5, 0, 0, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 1, 1, 5, 0, 4, 5, 5, 0,
       0, 5, 0, 5, 4, 4, 2, 4, 0, 1, 2, 1, 1, 0, 5, 0, 2, 2, 0, 5, 0, 5,
       1, 2, 4, 1, 1, 5, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0,
       2, 2, 0, 1, 1, 1, 1, 1, 5, 2, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 5,
       2, 0, 1, 1, 1, 1, 0, 0, 2, 0, 1, 4, 1, 0, 2, 1, 4, 4, 4, 2, 0, 2,
       2, 1, 1, 4, 0, 1, 2, 2, 5, 0, 1, 2, 5, 1, 2, 2, 2, 2, 0, 0, 0, 0,
       2, 0, 2, 2, 2, 2, 2, 0, 0, 1, 2, 0, 0, 2, 2, 2, 4, 2, 2, 2, 4, 2,
       4, 4, 1, 1, 1, 2, 0, 1, 2, 5, 2, 0, 0, 1, 0, 1, 2, 2, 5, 5, 2, 2,
       0, 0, 0, 0, 0, 4, 4, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 1,
       2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 4, 2, 2, 5, 0, 2, 0, 0, 4,
       4, 4, 0, 4, 4, 4, 4, 4, 0, 2, 0, 0, 0, 0, 0, 2, 2, 2, 4, 4, 4, 4,
       4, 4, 4, 4, 2, 4, 5, 2, 2, 2, 2, 4, 4, 0, 5, 0, 2, 2, 2, 1, 0, 0,
       0, 0, 0, 2, 2, 2, 2, 4, 4, 0, 0, 0, 0, 2, 2,

In [15]:
import pickle

# Pickle the best estimator of a grid search to test.pkl in the working directory.
# NOTE(review): `grid_search` is not defined in any cell visible in this notebook —
# presumably it leaked from a deleted cell or an earlier session. Confirm where it
# comes from; as written this cell cannot survive Restart & Run All.
best_pipeline = grid_search.best_estimator_
with open('test.pkl', 'wb') as fw:
    pickle.dump(best_pipeline, fw)

In [35]:
import pickle

# Restore the object stored in test.pkl and bind it to `loaded_pipeline`.
# pickle.load can execute arbitrary code while unpickling: only load files that
# were produced by a trusted source (e.g. the dump cell of this notebook).
with open('test.pkl', 'rb') as f:
    loaded_pipeline = pickle.load(f)

In [17]:
loaded_pipeline

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=1e-06, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=20, n_

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Keyword arguments forwarded to the vectorizer: English stop-word list, min_df of 5.
params = dict(stop_words='english', min_df=5)

# Build the bag-of-words vectorizer from the shared keyword arguments.
vect = CountVectorizer(**params)

In [29]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=5,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
import pandas as pd

# Load the COVID-19 articles CSV; the path is relative to the notebook's working directory.
# Later cells read the 'text' and 'Emotion 1-7' columns from this frame.
data = pd.read_csv('covid19_articles/covid_19_articles.csv')

In [59]:
# data

In [60]:
# data['text'][0]

In [36]:
loaded_pipeline

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=1e-06, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=20, n_

In [40]:
loaded_pipeline.predict(data['text'])

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [43]:
data['Emotion 1-7']

0     Angry-Disgusted
1     Angry-Disgusted
2               HAPPY
3               HAPPY
4               HAPPY
5           Surprised
6           Surprised
7             Fearful
8                 Sad
9             Fearful
10    Angry-Disgusted
11          Surprised
12          Surprised
13                Sad
14            Fearful
15          Surprised
16            Fearful
17          Surprised
18          Surprised
19            Fearful
Name: Emotion 1-7, dtype: object

In [53]:
from nltk import sent_tokenize
from scipy import stats


article0 = data['text'][0]

def eval_emotion(article_text):
    """Predict one emotion label for an article by majority vote over its sentences.

    The article is split into sentences with ``sent_tokenize``, every sentence is
    classified by the notebook-level ``loaded_pipeline``, and the most frequent
    predicted label is returned.

    Args:
        article_text: Article text as a single string.

    Returns:
        The most frequent label among the per-sentence predictions.

    Raises:
        ValueError: If no sentence can be extracted from ``article_text``.
    """
    import numpy as np  # local import: the surrounding cell only imports nltk and scipy

    sentences = sent_tokenize(article_text)
    if not sentences:
        # Fail early with a clear message instead of an opaque error raised
        # further down by the pipeline or by stats.mode.
        raise ValueError('article_text contains no sentences to classify')

    pred_emotions = loaded_pipeline.predict(sentences)
    # Depending on the SciPy version, stats.mode(...).mode is either a length-1
    # array or a bare scalar for 1-D input; indexing a scalar with [0] raises.
    # np.atleast_1d normalises both shapes so the [0] lookup is always valid.
    final_emotion = np.atleast_1d(stats.mode(pred_emotions).mode)[0]
    return final_emotion

print(eval_emotion(article0))

1


In [None]:
def evaluate(article_text):
    """Run every trained model on one article and collect the predictions.

    Not implemented yet. Planned steps:
      1. load all 4 models
      2. run predictions on all 4 models
      3. return a dict {"emotion": pred_emotion, "category": pred_category,
         "fake": pred_fake, "sentiment": pred_sentiment}

    Args:
        article_text: Article text to evaluate.

    Raises:
        NotImplementedError: Always, until the steps above are implemented.
    """
    # A function body consisting only of comments is a SyntaxError. Raise explicitly
    # so the cell parses and callers get a clear signal instead of a silent None.
    raise NotImplementedError('evaluate() is not implemented yet')


In [36]:
!ls output/model_dump

emo.model           emo_aff.csv         emo_aff_clean.csv   emo_aff_results.csv


In [14]:
# Load the saved results table (model / transform / precision / accuracy columns,
# per the recorded output of the next cell) from the model dump directory.
results = pd.read_csv('output/model_dump/emo_aff_results.csv')

In [16]:
print(results.to_latex())

\begin{tabular}{lllrr}
\toprule
{} &      model & transform &  precision &  accuracy \\
\midrule
0 &         lr &       bow &   0.906736 &  0.906736 \\
1 &         lr &     ngram &   0.912953 &  0.912953 \\
2 &  linearsvm &       bow &   0.960622 &  0.960622 \\
3 &  linearsvm &     ngram &   0.962694 &  0.962694 \\
\bottomrule
\end{tabular}



In [28]:
# Load emo_aff_clean.csv from the model dump directory into `clean_data`.
# NOTE(review): later cells rebind `clean_data` only when their own CSV exists, so
# this frame can remain in scope under the same name — keep that in mind when re-running.
clean_data = pd.read_csv('output/model_dump/emo_aff_clean.csv')

In [29]:
clean_data

Unnamed: 0,X,y,index
0,it is very unpleasant i am afraid of the polic...,1,35
1,pickle nearly had a fit he barked and he barke...,0,46
2,he shut the door in nutkins face,0,24
3,old mr brown turned up his eye in disgust at t...,0,51
4,and to this day if you meet nutkin up a tree a...,0,77
...,...,...,...
1202,ah said the father what fear we have had for you,2,98
1203,yes father answered he i have travelled all ov...,2,99
1204,well said they you are come back and we will n...,2,102
1205,then they hugged and kissed their dear little ...,2,103


In [43]:
# Build the Stanford Sentiment dataset with do_clean=False, then replace its
# `data` with the CSV at `data_path` when that file exists. If the file is
# missing, `dataset.data` stays as constructed and `clean_data` is not rebound.
data_path = 'output/model_dump/stan_sent_clean.csv'
dataset = StanfordSentimentDataset(do_clean=False)
if os.path.exists(data_path):
    clean_data = dataset.data = pd.read_csv(data_path)

Training on Stanford Sentiment Analysis Dataset

Downloading data to data/datasets/stanford_sentiment using the command:
   kaggle competitions download -c sentiment-analysis-on-movie-reviews
Not downloading. Data already downloaded

Reading data...
Done


In [48]:
clean_data

Unnamed: 0,X,y
0,a series of escapade demonstrating the adage t...,1.0
1,a series of escapade demonstrating the adage t...,2.0
2,a series,2.0
3,a,2.0
4,series,2.0
...,...,...
156055,hearst s,2.0
156056,forced avuncular chortle,1.0
156057,avuncular chortle,3.0
156058,avuncular,2.0


In [44]:
# NOTE(review): assuming dataset.data is a pandas DataFrame, dropna() returns a new
# object and the result is not assigned here, so dataset.data itself is unchanged.
# The recorded output for this cell (per-column counts) does not look like dropna()
# output — presumably left over from an earlier version of the cell; re-run to confirm.
dataset.data.dropna()

X    159
y      0
dtype: int64

In [54]:
# Build the Fake News dataset with do_clean=False, then replace its `data` with
# the CSV at `data_path` when that file exists. If the file is missing,
# `dataset.data` stays as constructed and `clean_data` is not rebound.
data_path = 'output/model_dump/fake_news_clean.csv'
dataset = FakeNewsDataset(do_clean=False)
if os.path.exists(data_path):
    clean_data = dataset.data = pd.read_csv(data_path)

Training on Fake News Dataset

Downloading data to data/datasets/fake_news_dataset using the command:
   kaggle competitions download -c fake-news
Not downloading. Data already downloaded

Reading data...
Done


In [57]:
# Display the data with rows containing missing values removed.
# NOTE(review): the result is only displayed, not assigned — assuming dataset.data is a
# pandas DataFrame, any rows with missing values are still present when split_data
# runs in the next cell. Confirm whether that is intended.
dataset.data.dropna()

Unnamed: 0,X,y
0,house dem aide we didnt even see comeys letter...,1
1,flynn hillary clinton big woman on campus brei...,0
2,why the truth might get you fired why the trut...,1
3,civilian killed in single u airstrike have bee...,1
4,iranian woman jailed for fictional unpublished...,1
...,...,...
20198,rapper ti trump a poster child for white supre...,0
20199,nfl playoff schedule matchup and odds the new ...,0
20200,macys is said to receive takeover approach by ...,0
20201,nato russia to hold parallel exercise in balka...,1


In [58]:
dataset.split_data(test_size=0.2)


Splitting data...
Done


In [59]:
dataset.test_data

Unnamed: 0,X,y
16162,clinton id add michelle obama to my cabinet ho...,1
16163,johnny nicholson whose midtown cafe drew the n...,0
16164,reichsbrger schlagen zu sind beamte fr reichsi...,1
16165,an open letter to black south african police o...,1
16166,u investigating mosul strike said to have kill...,0
...,...,...
20198,rapper ti trump a poster child for white supre...,0
20199,nfl playoff schedule matchup and odds the new ...,0
20200,macys is said to receive takeover approach by ...,0
20201,nato russia to hold parallel exercise in balka...,1


In [60]:
dataset.train_data

Unnamed: 0,X,y
0,house dem aide we didnt even see comeys letter...,1
1,flynn hillary clinton big woman on campus brei...,0
2,why the truth might get you fired why the trut...,1
3,civilian killed in single u airstrike have bee...,1
4,iranian woman jailed for fictional unpublished...,1
...,...,...
16157,suicide bomb attack target baghdad market afp ...,0
16158,watch mandy moore is shark bait in meter down ...,0
16159,re wannnh the left melt down after fbi reopens...,1
16160,donald trump call a black supporter a paid thu...,1
