In [31]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec,\
    TaggedDocument
from nltk.tokenize import word_tokenize



### Get the data

In [32]:
reddit_combi_df = pd.read_csv(
    "cleaned_data/Reddit_Combi_cleaned.csv",   
    )

reddit_title_df = pd.read_csv(    
    "cleaned_data/Reddit_Title_cleaned.csv",    
)

twitter_full_df = pd.read_csv(
    "cleaned_data/Twitter_Full_cleaned.csv",  
    )

twitter_non_advert = pd.read_csv(
    "cleaned_data/Twitter_Non-Advert_cleaned.csv",
    )

df_files = [reddit_combi_df, reddit_title_df, twitter_full_df, twitter_non_advert]

In [33]:
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'Body_Title', 'label'], dtype='object')
Index(['title', 'label'], dtype='object')
Index(['text', 'hashtags', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


In [34]:
# Rename column name for consistency
df_files[0] = df_files[0].rename(columns={"Body_Title": "body_title"})
df_files[0].columns

Index(['title', 'body', 'body_title', 'label'], dtype='object')

In [35]:
# merge columns in dataframed containing text
df_files[0]["full_text"] = (
    df_files[0]["title"].fillna("") + " " +
    df_files[0]["body"].fillna("") + " " +
    df_files[0]["body_title"].fillna("") + " " 
)

df_files[1]["full_text"] = (
    df_files[1]["title"].fillna("") + " "
)

df_files[2]["full_text"] = (
    df_files[2]["text"].fillna("") + " " +
    df_files[2]["hashtags"].astype(str).fillna("")
)

df_files[3]["full_text"] = (
    df_files[3]["text"].fillna("") + " " 
)


In [36]:
# clean hashtag symbols from full text, hashtags should only be in Twitter full, but clean from all to maintain consistency
def clean_text(text):
    text = str(text) 
    text = re.sub(r"[\[\]']", "", text)  
    text = text.replace("#", "") 
    return text

for df in df_files:
    df["full_text"] = df["full_text"].apply(clean_text)

In [37]:
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'body_title', 'label', 'full_text'], dtype='object')
Index(['title', 'label', 'full_text'], dtype='object')
Index(['text', 'hashtags', 'label', 'full_text'], dtype='object')
Index(['text', 'label', 'full_text'], dtype='object')


In [38]:
# check full_text is ok

df_files[0].head()

Unnamed: 0,title,body,body_title,label,full_text
0,envy to other is swallowing me,"im from developingcountry, indonesia , and for...",envy to other is swallowing me im from develop...,1,envy to other is swallowing me im from develop...
1,nothin outta the ordinary. paradise. job stres...,um hello .well many can relate im sure. after ...,nothin outta the ordinary. paradise. job stres...,1,nothin outta the ordinary. paradise. job stres...
2,almost 49 and the chasm of emptiness has never...,i’ve been diagnosed severe bi polar where you ...,almost 49 and the chasm of emptiness has never...,1,almost 49 and the chasm of emptiness has never...
3,i’m happy again,"after my closest friend left me in april, i ha...",i’m happy again after my closest friend left m...,0,i’m happy again after my closest friend left m...
4,is it possible to recover from such a traumati...,"i am only 15, and yet i feel my life is alread...",is it possible to recover from such a traumati...,1,is it possible to recover from such a traumati...


### Merge dataframes, so they contain only one feature column "full_text"

In [39]:
# create new dataframed containing only full_text and label for all dataframes

reddit1 = df_files[0].copy()
reddit2 = df_files[1].copy()
twitter1 = df_files[2].copy()
twitter2 = df_files[3].copy()

reddit1 = reddit1.drop(columns=['title', 'body', 'body_title'])
reddit2 = reddit2.drop(columns=['title'])
twitter1 = twitter1.drop(columns=['text', 'hashtags'])
twitter2 = twitter2.drop(columns=['text'])

df_files_new = [reddit1, reddit2, twitter1, twitter2]


In [40]:
for df in df_files_new:
    print(len(df))
    print(df.columns)

3123
Index(['label', 'full_text'], dtype='object')
5480
Index(['label', 'full_text'], dtype='object')
8525
Index(['label', 'full_text'], dtype='object')
1972
Index(['label', 'full_text'], dtype='object')


## Multinomial Naive Bayesian

- Is selected as a model, because it's widely used in text classification tasks (https://towardsdatascience.com/multinomial-naive-bayes-for-documents-classification-and-natural-language-processing-nlp-e08cc848ce6/)

## Training data

- Three models are trained and data splits are the following
    - training: reddit, test and validation: twitter
        - how different datasets generalize
    - training: twitter, test and validation: reddit
        - how different datasets generalize
    - training: 80% of all datasets, test and validation: 20% of all datasets
        - typical ML data split

In [41]:
# Split data as above

X_train1 = pd.concat([reddit1, reddit2])["full_text"]
y_train1 = pd.concat([reddit1, reddit2])["label"]
X_test1 = pd.concat([twitter1, twitter2])["full_text"]
y_test1 = pd.concat([twitter1, twitter2])["label"]

X_train2 = pd.concat([twitter1, twitter2])["full_text"]
y_train2 = pd.concat([twitter1, twitter2])["label"]
X_test2 = pd.concat([reddit1, reddit2])["full_text"]
y_test2 = pd.concat([reddit1, reddit2])["label"]

X_3 = pd.concat([reddit1, reddit2, twitter1, twitter2])["full_text"]
y_3 = pd.concat([reddit1, reddit2, twitter1, twitter2])["label"]

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

In [42]:
# check some lengths
print(len(X_train1))
print(len(X_test1))
print(len(X_train3))

8603
10497
15280


## Train the three multinomial NB models

In [43]:
# Pipeline for embedding + training
# TfidfVectorizer collects a document of TF-IDF features, used in text classification: https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', MultinomialNB())
])

In [44]:
# train: reddit, test: twitter
pip.fit(X_train1, y_train1)
y_pred1 = pip.predict(X_test1)
acc_MNB_1 = accuracy_score(y_test1, y_pred1)
print("Accuracy:", acc_MNB_1)
print(classification_report(y_test1, y_pred1))

Accuracy: 0.6674287891778603
              precision    recall  f1-score   support

           0       0.87      0.34      0.49      4924
           1       0.62      0.95      0.75      5573

    accuracy                           0.67     10497
   macro avg       0.75      0.65      0.62     10497
weighted avg       0.74      0.67      0.63     10497



In [45]:
# train: twitter, test: reddit
pip.fit(X_train2, y_train2)
y_pred2 = pip.predict(X_test2)
acc_MNB_2 = accuracy_score(y_test2, y_pred2)
print("Accuracy:", acc_MNB_2)
print(classification_report(y_test2, y_pred2))

Accuracy: 0.8000697431128676
              precision    recall  f1-score   support

           0       0.88      0.53      0.66      3189
           1       0.78      0.96      0.86      5414

    accuracy                           0.80      8603
   macro avg       0.83      0.75      0.76      8603
weighted avg       0.81      0.80      0.79      8603



In [46]:
# train: 80%, test: 20%
pip.fit(X_train3, y_train3)
y_pred3 = pip.predict(X_test3)
acc_MNB_3 = accuracy_score(y_test3, y_pred3)
print("Accuracy:", acc_MNB_3)
print(classification_report(y_test3, y_pred3))

Accuracy: 0.8109947643979057
              precision    recall  f1-score   support

           0       0.91      0.62      0.74      1630
           1       0.77      0.96      0.85      2190

    accuracy                           0.81      3820
   macro avg       0.84      0.79      0.79      3820
weighted avg       0.83      0.81      0.80      3820



In [47]:
print("Accuracies for Multinominal Naive Bayesian:")
print("train: reddit, test: twitter, accuracy: ", round(acc_MNB_1, 3))
print("train: twitter, test: reddit, accuracy: ", round(acc_MNB_2, 3))
print("train: 80%, test: 20%, accuracy: ", round(acc_MNB_3, 3))

Accuracies for Multinominal Naive Bayesian:
train: reddit, test: twitter, accuracy:  0.667
train: twitter, test: reddit, accuracy:  0.8
train: 80%, test: 20%, accuracy:  0.811


- Third dataset split peformed the best, receiving highest accuracy and f1-score (or tie with second model)
- Twitter data as a dataset performed significantly better than reddit data, suggesting the quality of twitter data is better
    - Twitter data generalizes better to new observations
    - Twitter data itself is almost as good in quality than 80% of all datasets, suggesting twitter data could be used as it's own in model training


# SVM and Logstic Regression

### Compare models and two embeddings: TF-IDF and Doc2Vec
- TF-IDF creates matrix of tf-idf features (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- Doc2Vec learns embeddings of words, numerical vectors (https://www.geeksforgeeks.org/nlp/doc2vec-in-nlp/)

### Training and testing data
- Use same data splits as in NBC

In [48]:
# TF-IDF + Logistic regression
TF_IDF_log_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', LogisticRegression())
])

In [49]:
# TF-IDF + SVC
TF_IDF_svm_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', svm.SVC())
])

### Doc2Vec processing 
- Vectorize words only once, because it takes over one minute

In [55]:
def tokenize(text):
    return str(text).lower().split()

def doc2vec(X_train, X_test):
    tagged_train = [TaggedDocument(words=tokenize(doc), tags=[str(i)]) for i, doc in enumerate(X_train)]

    # train Doc2Vev
    model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
    model.build_vocab(tagged_train)
    model.train(tagged_train, total_examples=model.corpus_count, epochs=model.epochs)

    # get document vectors
    train_vec = [model.dv[str(i)] for i in range(len(tagged_train))]
    test_vec = [model.infer_vector(tokenize(doc)) for doc in X_test]

    return np.array(train_vec), np.array(test_vec)

In [None]:
X_train_1_d2v, X_test_1_d2v = doc2vec(X_train1.astype(str).tolist(), X_test1.astype(str).tolist())

(8603, 20)
(10497, 20)


In [57]:
X_train_2_d2v, X_test_2_d2v = doc2vec(X_train2.astype(str).tolist(), X_test2.astype(str).tolist())

In [58]:
X_train_3_d2v, X_test_3_d2v = doc2vec(X_train3.astype(str).tolist(), X_test3.astype(str).tolist())

### TF-IDF tokenizer

In [None]:
# train: reddit, test: twitter
TF_IDF_log_pip.fit(X_train1, y_train1)
y_pred1_1 = TF_IDF_log_pip.predict(X_test1)

TF_IDF_svm_pip.fit(X_train1, y_train1)
y_pred1_2 = TF_IDF_svm_pip.predict(X_test1)

Accuracy: 0.7376393255215776
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      4924
           1       0.74      0.79      0.76      5573

    accuracy                           0.74     10497
   macro avg       0.74      0.73      0.74     10497
weighted avg       0.74      0.74      0.74     10497

Accuracy: 0.7329713251405163
              precision    recall  f1-score   support

           0       0.72      0.71      0.71      4924
           1       0.75      0.75      0.75      5573

    accuracy                           0.73     10497
   macro avg       0.73      0.73      0.73     10497
weighted avg       0.73      0.73      0.73     10497



In [None]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test1, y_pred1_1))
print(classification_report(y_test1, y_pred1_1))

print("SVM")
print("Accuracy:", accuracy_score(y_test1, y_pred1_2))
print(classification_report(y_test1, y_pred1_2))

Logistic Regression
Accuracy: 0.7376393255215776
              precision    recall  f1-score   support

           0       0.74      0.68      0.71      4924
           1       0.74      0.79      0.76      5573

    accuracy                           0.74     10497
   macro avg       0.74      0.73      0.74     10497
weighted avg       0.74      0.74      0.74     10497

SVM
Accuracy: 0.7329713251405163
              precision    recall  f1-score   support

           0       0.72      0.71      0.71      4924
           1       0.75      0.75      0.75      5573

    accuracy                           0.73     10497
   macro avg       0.73      0.73      0.73     10497
weighted avg       0.73      0.73      0.73     10497



In [None]:
# train: twitter, test: reddit
TF_IDF_log_pip.fit(X_train2, y_train2)
y_pred2_1 = TF_IDF_log_pip.predict(X_test2)

TF_IDF_svm_pip.fit(X_train2, y_train2)
y_pred2_2 = TF_IDF_svm_pip.predict(X_test2)

In [None]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test2, y_pred2_1))
print(classification_report(y_test2, y_pred2_1))

print("SVM")
print("Accuracy:", accuracy_score(y_test2, y_pred2_2))
print(classification_report(y_test2, y_pred2_2))

Logistic Regression
Accuracy: 0.790305707311403
              precision    recall  f1-score   support

           0       0.72      0.71      0.71      3189
           1       0.83      0.84      0.83      5414

    accuracy                           0.79      8603
   macro avg       0.78      0.77      0.77      8603
weighted avg       0.79      0.79      0.79      8603

SVM
Accuracy: 0.8004184586772056
              precision    recall  f1-score   support

           0       0.78      0.64      0.70      3189
           1       0.81      0.90      0.85      5414

    accuracy                           0.80      8603
   macro avg       0.80      0.77      0.78      8603
weighted avg       0.80      0.80      0.80      8603



In [None]:
# train: 80&, test: 20%
TF_IDF_log_pip.fit(X_train3, y_train3)
y_pred3_1 = TF_IDF_log_pip.predict(X_test3)

TF_IDF_svm_pip.fit(X_train3, y_train3)
y_pred3_2 = TF_IDF_svm_pip.predict(X_test3)

In [None]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test3, y_pred3_1))
print(classification_report(y_test3, y_pred3_1))

print("SVM")
print("Accuracy:", accuracy_score(y_test3, y_pred3_2))
print(classification_report(y_test3, y_pred3_2))

Logistic Regression
Accuracy: 0.8646596858638743
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      1630
           1       0.86      0.92      0.89      2190

    accuracy                           0.86      3820
   macro avg       0.87      0.86      0.86      3820
weighted avg       0.87      0.86      0.86      3820

SVM
Accuracy: 0.8801047120418848
              precision    recall  f1-score   support

           0       0.89      0.82      0.85      1630
           1       0.88      0.92      0.90      2190

    accuracy                           0.88      3820
   macro avg       0.88      0.87      0.88      3820
weighted avg       0.88      0.88      0.88      3820



### Doc2Vec Tokenizer

In [83]:
log_reg_model = LogisticRegression()
SVC_model = svm.SVC()

log_reg_model_2 = LogisticRegression()
SVC_model_2 = svm.SVC()

log_reg_model_3 = LogisticRegression()
SVC_model_3 = svm.SVC()

### Training data: Reddit, testing data: Twitter

In [80]:
log_reg_model.fit(X_train_1_d2v, y_train1)
y_pred1_lr_d2v = log_reg_model.predict(X_test_1_d2v) 

SVC_model.fit(X_train_1_d2v, y_train1)
y_pred1_svc_d2v = SVC_model.predict(X_test_1_d2v) 

In [81]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test1, y_pred1_lr_d2v))
print(classification_report(y_test1, y_pred1_lr_d2v))

print("SVM")
print("Accuracy:", accuracy_score(y_test1, y_pred1_svc_d2v))
print(classification_report(y_test1, y_pred1_svc_d2v))

Logistic Regression
Accuracy: 0.6080784986186529
              precision    recall  f1-score   support

           0       0.56      0.76      0.65      4924
           1       0.69      0.47      0.56      5573

    accuracy                           0.61     10497
   macro avg       0.63      0.62      0.60     10497
weighted avg       0.63      0.61      0.60     10497

SVM
Accuracy: 0.5932171096503763
              precision    recall  f1-score   support

           0       0.54      0.83      0.66      4924
           1       0.72      0.38      0.50      5573

    accuracy                           0.59     10497
   macro avg       0.63      0.61      0.58     10497
weighted avg       0.64      0.59      0.57     10497



### Training data: Twitter, testing data: Reddit

In [84]:
log_reg_model_2.fit(X_train_2_d2v, y_train2)
y_pred2_lr_d2v = log_reg_model_2.predict(X_test_2_d2v) 

SVC_model_2.fit(X_train_2_d2v, y_train2)
y_pred2_svc_d2v = SVC_model_2.predict(X_test_2_d2v) 

In [85]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test2, y_pred2_lr_d2v))
print(classification_report(y_test2, y_pred2_lr_d2v))

print("SVM")
print("Accuracy:", accuracy_score(y_test2, y_pred2_svc_d2v))
print(classification_report(y_test2, y_pred2_svc_d2v))

Logistic Regression
Accuracy: 0.7636870859002673
              precision    recall  f1-score   support

           0       0.73      0.57      0.64      3189
           1       0.78      0.88      0.82      5414

    accuracy                           0.76      8603
   macro avg       0.75      0.72      0.73      8603
weighted avg       0.76      0.76      0.76      8603

SVM
Accuracy: 0.5575961873764965
              precision    recall  f1-score   support

           0       0.43      0.60      0.50      3189
           1       0.69      0.54      0.60      5414

    accuracy                           0.56      8603
   macro avg       0.56      0.57      0.55      8603
weighted avg       0.59      0.56      0.57      8603



### Training data: 80%, testing data: 20%

In [86]:
log_reg_model_3.fit(X_train_3_d2v, y_train3)
y_pred3_lr_d2v = log_reg_model_3.predict(X_test_3_d2v) 

SVC_model_3.fit(X_train_3_d2v, y_train3)
y_pred3_svc_d2v = SVC_model_3.predict(X_test_3_d2v) 

In [87]:
print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test3, y_pred3_lr_d2v))
print(classification_report(y_test3, y_pred3_lr_d2v))

print("SVM")
print("Accuracy:", accuracy_score(y_test3, y_pred3_svc_d2v))
print(classification_report(y_test3, y_pred3_svc_d2v))

Logistic Regression
Accuracy: 0.7416230366492147
              precision    recall  f1-score   support

           0       0.73      0.62      0.67      1630
           1       0.75      0.83      0.79      2190

    accuracy                           0.74      3820
   macro avg       0.74      0.73      0.73      3820
weighted avg       0.74      0.74      0.74      3820

SVM
Accuracy: 0.7863874345549738
              precision    recall  f1-score   support

           0       0.79      0.68      0.73      1630
           1       0.78      0.87      0.82      2190

    accuracy                           0.79      3820
   macro avg       0.79      0.77      0.78      3820
weighted avg       0.79      0.79      0.78      3820

