In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec,\
    TaggedDocument
from nltk.tokenize import word_tokenize


### Get the data

In [3]:
# Read the four cleaned CSV exports: two Reddit files and two Twitter files.
reddit_combi_df = pd.read_csv("Reddit_Combi_cleaned.csv")
reddit_title_df = pd.read_csv("Reddit_Title_cleaned.csv")
twitter_full_df = pd.read_csv("Twitter_Full_cleaned.csv")
twitter_non_advert = pd.read_csv("Twitter_Non-Advert_cleaned.csv")

# Later cells address the frames by position, so this order is significant:
# 0 = Reddit combined, 1 = Reddit titles, 2 = Twitter full, 3 = Twitter non-advert.
df_files = [
    reddit_combi_df,
    reddit_title_df,
    twitter_full_df,
    twitter_non_advert,
]

In [4]:
# Inspect the column names of each loaded frame to see which text fields exist.
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'Body_Title', 'label'], dtype='object')
Index(['title', 'label'], dtype='object')
Index(['text', 'hashtags', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


In [5]:
# Bring the lone mixed-case column in line with the lower-case names used elsewhere.
# rename() returns a new frame, which replaces the entry at position 0 of the list.
df_files[0] = df_files[0].rename({"Body_Title": "body_title"}, axis="columns")
df_files[0].columns

Index(['title', 'body', 'body_title', 'label'], dtype='object')

In [6]:
# Build a single "full_text" feature per frame by joining its text columns
# with spaces. Missing values are replaced with "" *before* any string
# conversion so that NaN never ends up in the text as the literal token "nan".
# NOTE(review): the preview of df_files[0] suggests "body_title" already holds
# title + body, so the Reddit text appears twice in "full_text" — confirm
# whether that duplication is intended.
df_files[0]["full_text"] = (
    df_files[0]["title"].fillna("") + " " +
    df_files[0]["body"].fillna("") + " " +
    df_files[0]["body_title"].fillna("") + " "
)

df_files[1]["full_text"] = (
    df_files[1]["title"].fillna("") + " "
)

df_files[2]["full_text"] = (
    df_files[2]["text"].fillna("") + " " +
    # fillna must come before astype(str): astype(str) can turn NaN into the
    # string "nan", after which fillna has nothing left to fill.
    df_files[2]["hashtags"].fillna("").astype(str)
)

df_files[3]["full_text"] = (
    df_files[3]["text"].fillna("") + " "
)


In [7]:
# Confirm that every frame now carries a "full_text" column.
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'body_title', 'label', 'full_text'], dtype='object')
Index(['title', 'label', 'full_text'], dtype='object')
Index(['text', 'hashtags', 'label', 'full_text'], dtype='object')
Index(['text', 'label', 'full_text'], dtype='object')


In [12]:
# Spot-check the first rows of the combined Reddit frame to verify "full_text".

df_files[0].head()

Unnamed: 0,title,body,body_title,label,full_text
0,envy to other is swallowing me,"im from developingcountry, indonesia , and for...",envy to other is swallowing me im from develop...,1,envy to other is swallowing me im from develop...
1,nothin outta the ordinary. paradise. job stres...,um hello .well many can relate im sure. after ...,nothin outta the ordinary. paradise. job stres...,1,nothin outta the ordinary. paradise. job stres...
2,almost 49 and the chasm of emptiness has never...,i’ve been diagnosed severe bi polar where you ...,almost 49 and the chasm of emptiness has never...,1,almost 49 and the chasm of emptiness has never...
3,i’m happy again,"after my closest friend left me in april, i ha...",i’m happy again after my closest friend left m...,0,i’m happy again after my closest friend left m...
4,is it possible to recover from such a traumati...,"i am only 15, and yet i feel my life is alread...",is it possible to recover from such a traumati...,1,is it possible to recover from such a traumati...


### Merge dataframes, so they contain only one feature column "full_text"

In [14]:
# Derive reduced frames that keep only "label" and "full_text".
# drop() returns a new DataFrame, so the frames in df_files are left untouched.
reddit1 = df_files[0].drop(columns=["title", "body", "body_title"])
reddit2 = df_files[1].drop(columns=["title"])
twitter1 = df_files[2].drop(columns=["text", "hashtags"])
twitter2 = df_files[3].drop(columns=["text"])

# Same ordering as df_files: Reddit frames first, then Twitter frames.
df_files_new = [reddit1, reddit2, twitter1, twitter2]


In [20]:
# Sanity check: print the row count and remaining columns of each reduced frame.
for df in df_files_new:
    print(len(df))
    print(df.columns)

3123
Index(['label', 'full_text'], dtype='object')
5480
Index(['label', 'full_text'], dtype='object')
8525
Index(['label', 'full_text'], dtype='object')
1972
Index(['label', 'full_text'], dtype='object')


## Multinomial Naive Bayes

- Selected as a model because it is widely used in text classification tasks (https://towardsdatascience.com/multinomial-naive-bayes-for-documents-classification-and-natural-language-processing-nlp-e08cc848ce6/)

## Training data

- Three models are trained, using the following data splits:
    - training: reddit, test and validation: twitter
        - how different datasets generalize
    - training: twitter, test and validation: reddit
        - how different datasets generalize
    - training: 80% of all datasets, test and validation: 20% of all datasets
        - typical ML data split

In [27]:
# Build the three train/test configurations described above.
# Each source combination is concatenated once and then sliced into X / y.
# NOTE(review): if the two Reddit files (or the two Twitter files) share rows,
# the random split below can put the same text in both train and test — confirm.
reddit_all = pd.concat([reddit1, reddit2])
twitter_all = pd.concat([twitter1, twitter2])
all_sources = pd.concat([reddit1, reddit2, twitter1, twitter2])

# 1) train on Reddit, evaluate on Twitter
X_train1, y_train1 = reddit_all["full_text"], reddit_all["label"]
X_test1, y_test1 = twitter_all["full_text"], twitter_all["label"]

# 2) train on Twitter, evaluate on Reddit
X_train2, y_train2 = twitter_all["full_text"], twitter_all["label"]
X_test2, y_test2 = reddit_all["full_text"], reddit_all["label"]

# 3) random 80/20 split over everything; the fixed seed keeps it reproducible
X_3, y_3 = all_sources["full_text"], all_sources["label"]
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X_3,
    y_3,
    test_size=0.2,
    random_state=42,
)

In [19]:
# check some lengths
# NOTE(review): the recorded output of 13370 for X_train3 does not match an 80%
# split of 8603 + 10497 = 19100 rows (which would be 15280); it looks like a
# stale output from an earlier run with a different test_size — re-run to confirm.
print(len(X_train1))
print(len(X_test1))
print(len(X_train3))

8603
10497
13370


## Train the three multinomial NB models

In [None]:
# Pipeline that vectorizes raw text and trains the classifier in one step
# TfidfVectorizer converts a collection of raw documents into a matrix of TF-IDF features, used in text classification: https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # drop common English stop words
        ngram_range=(1,2)        # use unigrams and bigrams (this enlarges the vocabulary)
    )),
    ('nb', MultinomialNB())
])

In [None]:
# Scenario 1: fit on Reddit, score on Twitter (cross-platform generalisation).
# fit() returns the fitted pipeline, so prediction can be chained onto it.
y_pred1 = pip.fit(X_train1, y_train1).predict(X_test1)
print("Accuracy:", accuracy_score(y_true=y_test1, y_pred=y_pred1))
print(classification_report(y_true=y_test1, y_pred=y_pred1))

Accuracy: 0.6658092788415738
              precision    recall  f1-score   support

           0       0.87      0.34      0.49      4924
           1       0.62      0.95      0.75      5573

    accuracy                           0.67     10497
   macro avg       0.74      0.65      0.62     10497
weighted avg       0.74      0.67      0.63     10497



In [None]:
# Scenario 2: fit on Twitter, score on Reddit (cross-platform generalisation).
# Calling fit() again re-trains the same pipeline object from scratch.
y_pred2 = pip.fit(X_train2, y_train2).predict(X_test2)
print("Accuracy:", accuracy_score(y_true=y_test2, y_pred=y_pred2))
print(classification_report(y_true=y_test2, y_pred=y_pred2))

Accuracy: 0.7998372660699756
              precision    recall  f1-score   support

           0       0.87      0.54      0.67      3189
           1       0.78      0.95      0.86      5414

    accuracy                           0.80      8603
   macro avg       0.83      0.75      0.76      8603
weighted avg       0.81      0.80      0.79      8603



In [None]:
# Scenario 3: fit on the random 80% split, score on the held-out 20%.
y_pred3 = pip.fit(X_train3, y_train3).predict(X_test3)
print("Accuracy:", accuracy_score(y_true=y_test3, y_pred=y_pred3))
print(classification_report(y_true=y_test3, y_pred=y_pred3))

Accuracy: 0.8115183246073299
              precision    recall  f1-score   support

           0       0.91      0.62      0.74      1630
           1       0.77      0.96      0.85      2190

    accuracy                           0.81      3820
   macro avg       0.84      0.79      0.79      3820
weighted avg       0.83      0.81      0.80      3820



- The third data split performed best, achieving the highest accuracy and F1-score (roughly tied with the second model)
- The model trained on Twitter data performed significantly better than the one trained on Reddit data, suggesting that the Twitter data is of higher quality
    - Twitter data generalizes better to new observations
    - Twitter data on its own is almost as good as 80% of all datasets combined, suggesting that Twitter data could be used on its own for model training


# SVM and Logistic Regression

### Compare models and two embeddings: TF-IDF and Doc2Vec
- TF-IDF creates matrix of tf-idf features (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- Doc2Vec learns a fixed-length numerical vector (embedding) for each document, alongside word embeddings (https://www.geeksforgeeks.org/nlp/doc2vec-in-nlp/)

### Training and testing data
- Use same data splits as in NBC

In [33]:
# TF-IDF + Logistic regression
# Same vectorizer settings as the Naive Bayes pipeline, with a different classifier.
TF_IDF_log_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # drop common English stop words
        ngram_range=(1,2)        # use unigrams and bigrams (this enlarges the vocabulary)
    )),
    ('nb', LogisticRegression())    # NOTE(review): step is labelled 'nb' but holds LogisticRegression
])

In [32]:
# TF-IDF + SVM
# Same vectorizer settings as the Naive Bayes pipeline, with a different classifier.
TF_IDF_svm_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # drop common English stop words
        ngram_range=(1,2)        # use unigrams and bigrams (this enlarges the vocabulary)
    )),
    ('nb', svm.SVC())    # NOTE(review): step is labelled 'nb' but holds an SVC
])

### Doc2Vec processing 

In [None]:
def doc2vec(X, y, model):
    tagged_train = [TaggedDocument(words=word_tokenize(doc.lower()),
                                  tages=[str(i)]) for i,
                                  doc in enumerate(X)]