# Contents
- Download data
- Create full_data columns
- Create train and datasets
- Train models
    - Multinomial Naive Bayesian
    - Logistic Regression and SVM
    - Transform

In [125]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from gensim.models.doc2vec import Doc2Vec,\
    TaggedDocument
from nltk.tokenize import word_tokenize


In [126]:
is_train = False

### Get the data

In [127]:
reddit_combi_df = pd.read_csv(
    "cleaned_data/Reddit_Combi_cleaned.csv",   
    )

reddit_title_df = pd.read_csv(    
    "cleaned_data/Reddit_Title_cleaned.csv",    
)

twitter_full_df = pd.read_csv(
    "cleaned_data/Twitter_Full_cleaned.csv",  
    )

twitter_non_advert = pd.read_csv(
    "cleaned_data/Twitter_Non-Advert_cleaned.csv",
    )

df_files = [reddit_combi_df, reddit_title_df, twitter_full_df, twitter_non_advert]

In [128]:
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'Body_Title', 'label'], dtype='object')
Index(['title', 'label'], dtype='object')
Index(['text', 'hashtags', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


In [129]:
# Rename column name for consistency
df_files[0] = df_files[0].rename(columns={"Body_Title": "body_title"})
df_files[0].columns

Index(['title', 'body', 'body_title', 'label'], dtype='object')

In [130]:
# merge columns in dataframed containing text
df_files[0]["full_text"] = (
    df_files[0]["title"].fillna("") + " " +
    df_files[0]["body"].fillna("") + " " +
    df_files[0]["body_title"].fillna("") + " " 
)

df_files[1]["full_text"] = (
    df_files[1]["title"].fillna("") + " "
)

df_files[2]["full_text"] = (
    df_files[2]["text"].fillna("") + " " +
    df_files[2]["hashtags"].astype(str).fillna("")
)

df_files[3]["full_text"] = (
    df_files[3]["text"].fillna("") + " " 
)


In [131]:
# clean hashtag symbols from full text, hashtags should only be in Twitter full, but clean from all to maintain consistency
def clean_text(text):
    text = str(text) 
    text = re.sub(r"[\[\]']", "", text)  
    text = text.replace("#", "") 
    return text

for df in df_files:
    df["full_text"] = df["full_text"].apply(clean_text)

In [132]:
for df in df_files:
    print(df.columns)

Index(['title', 'body', 'body_title', 'label', 'full_text'], dtype='object')
Index(['title', 'label', 'full_text'], dtype='object')
Index(['text', 'hashtags', 'label', 'full_text'], dtype='object')
Index(['text', 'label', 'full_text'], dtype='object')


In [133]:
# check full_text is ok

df_files[0].head()

Unnamed: 0,title,body,body_title,label,full_text
0,envy to other is swallowing me,"im from developingcountry, indonesia , and for...",envy to other is swallowing me im from develop...,1,envy to other is swallowing me im from develop...
1,nothin outta the ordinary. paradise. job stres...,um hello .well many can relate im sure. after ...,nothin outta the ordinary. paradise. job stres...,1,nothin outta the ordinary. paradise. job stres...
2,almost 49 and the chasm of emptiness has never...,i’ve been diagnosed severe bi polar where you ...,almost 49 and the chasm of emptiness has never...,1,almost 49 and the chasm of emptiness has never...
3,i’m happy again,"after my closest friend left me in april, i ha...",i’m happy again after my closest friend left m...,0,i’m happy again after my closest friend left m...
4,is it possible to recover from such a traumati...,"i am only 15, and yet i feel my life is alread...",is it possible to recover from such a traumati...,1,is it possible to recover from such a traumati...


### Merge dataframes, so they contain only one feature column "full_text"

In [134]:
# create new dataframed containing only full_text and label for all dataframes

reddit1 = df_files[0].copy()
reddit2 = df_files[1].copy()
twitter1 = df_files[2].copy()
twitter2 = df_files[3].copy()

reddit1 = reddit1.drop(columns=['title', 'body', 'body_title'])
reddit2 = reddit2.drop(columns=['title'])
twitter1 = twitter1.drop(columns=['text', 'hashtags'])
twitter2 = twitter2.drop(columns=['text'])

df_files_new = [reddit1, reddit2, twitter1, twitter2]


In [135]:
for df in df_files_new:
    print(len(df))
    print(df.columns)

3123
Index(['label', 'full_text'], dtype='object')
5480
Index(['label', 'full_text'], dtype='object')
8525
Index(['label', 'full_text'], dtype='object')
1972
Index(['label', 'full_text'], dtype='object')


In [136]:
df_files_new[0].head()

Unnamed: 0,label,full_text
0,1,envy to other is swallowing me im from develop...
1,1,nothin outta the ordinary. paradise. job stres...
2,1,almost 49 and the chasm of emptiness has never...
3,0,i’m happy again after my closest friend left m...
4,1,is it possible to recover from such a traumati...


In [190]:
# Create dataframe to summarize accuracies
accuracy_df = pd.DataFrame(columns=[
    "dataset_split",   
    "model",          
    "representation",  
    "accuracy"         
])

## Training data

- Three models are trained and data splits are the following
    - training: reddit, test and validation: twitter
        - how different datasets generalize
    - training: twitter, test and validation: reddit
        - how different datasets generalize
    - training: 80% of all datasets, test and validation: 20% of all datasets
        - typical ML data split

In [138]:
# Split data as above

X_train1 = pd.concat([reddit1, reddit2])["full_text"]
y_train1 = pd.concat([reddit1, reddit2])["label"]
X_test1 = pd.concat([twitter1, twitter2])["full_text"]
y_test1 = pd.concat([twitter1, twitter2])["label"]

X_train2 = pd.concat([twitter1, twitter2])["full_text"]
y_train2 = pd.concat([twitter1, twitter2])["label"]
X_test2 = pd.concat([reddit1, reddit2])["full_text"]
y_test2 = pd.concat([reddit1, reddit2])["label"]

X_3 = pd.concat([reddit1, reddit2, twitter1, twitter2])["full_text"]
y_3 = pd.concat([reddit1, reddit2, twitter1, twitter2])["label"]

X_train3, X_test3, y_train3, y_test3 = train_test_split(X_3, y_3, test_size=0.2, random_state=42)

In [139]:
# check some lengths
print(len(X_train1))
print(len(X_test1))
print(len(X_train3))

8603
10497
15280


## Multinomial Naive Bayesian

- Is selected as a model, because it's widely used in text classification tasks (https://towardsdatascience.com/multinomial-naive-bayes-for-documents-classification-and-natural-language-processing-nlp-e08cc848ce6/)


In [140]:
# Pipeline for embedding + training
# TfidfVectorizer collects a document of TF-IDF features, used in text classification: https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html#sphx-glr-auto-examples-text-plot-document-classification-20newsgroups-py

pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', MultinomialNB())
])

In [None]:
# train: reddit, test: twitter
pip.fit(X_train1, y_train1)
y_pred1 = pip.predict(X_test1)
acc_MNB_1 = accuracy_score(y_test1, y_pred1)
print("Accuracy:", acc_MNB_1)
print(classification_report(y_test1, y_pred1))


Accuracy: 0.6674287891778603
              precision    recall  f1-score   support

           0       0.87      0.34      0.49      4924
           1       0.62      0.95      0.75      5573

    accuracy                           0.67     10497
   macro avg       0.75      0.65      0.62     10497
weighted avg       0.74      0.67      0.63     10497



In [191]:
accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Reddit / test: Twitter",
        "model": "MultinomialNB",
        "representation": "TF-IDF",
        "accuracy": acc_MNB_1
    }])
], ignore_index=True)

  accuracy_df = pd.concat([


In [None]:
# train: twitter, test: reddit
pip.fit(X_train2, y_train2)
y_pred2 = pip.predict(X_test2)
acc_MNB_2 = accuracy_score(y_test2, y_pred2)
print("Accuracy:", acc_MNB_2)
print(classification_report(y_test2, y_pred2))



Accuracy: 0.8000697431128676
              precision    recall  f1-score   support

           0       0.88      0.53      0.66      3189
           1       0.78      0.96      0.86      5414

    accuracy                           0.80      8603
   macro avg       0.83      0.75      0.76      8603
weighted avg       0.81      0.80      0.79      8603



In [193]:
accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Twitter / test: Reddit",
        "model": "MultinomialNB",
        "representation": "TF-IDF",
        "accuracy": acc_MNB_2
    }])
], ignore_index=True)


In [None]:
# train: 80%, test: 20%
pip.fit(X_train3, y_train3)
y_pred3 = pip.predict(X_test3)
acc_MNB_3 = accuracy_score(y_test3, y_pred3)
print("Accuracy:", acc_MNB_3)
print(classification_report(y_test3, y_pred3))


Accuracy: 0.8109947643979057
              precision    recall  f1-score   support

           0       0.91      0.62      0.74      1630
           1       0.77      0.96      0.85      2190

    accuracy                           0.81      3820
   macro avg       0.84      0.79      0.79      3820
weighted avg       0.83      0.81      0.80      3820



In [194]:
accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "MultinomialNB",
        "representation": "TF-IDF",
        "accuracy": acc_MNB_3
    }])
], ignore_index=True)


In [195]:
accuracy_df

Unnamed: 0,dataset_split,model,representation,accuracy
0,train: Reddit / test: Twitter,MultinomialNB,TF-IDF,0.667429
1,train: Twitter / test: Reddit,MultinomialNB,TF-IDF,0.80007
2,train/test 80/20 split,MultinomialNB,TF-IDF,0.810995


- Third dataset split peformed the best, receiving highest accuracy and f1-score (or tie with second model)
- Twitter data as a dataset performed significantly better than reddit data, suggesting the quality of twitter data is better
    - Twitter data generalizes better to new observations
    - Twitter data itself is almost as good in quality than 80% of all datasets, suggesting twitter data could be used as it's own in model training


# SVM and Logstic Regression

### Compare models and two embeddings: TF-IDF and Doc2Vec
- TF-IDF creates matrix of tf-idf features (https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
- Doc2Vec learns embeddings of words, numerical vectors (https://www.geeksforgeeks.org/nlp/doc2vec-in-nlp/)

### Training and testing data
- Use same data splits as in NBC

In [145]:
# TF-IDF + Logistic regression
TF_IDF_log_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', LogisticRegression())
])

In [146]:
# TF-IDF + SVC
TF_IDF_svm_pip = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',       # remove short, meaningless words
        ngram_range=(1,2)        # to make dict smaller
    )),
    ('nb', svm.SVC())
])

### Doc2Vec processing 
- Vectorize words only once, because it takes over one minute

In [147]:
def tokenize(text):
    return str(text).lower().split()

def doc2vec(X_train, X_test):
    tagged_train = [TaggedDocument(words=tokenize(doc), tags=[str(i)]) for i, doc in enumerate(X_train)]

    # train Doc2Vev
    model = Doc2Vec(vector_size=20, min_count=2, epochs=50)
    model.build_vocab(tagged_train)
    model.train(tagged_train, total_examples=model.corpus_count, epochs=model.epochs)

    # get document vectors
    train_vec = [model.dv[str(i)] for i in range(len(tagged_train))]
    test_vec = [model.infer_vector(tokenize(doc)) for doc in X_test]

    return np.array(train_vec), np.array(test_vec)

In [148]:
X_train_1_d2v, X_test_1_d2v = doc2vec(X_train1.astype(str).tolist(), X_test1.astype(str).tolist())

In [149]:
X_train_2_d2v, X_test_2_d2v = doc2vec(X_train2.astype(str).tolist(), X_test2.astype(str).tolist())

In [150]:
X_train_3_d2v, X_test_3_d2v = doc2vec(X_train3.astype(str).tolist(), X_test3.astype(str).tolist())

### TF-IDF tokenizer

In [151]:
# train: reddit, test: twitter
TF_IDF_log_pip.fit(X_train1, y_train1)
y_pred1_1 = TF_IDF_log_pip.predict(X_test1)

TF_IDF_svm_pip.fit(X_train1, y_train1)
y_pred1_2 = TF_IDF_svm_pip.predict(X_test1)

In [196]:
# Logistic Regression
acc_lr = accuracy_score(y_test1, y_pred1_1)
print("Logistic Regression")
print("Accuracy:", acc_lr)
print(classification_report(y_test1, y_pred1_1))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Reddit / test: Twitter",
        "model": "Logistic Regression",
        "representation": "TF-IDF",
        "accuracy": acc_lr
    }])
], ignore_index=True)

# SVM
acc_svc = accuracy_score(y_test1, y_pred1_2)
print("SVM")
print("Accuracy:", acc_svc)
print(classification_report(y_test1, y_pred1_2))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Reddit / test: Twitter",
        "model": "SVM",
        "representation": "TF-IDF",
        "accuracy": acc_svc
    }])
], ignore_index=True)


Logistic Regression
Accuracy: 0.7370677336381823
              precision    recall  f1-score   support

           0       0.74      0.69      0.71      4924
           1       0.74      0.78      0.76      5573

    accuracy                           0.74     10497
   macro avg       0.74      0.73      0.73     10497
weighted avg       0.74      0.74      0.74     10497

SVM
Accuracy: 0.7304944269791369
              precision    recall  f1-score   support

           0       0.71      0.71      0.71      4924
           1       0.75      0.75      0.75      5573

    accuracy                           0.73     10497
   macro avg       0.73      0.73      0.73     10497
weighted avg       0.73      0.73      0.73     10497



In [153]:
# train: twitter, test: reddit
TF_IDF_log_pip.fit(X_train2, y_train2)
y_pred2_1 = TF_IDF_log_pip.predict(X_test2)

TF_IDF_svm_pip.fit(X_train2, y_train2)
y_pred2_2 = TF_IDF_svm_pip.predict(X_test2)

In [197]:
# Logistic Regression
acc_lr2 = accuracy_score(y_test2, y_pred2_1)
print("Logistic Regression")
print("Accuracy:", acc_lr2)
print(classification_report(y_test2, y_pred2_1))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Twitter / test: Reddit",
        "model": "Logistic Regression",
        "representation": "TF-IDF",
        "accuracy": acc_lr2
    }])
], ignore_index=True)

# SVM
acc_svc2 = accuracy_score(y_test2, y_pred2_2)
print("SVM")
print("Accuracy:", acc_svc2)
print(classification_report(y_test2, y_pred2_2))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Twitter / test: Reddit",
        "model": "SVM",
        "representation": "TF-IDF",
        "accuracy": acc_svc2
    }])
], ignore_index=True)

Logistic Regression
Accuracy: 0.7927467162617692
              precision    recall  f1-score   support

           0       0.73      0.71      0.72      3189
           1       0.83      0.84      0.84      5414

    accuracy                           0.79      8603
   macro avg       0.78      0.77      0.78      8603
weighted avg       0.79      0.79      0.79      8603

SVM
Accuracy: 0.8029757061490178
              precision    recall  f1-score   support

           0       0.79      0.64      0.71      3189
           1       0.81      0.90      0.85      5414

    accuracy                           0.80      8603
   macro avg       0.80      0.77      0.78      8603
weighted avg       0.80      0.80      0.80      8603



In [155]:
# train: 80&, test: 20%
TF_IDF_log_pip.fit(X_train3, y_train3)
y_pred3_1 = TF_IDF_log_pip.predict(X_test3)

TF_IDF_svm_pip.fit(X_train3, y_train3)
y_pred3_2 = TF_IDF_svm_pip.predict(X_test3)

In [198]:
# Logistic Regression
acc_lr3 = accuracy_score(y_test3, y_pred3_1)
print("Logistic Regression")
print("Accuracy:", acc_lr3)
print(classification_report(y_test3, y_pred3_1))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "Logistic Regression",
        "representation": "TF-IDF",
        "accuracy": acc_lr3
    }])
], ignore_index=True)

# SVM
acc_svc3 = accuracy_score(y_test3, y_pred3_2)
print("SVM")
print("Accuracy:", acc_svc3)
print(classification_report(y_test3, y_pred3_2))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "SVM",
        "representation": "TF-IDF",
        "accuracy": acc_svc3
    }])
], ignore_index=True)


Logistic Regression
Accuracy: 0.8667539267015707
              precision    recall  f1-score   support

           0       0.88      0.79      0.84      1630
           1       0.86      0.92      0.89      2190

    accuracy                           0.87      3820
   macro avg       0.87      0.86      0.86      3820
weighted avg       0.87      0.87      0.87      3820

SVM
Accuracy: 0.881413612565445
              precision    recall  f1-score   support

           0       0.89      0.82      0.86      1630
           1       0.88      0.92      0.90      2190

    accuracy                           0.88      3820
   macro avg       0.88      0.87      0.88      3820
weighted avg       0.88      0.88      0.88      3820



In [199]:
accuracy_df

Unnamed: 0,dataset_split,model,representation,accuracy
0,train: Reddit / test: Twitter,MultinomialNB,TF-IDF,0.667429
1,train: Twitter / test: Reddit,MultinomialNB,TF-IDF,0.80007
2,train/test 80/20 split,MultinomialNB,TF-IDF,0.810995
3,train: Reddit / test: Twitter,Logistic Regression,TF-IDF,0.737068
4,train: Reddit / test: Twitter,SVM,TF-IDF,0.730494
5,train: Twitter / test: Reddit,Logistic Regression,TF-IDF,0.792747
6,train: Twitter / test: Reddit,SVM,TF-IDF,0.802976
7,train/test 80/20 split,Logistic Regression,TF-IDF,0.866754
8,train/test 80/20 split,SVM,TF-IDF,0.881414


## Doc2Vec Tokenizer

In [158]:
log_reg_model = LogisticRegression()
SVC_model = svm.SVC()

log_reg_model_2 = LogisticRegression()
SVC_model_2 = svm.SVC()

log_reg_model_3 = LogisticRegression()
SVC_model_3 = svm.SVC()

### Training data: Reddit, testing data: Twitter

In [159]:
log_reg_model.fit(X_train_1_d2v, y_train1)
y_pred1_lr_d2v = log_reg_model.predict(X_test_1_d2v) 

SVC_model.fit(X_train_1_d2v, y_train1)
y_pred1_svc_d2v = SVC_model.predict(X_test_1_d2v) 

In [200]:
# Logistic Regression
acc_lr_d2v1 = accuracy_score(y_test1, y_pred1_lr_d2v)
print("Logistic Regression")
print("Accuracy:", acc_lr_d2v1)
print(classification_report(y_test1, y_pred1_lr_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Reddit / test: Twitter",
        "model": "Logistic Regression",
        "representation": "Doc2Vec",
        "accuracy": acc_lr_d2v1
    }])
], ignore_index=True)

# SVM
acc_svc_d2v1 = accuracy_score(y_test1, y_pred1_svc_d2v)
print("SVM")
print("Accuracy:", acc_svc_d2v1)
print(classification_report(y_test1, y_pred1_svc_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Reddit / test: Twitter",
        "model": "SVM",
        "representation": "Doc2Vec",
        "accuracy": acc_svc_d2v1
    }])
], ignore_index=True)

Logistic Regression
Accuracy: 0.6133180908831095
              precision    recall  f1-score   support

           0       0.57      0.74      0.64      4924
           1       0.68      0.50      0.58      5573

    accuracy                           0.61     10497
   macro avg       0.63      0.62      0.61     10497
weighted avg       0.63      0.61      0.61     10497

SVM
Accuracy: 0.5997904163094218
              precision    recall  f1-score   support

           0       0.55      0.81      0.65      4924
           1       0.71      0.42      0.53      5573

    accuracy                           0.60     10497
   macro avg       0.63      0.61      0.59     10497
weighted avg       0.63      0.60      0.59     10497



### Training data: Twitter, testing data: Reddit

In [161]:
log_reg_model_2.fit(X_train_2_d2v, y_train2)
y_pred2_lr_d2v = log_reg_model_2.predict(X_test_2_d2v) 

SVC_model_2.fit(X_train_2_d2v, y_train2)
y_pred2_svc_d2v = SVC_model_2.predict(X_test_2_d2v) 

In [201]:
# Logistic Regression
acc_lr_d2v2 = accuracy_score(y_test2, y_pred2_lr_d2v)
print("Logistic Regression")
print("Accuracy:", acc_lr_d2v2)
print(classification_report(y_test2, y_pred2_lr_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Twitter / test: Reddit",
        "model": "Logistic Regression",
        "representation": "Doc2Vec",
        "accuracy": acc_lr_d2v2
    }])
], ignore_index=True)

# SVM
acc_svc_d2v2 = accuracy_score(y_test2, y_pred2_svc_d2v)
print("SVM")
print("Accuracy:", acc_svc_d2v2)
print(classification_report(y_test2, y_pred2_svc_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train: Twitter / test: Reddit",
        "model": "SVM",
        "representation": "Doc2Vec",
        "accuracy": acc_svc_d2v2
    }])
], ignore_index=True)

Logistic Regression
Accuracy: 0.7718237824014879
              precision    recall  f1-score   support

           0       0.73      0.61      0.67      3189
           1       0.79      0.87      0.83      5414

    accuracy                           0.77      8603
   macro avg       0.76      0.74      0.75      8603
weighted avg       0.77      0.77      0.77      8603

SVM
Accuracy: 0.5500406834825061
              precision    recall  f1-score   support

           0       0.43      0.64      0.51      3189
           1       0.70      0.50      0.58      5414

    accuracy                           0.55      8603
   macro avg       0.56      0.57      0.55      8603
weighted avg       0.60      0.55      0.56      8603



### Training data: 80%, testing data: 20%

In [163]:
log_reg_model_3.fit(X_train_3_d2v, y_train3)
y_pred3_lr_d2v = log_reg_model_3.predict(X_test_3_d2v) 

SVC_model_3.fit(X_train_3_d2v, y_train3)
y_pred3_svc_d2v = SVC_model_3.predict(X_test_3_d2v) 

In [202]:
# Logistic Regression
acc_lr_d2v3 = accuracy_score(y_test3, y_pred3_lr_d2v)
print("Logistic Regression")
print("Accuracy:", acc_lr_d2v3)
print(classification_report(y_test3, y_pred3_lr_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "Logistic Regression",
        "representation": "Doc2Vec",
        "accuracy": acc_lr_d2v3
    }])
], ignore_index=True)

# SVM
acc_svc_d2v3 = accuracy_score(y_test3, y_pred3_svc_d2v)
print("SVM")
print("Accuracy:", acc_svc_d2v3)
print(classification_report(y_test3, y_pred3_svc_d2v))

accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "SVM",
        "representation": "Doc2Vec",
        "accuracy": acc_svc_d2v3
    }])
], ignore_index=True)

Logistic Regression
Accuracy: 0.7473821989528796
              precision    recall  f1-score   support

           0       0.73      0.65      0.69      1630
           1       0.76      0.82      0.79      2190

    accuracy                           0.75      3820
   macro avg       0.74      0.73      0.74      3820
weighted avg       0.75      0.75      0.75      3820

SVM
Accuracy: 0.7832460732984293
              precision    recall  f1-score   support

           0       0.77      0.70      0.73      1630
           1       0.79      0.85      0.82      2190

    accuracy                           0.78      3820
   macro avg       0.78      0.77      0.78      3820
weighted avg       0.78      0.78      0.78      3820



In [203]:
accuracy_df

Unnamed: 0,dataset_split,model,representation,accuracy
0,train: Reddit / test: Twitter,MultinomialNB,TF-IDF,0.667429
1,train: Twitter / test: Reddit,MultinomialNB,TF-IDF,0.80007
2,train/test 80/20 split,MultinomialNB,TF-IDF,0.810995
3,train: Reddit / test: Twitter,Logistic Regression,TF-IDF,0.737068
4,train: Reddit / test: Twitter,SVM,TF-IDF,0.730494
5,train: Twitter / test: Reddit,Logistic Regression,TF-IDF,0.792747
6,train: Twitter / test: Reddit,SVM,TF-IDF,0.802976
7,train/test 80/20 split,Logistic Regression,TF-IDF,0.866754
8,train/test 80/20 split,SVM,TF-IDF,0.881414
9,train: Reddit / test: Twitter,Logistic Regression,Doc2Vec,0.613318


# Transformer

- https://huggingface.co/docs/transformers/tasks/sequence_classification

In [166]:
import torch
from datasets import Dataset, concatenate_datasets
import evaluate

In [167]:
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [168]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert/distilbert-base-uncased/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A4B5853710>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 91b440b7-231f-49a3-a26c-66aee21ed136)')' thrown while requesting HEAD https://huggingface.co/distilbert/distilbert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert/distilbert-base-uncased/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A49212DFD0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: d017e8d3-e70e-42eb-ba95-fe06ed56d8d9)')' thrown while requesting HEAD https://huggingface.co/distilbert/distilbert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert/distilbert-base-uncased/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A49213AF60>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: f695e00d-ae00-46ce-b8c7-ae345b4e2a88)')' thrown while requesting HEAD https:/

- set max_length = 128 to speed up training

In [169]:
def format_data(dataframe):
    dataframe = dataframe.rename(columns={'full_text': 'text'})
    dataset = Dataset.from_pandas(dataframe)
    return dataset

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

In [170]:
# Apply data formatting to train and test sets
datasets = []
for df in df_files_new:
    datasets.append(format_data(df))
    
# combine datasets and split into train and tests sets as before
full_dataset = concatenate_datasets(datasets)
full_dataset = full_dataset.shuffle(seed=42)
full_dataset = full_dataset.map(preprocess_function, batched=True)

dataset = full_dataset.train_test_split(test_size=0.2)
train_ds = dataset["train"]
test_ds = dataset["test"]

Map:   0%|          | 0/19100 [00:00<?, ? examples/s]

In [171]:
# ceck format
train_ds[0]

{'label': 1,
 'text': 'homework is taking up my life i am a student who was online for the first 4 months of school, and have been in school a week now. i get 6-8 hours of homework a night! i don’t know how to manage it on top of other stuff going on in my life! any advice? homework is taking up my life i am a student who was online for the first 4 months of school, and have been in school a week now. i get 6-8 hours of homework a night! i don’t know how to manage it on top of other stuff going on in my life! any advice? ',
 'input_ids': [101,
  19453,
  2003,
  2635,
  2039,
  2026,
  2166,
  1045,
  2572,
  1037,
  3076,
  2040,
  2001,
  3784,
  2005,
  1996,
  2034,
  1018,
  2706,
  1997,
  2082,
  1010,
  1998,
  2031,
  2042,
  1999,
  2082,
  1037,
  2733,
  2085,
  1012,
  1045,
  2131,
  1020,
  1011,
  1022,
  2847,
  1997,
  19453,
  1037,
  2305,
  999,
  1045,
  2123,
  1521,
  1056,
  2113,
  2129,
  2000,
  6133,
  2009,
  2006,
  2327,
  1997,
  2060,
  4933,
  2183,
 

In [172]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Evaluation

In [173]:
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

Using the latest cached version of the module from C:\Users\oonas\.cache\huggingface\modules\evaluate_modules\metrics\evaluate-metric--accuracy\f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Sun Nov 23 09:37:44 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


### Training

In [174]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [175]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert/distilbert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A4B829E4E0>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 55af2dcb-8266-4e94-8aa6-6d0fa4b4ef8b)')' thrown while requesting HEAD https://huggingface.co/distilbert/distilbert-base-uncased/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /distilbert/distilbert-base-uncased/resolve/main/config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001A4A82FEF90>: Failed to resolve \'huggingface.co\' ([Errno 11001] getaddrinfo failed)"))'), '(Request ID: 08f6d7e6-3d9a-4286-9b4e-49f8c95dc376)')' thrown while requesting HEAD https://huggingface.co/distilbert/dis

In [176]:
if is_train:
    training_args = TrainingArguments(
        output_dir="Transformer",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=1,
        weight_decay=0.0,
        #evaluation_strategy="no",           
        save_strategy="no",               
        logging_steps=50,                        
        report_to="none",                     
        push_to_hub=False,
        no_cuda=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

In [177]:
if is_train:
    trainer.model.save_pretrained("Transformer_backup")
    tokenizer.save_pretrained("Transformer_backup")

In [180]:
model = AutoModelForSequenceClassification.from_pretrained("Transformer_backup")
tokenizer = AutoTokenizer.from_pretrained("Transformer_backup")

new_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [181]:
eval_result = new_trainer.evaluate()
print("Accuracy:", eval_result["eval_accuracy"])


Accuracy: 0.9683246073298429


In [204]:
accuracy_df = pd.concat([
    accuracy_df,
    pd.DataFrame([{
        "dataset_split": "train/test 80/20 split",
        "model": "Transformer",
        "representation": "Tokenizer",
        "accuracy": eval_result["eval_accuracy"]
    }])
], ignore_index=True)

In [205]:
accuracy_df

Unnamed: 0,dataset_split,model,representation,accuracy
0,train: Reddit / test: Twitter,MultinomialNB,TF-IDF,0.667429
1,train: Twitter / test: Reddit,MultinomialNB,TF-IDF,0.80007
2,train/test 80/20 split,MultinomialNB,TF-IDF,0.810995
3,train: Reddit / test: Twitter,Logistic Regression,TF-IDF,0.737068
4,train: Reddit / test: Twitter,SVM,TF-IDF,0.730494
5,train: Twitter / test: Reddit,Logistic Regression,TF-IDF,0.792747
6,train: Twitter / test: Reddit,SVM,TF-IDF,0.802976
7,train/test 80/20 split,Logistic Regression,TF-IDF,0.866754
8,train/test 80/20 split,SVM,TF-IDF,0.881414
9,train: Reddit / test: Twitter,Logistic Regression,Doc2Vec,0.613318


In [206]:
accuracy_df.to_csv('accuracies.csv', index=False)