# Import Dependencies

In [1]:
import os
import json
import pandas as pd
from main_model.scripts.data_preprocessing import preprocess_data

# Directories

# Import the final dataset

In [2]:
final_df = pd.read_csv("data/final_dataset.csv")

  final_df = pd.read_csv("data/final_dataset.csv")


## Data Preprocessing

In [3]:
final_df = preprocess_data(final_df)

In [4]:
from sklearn.preprocessing import LabelEncoder

labels = final_df['rating']

label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

In [5]:
print(final_df)

                                                      url  \
0                               http://www.nfib-sbet.org/   
1       http://www.cq.com/doc/newsmakertranscripts-494...   
2       https://web.archive.org/web/20080204072132/htt...   
3       https://web.archive.org/web/20110811143753/htt...   
4       https://web.archive.org/web/20070820164107/htt...   
...                                                   ...   
129200  www.huffingtonpost.com/2012/09/11/september-11...   
129201  www.dailymail.co.uk/news/article-4915674/NASCA...   
129202  www.telegraph.co.uk/men/the-filter/7-signs-dav...   
129203  www.vanityfair.com/style/2016/09/ryan-gosling-...   
129204  www.lifeandstylemag.com/posts/jamie-foxx-katie...   

                                                    title category account  \
0             national federation of independent business      nan     nan   
1                             comments in fayetteville nc      nan     nan   
2       romney makes pitch hoping

## POS Tagging

In [6]:
from main_model.scripts.postagging import postagging

unigram_tagged, bigram_tagged, trigram_tagged = postagging(final_df)

[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\bneer\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bneer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Lemmatization

In [7]:
from main_model.scripts.lemmatization import RadixTree

dictionary = list(set(word for sentence in final_df['tokenized_title'] for word in sentence))
radix_tree = RadixTree()
for word in dictionary:
    radix_tree.insert(word)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bneer\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\bneer\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
from main_model.scripts.lemmatization import perform_lemmatization

unigram_lemmatization_results = perform_lemmatization(unigram_tagged, dictionary, radix_tree)
bigram_lemmatization_results = perform_lemmatization(bigram_tagged, dictionary, radix_tree)
trigram_lemmatization_results = perform_lemmatization(trigram_tagged, dictionary, radix_tree)

In [9]:
from main_model.scripts.feature_extraction import tfidf_vectorize, bow_vectorize, ngrams_vectorize

lemmatization_results = {
    "unigram": unigram_lemmatization_results,
    "bigram": bigram_lemmatization_results,
    "trigram": trigram_lemmatization_results,
}

vectorizers = {
    "tfidf": tfidf_vectorize,
    "bow": bow_vectorize,
    "ngrams": lambda s: ngrams_vectorize(s, n=2),
}

## Feature Extraction

In [10]:
from main_model.scripts.feature_extraction import train_and_evaluate

results = {}

for tagger_name, lemmatizations in lemmatization_results.items():
    results[tagger_name] = {}
    for lemma_name, lemmatized_sentences in lemmatizations.items():
        results[tagger_name][lemma_name] = {}
        for vectorizer_name, vectorizer_func in vectorizers.items():
            print(f"Training {vectorizer_name} on {tagger_name} output with {lemma_name} lemmatization...")
            features = vectorizer_func(lemmatized_sentences)
            accuracy, precision, recall, f1 = train_and_evaluate(features, labels)
            results[tagger_name][lemma_name][vectorizer_name] = {
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1": f1
            }
            print(f"Results: Accuracy={accuracy}, Precision={precision}, Recall={recall}, F1={f1}")

print(results)

Training tfidf on unigram output with radix lemmatization...
Results: Accuracy=0.6109089470769717, Precision=0.6091201597536658, Recall=0.5234339744401167, F1=0.5338585217192766
Training bow on unigram output with radix lemmatization...
Results: Accuracy=0.6107902855786725, Precision=0.6113663717625579, Recall=0.526070220747978, F1=0.5376530178739494
Training ngrams on unigram output with radix lemmatization...
Results: Accuracy=0.4806186219444664, Precision=0.548328764891927, Recall=0.3704299127080421, F1=0.35936891670726995
Training tfidf on unigram output with morphological lemmatization...
Results: Accuracy=0.6156949608417056, Precision=0.6081125940161588, Recall=0.5291388294247683, F1=0.5397002183438828
Training bow on unigram output with morphological lemmatization...
Results: Accuracy=0.6150225456846768, Precision=0.6105218082609154, Recall=0.5313543817612736, F1=0.5429067829907537
Training ngrams on unigram output with morphological lemmatization...
Results: Accuracy=0.48651214