# Instructions/Notes
This notebook is divided into pre-processing and classification sections. To reproduce the results, run all cells in order from top to bottom.

Note: The "Results" section at the bottom of the notebook contains LaTeX table data & is not part of the code to run.

# Imports & Setup

## Imports

In [1]:
# Data handling
import pandas as pd
import numpy as np

# Misc
import matplotlib.pyplot as plt
import pickle
import copy

# Model(s)
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

# NLP
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Helper functions
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# Fetch the NLTK resources needed by the tokenizer passed to the TF-IDF vectorizer.
# Newer NLTK releases load the Punkt models from the 'punkt_tab' resource, and
# word_tokenize raises LookupError without it; the legacy 'punkt' resource is
# still downloaded so older NLTK releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Helper Functions

In [3]:
def return_predict_acc(model, X, y):
    '''
    Scores a fitted model's predictions against known labels

    Args:
    - model: fitted estimator exposing predict(X)
    - X: features to predict on
    - y: true labels for X

    Returns the result of accuracy_score(y, model.predict(X))
    '''
    return accuracy_score(y, model.predict(X))

In [4]:
def train_unsupervised_models(estimator, train_dict, test_dict, output_path=None):
    '''
    Trains an unsupervised classification model on multiple training/testing data sets

    Args:
    - estimator: model to be trained; it is fitted with fit(X) and scored through predict(X)
    - train_dict (dict): dictionary of training data
    - test_dict (dict): dictionary of testing data
    - output_path=None (str): location to save the best model to

    The training and testing dictionaries are of the form
    'data key': (X, y)
    where X corresponds to features to be trained on, and y corresponds to labels.
    Both dictionaries must hold the same keys in the same order.

    Labels are never passed to fit(); they are only used for scoring. Because the
    cluster ids are arbitrary, the reported accuracies are inverted (1 - accuracy)
    when the training accuracy falls below 0.5. This assumes binary 0/1 labels and
    two clusters. The decision is taken on the training split only and then applied
    to the test split, so the test labels never influence the cluster-to-label mapping.

    The model with the highest test accuracy is pickled to output_path (if given).
    The pickled model still emits raw cluster ids, which may be inverted relative
    to the labels.
    '''

    # Print which model we're training on
    model_type = estimator.__class__.__name__
    print(model_type)

    # zip() below would silently drop the extra entries of the longer dictionary
    assert len(train_dict) == len(test_dict), 'Data is mismatched for train/test split'

    # Variables for saving best model
    best_acc = 0.0
    best_estimator = None

    for (train_title, (X_train, y_train)), (test_title, (X_test, y_test)) in zip(train_dict.items(), test_dict.items()):
        # Ensure the correct data is being trained/tested together
        assert train_title == test_title, 'Data is mismatched for train/test split'

        # Train model and gather accuracies
        estimator.fit(X_train)
        acc_train = return_predict_acc(estimator, X_train, y_train)
        acc_test = return_predict_acc(estimator, X_test, y_test)

        # Cluster ids carry no meaning, so pick the cluster-to-label mapping that
        # fits the training data and apply that same mapping to the test data.
        # Flipping the test accuracy on its own would use the test labels to
        # choose the mapping and could never report a score below 0.5.
        if acc_train < 0.5:
            acc_train = 1 - acc_train
            acc_test = 1 - acc_test

        # Update best model (the first data set always provides a candidate)
        if best_estimator is None or acc_test > best_acc:
            best_acc = acc_test
            best_estimator = copy.deepcopy(estimator)

        # Print results
        print('\t{} train: {}'.format(train_title, acc_train))
        print('\t{} test: {}\n'.format(test_title, acc_test))

    # Save the best model (if required); nothing is written when no model was trained
    if output_path is not None and best_estimator is not None:
        with open(output_path, 'wb') as model_file:
            pickle.dump(best_estimator, model_file)

In [5]:
def train_supervised_models(estimator, train_dict, test_dict, output_path=None):
    '''
    Fits a supervised classifier on each data set in turn and reports its accuracy

    Args:
    - estimator: model to be trained; it is fitted with fit(X, y)
    - train_dict (dict): dictionary of training data
    - test_dict (dict): dictionary of testing data
    - output_path=None (str): location to save the best model to

    Both dictionaries map
    'data key': (X, y)
    where X holds the features and y the labels, and they are walked in parallel,
    so their keys must appear in the same order.

    The model with the highest test accuracy is pickled to output_path (if given).
    '''

    # Announce the model being trained
    print(estimator.__class__.__name__)

    # Best test score seen so far and a snapshot of the model that produced it
    top_score, top_model = 0.0, None

    paired_sets = zip(train_dict.items(), test_dict.items())
    for (train_title, train_xy), (test_title, test_xy) in paired_sets:
        X_train, y_train = train_xy
        X_test, y_test = test_xy

        # Both dictionaries must describe the same data set at this position
        assert train_title == test_title, 'Data is mismatched for train/test split'

        # Fit on the training split, then score both splits
        estimator.fit(X_train, y_train)
        acc_train = return_predict_acc(estimator, X_train, y_train)
        acc_test = return_predict_acc(estimator, X_test, y_test)

        # Snapshot the estimator, since the next iteration refits the same object
        if acc_test > top_score:
            top_score, top_model = acc_test, copy.deepcopy(estimator)

        # Report results
        print('\t{} train: {}'.format(train_title, acc_train))
        print('\t{} test: {}\n'.format(test_title, acc_test))

    # Nothing to save unless a path was requested
    if output_path is None:
        return

    with open(output_path, 'wb') as model_file:
        pickle.dump(top_model, model_file)

In [6]:
def train_models(estimator, train_dict, test_dict, is_supervised=False, output_path=None):
    '''
    Dispatches training to the supervised or unsupervised training routine

    Args:
    - estimator: model to be trained
    - train_dict (dict): dictionary of training data
    - test_dict (dict): dictionary of testing data
    - is_supervised (bool): True routes to train_supervised_models,
      False (the default) routes to train_unsupervised_models
    - output_path=None (str): location to save the best model to

    The training and testing dictionaries are of the form
    'data key': (X, y)
    where X corresponds to features to be trained on, and y corresponds to labels
    '''
    trainer = train_supervised_models if is_supervised else train_unsupervised_models
    trainer(estimator, train_dict, test_dict, output_path)

## Variable Configuration

In [7]:
# Seed passed to every train_test_split call so all feature sets are partitioned identically
RANDOM_STATE = 42

# Pre-processing

## Read, Edit, Partition Data

In [8]:
#from google.colab import drive
#drive.mount('/content/drive')

## TF-IDF

This cell reads in the full data set and computes TF-IDF scores for the article titles, keeping only the top 2000 words in the vocabulary. Rows with a missing title are dropped first so that every article in the analysis has a title.

In [9]:
# Word-level TF-IDF using NLTK's tokenizer, English stop words removed,
# vocabulary capped at 2000 terms
vectorizer = TfidfVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    stop_words='english',
    max_features=2000,
    sublinear_tf=True,
)

# Local copy of the labeled data; on Colab the same file was read from
# '/content/drive/MyDrive/Cogs 118b Final Project/df_labeled.csv'.
# Rows whose title is missing are dropped before vectorizing.
df_labeled = pd.read_csv('df_labeled.csv').dropna(subset=['title'])

# Dense TF-IDF matrix (one row per title) and the matching label vector
titles = df_labeled['title']
X_tfidf = vectorizer.fit_transform(titles).toarray()
y = df_labeled['label'].to_numpy()

## Add Sentiment

In [10]:
# Append the 'compound' column (the sentiment feature) as one extra column
# to the right of the TF-IDF features
compound_column = df_labeled['compound'].to_numpy()[:, np.newaxis]
X_combined = np.concatenate([X_tfidf, compound_column], axis=1)

## Apply PCA

In [11]:
# Project each feature matrix onto its first 100 principal components.
# Each matrix gets its own PCA object: refitting one shared object throws away
# the TF-IDF projection as soon as the combined matrix is fitted, so it could
# no longer be used to transform new TF-IDF data.
# random_state is set because scikit-learn may select a randomized SVD solver
# for inputs of this size, which would otherwise make reruns differ.
pca_tfidf = PCA(n_components=100, random_state=RANDOM_STATE)
X_tfidf_pca = pca_tfidf.fit_transform(X_tfidf)

pca_combined = PCA(n_components=100, random_state=RANDOM_STATE)
X_combined_pca = pca_combined.fit_transform(X_combined)

# Keep the original name bound as before: it ended this cell fitted on X_combined
pca = pca_combined

## Split data

In [12]:
def _split_80_20(features):
    '''
    Splits one feature matrix together with the labels y into train/test parts

    Returns (X_train, X_test, y_train, y_test). Every call shuffles with the
    same RANDOM_STATE and keeps 80% of the rows for training.
    '''
    return train_test_split(features, y, train_size=0.8, random_state=RANDOM_STATE, shuffle=True)


# TF-IDF
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = _split_80_20(X_tfidf)

# TF-IDF, PCA
(X_tfidf_pca_train,
 X_tfidf_pca_test,
 y_tfidf_pca_train,
 y_tfidf_pca_test) = _split_80_20(X_tfidf_pca)

# TF-IDF, Sentiment
(X_combined_train,
 X_combined_test,
 y_combined_train,
 y_combined_test) = _split_80_20(X_combined)

# TF-IDF, Sentiment, PCA
(X_combined_pca_train,
 X_combined_pca_test,
 y_combined_pca_train,
 y_combined_pca_test) = _split_80_20(X_combined_pca)

In [13]:
X_tfidf_train.shape, X_tfidf_test.shape

((35911, 2000), (8978, 2000))

In [14]:
# One key per feature set. Both dictionaries are built from the same key tuple
# because the training helpers walk them in parallel and assert matching keys.
feature_keys = ('tfidf', 'tfidf/pca', 'tfidf/sentiment', 'tfidf/sentiment/pca')

train_dict = dict(zip(feature_keys, [
    (X_tfidf_train, y_tfidf_train),
    (X_tfidf_pca_train, y_tfidf_pca_train),
    (X_combined_train, y_combined_train),
    (X_combined_pca_train, y_combined_pca_train),
]))

test_dict = dict(zip(feature_keys, [
    (X_tfidf_test, y_tfidf_test),
    (X_tfidf_pca_test, y_tfidf_pca_test),
    (X_combined_test, y_combined_test),
    (X_combined_pca_test, y_combined_pca_test),
]))

# Classify Data

## SVM

In [15]:
#svc = SVC()
#train_models(svc, train_dict, test_dict, is_supervised=True)

## Random Forest

In [16]:
#rf = RandomForestClassifier()
#train_models(rf, train_dict, test_dict, is_supervised=True)

## K-Means

In [17]:
# Two clusters, scored against the labels by the unsupervised training helper.
# The centroid initialisation is random, so it is seeded with RANDOM_STATE to
# make the reported accuracies and the pickled model reproducible across runs.
km = KMeans(n_clusters=2, random_state=RANDOM_STATE)
train_models(km, train_dict, test_dict, output_path='kmeans.pkl')

KMeans
	tfidf train: 0.6063044749519646
	tfidf test: 0.6041434617955002

	tfidf/pca train: 0.6063044749519646
	tfidf/pca test: 0.6041434617955002

	tfidf/sentiment train: 0.5957227590431902
	tfidf/sentiment test: 0.5975718422811317

	tfidf/sentiment/pca train: 0.5957227590431902
	tfidf/sentiment/pca test: 0.5975718422811316



## Mixture of Gaussian

In [18]:
#mog = GaussianMixture(n_components=2)
#train_models(mog, train_dict, test_dict, output_path='mog.pkl')

# Results

```
% Please add the following required packages to your document preamble:
% \usepackage{booktabs}

\begin{table}[]
\begin{tabular}{@{}ll@{}}
\toprule
Model   & Performance \\ \midrule
K-Means & 76.92\%    \\
MOG     & --         \\
SVM     & 95.17\%    \\
RF      & 94.16\%    \\ \bottomrule
\end{tabular}
\end{table}
```