# Project 2: Movie Review Sentiment Analysis

In [12]:
import numpy as np
import pandas as pd
from scipy.sparse import load_npz
from sklearn.model_selection import train_test_split

## Importing Features

In [13]:
# Load "their" feature matrix from a SciPy sparse .npz file.
# NOTE(review): `our_features` and `our_y` are used by later cells, but no
# visible cell defines them -- TODO load them here so the notebook survives
# Restart Kernel -> Run All.
their_data = load_npz('../data/origfeat.npz')

In [14]:
# Column 0 of the loaded matrix is used as the label vector; .toarray()
# densifies it (presumably to shape (n_samples, 1) -- TODO confirm).
their_y = their_data[:, 0].toarray()

In [15]:
# Every remaining column (index 1 onward) is kept as the feature matrix.
# No densifying call is made here, so the slice stays in the loaded format.
their_features = their_data[:, 1:]

## Dimensionality Reduction

In [52]:
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [25]:
# Reduce both feature sets before modelling.
#   * our features   -> PCA down to 3 components
#   * their features -> TruncatedSVD down to 250 components. TruncatedSVD does
#     not centre the data, so it can be applied to the sparse matrix directly.
#     (The `their_pca` name is kept for compatibility although it is an SVD.)
# Both estimators may use a randomized solver, so the seed is pinned (same
# value as the train/test splits) to make reruns reproducible.
# NOTE(review): `our_features` is not defined in any visible cell -- TODO
# confirm it is loaded before this cell runs.
our_pca = PCA(n_components=3, random_state=42)
their_pca = TruncatedSVD(n_components=250, random_state=42)

our_reduced_features = our_pca.fit_transform(our_features)
their_reduced_features = their_pca.fit_transform(their_features)

## Building Models

In [7]:
from linear_models.lda import LDAClassifier
from linear_models.SGD import SGDClassifier

In [43]:
# Model hyper-parameters can be tweaked in this cell.
# One classifier of each kind is created per feature set ("our" / "their").
our_lda, their_lda = LDAClassifier(), LDAClassifier()

# Author's note: with default parameters the SVM and logistic-regression
# models report that they do not "converge". They actually do converge, just
# not to a point where the algorithm stops under the default epsilon. Making
# epsilon more lenient results in faster training times.
our_svm, their_svm = SGDClassifier(), SGDClassifier()

# Same SGD classifier, switched to the 'log' loss for logistic regression.
our_log, their_log = SGDClassifier(loss='log'), SGDClassifier(loss='log')

In [41]:
# Hold out 33% of each reduced feature set for evaluation. Both splits use
# the same fixed seed so they are repeatable.
ox_train, ox_test, oy_train, oy_test = train_test_split(
    our_reduced_features, our_y, test_size=0.33, random_state=42
)

tx_train, tx_test, ty_train, ty_test = train_test_split(
    their_reduced_features, their_y, test_size=0.33, random_state=42
)

In [44]:

# Train the LDA models on each feature set's training split.
our_lda.fit(ox_train, oy_train)
their_lda.fit(tx_train, ty_train)

# The fit functions accept verbose=<int> for varying levels of verbosity:
#   verbose = 1:  gives training time per epoch and total training time
#   verbose >= 2: not really supported for high-dimensional data
our_svm.fit(ox_train, oy_train)
their_svm.fit(tx_train, ty_train)

# NOTE: the last call below is the cell's final expression, so whatever it
# returns is echoed as the cell output (an SGDClassifier repr in the saved run).
our_log.fit(ox_train, oy_train)
their_log.fit(tx_train, ty_train)

SGD did not converge after 1000 epochs. Increase max_iters for a better model.
SGD did not converge after 1000 epochs. Increase max_iters for a better model.


<linear_models.SGD.SGDClassifier at 0x117660c18>

In [47]:
def get_confusion_matrix(y_true, y_pred):
    """Cross-tabulate actual labels against predicted labels.

    Both inputs are flattened to plain 1-D arrays before tabulating, so
    column vectors of shape ``(n, 1)`` are accepted and pandas objects are
    paired by position rather than by index label.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, one per sample.
    y_pred : array-like
        Predicted labels, one per sample, in the same order as ``y_true``.

    Returns
    -------
    pandas.DataFrame
        Counts with rows named "Actual" and columns named "Predicted".

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of labels.
    """
    # np.asarray drops any pandas index; ravel collapses (n, 1) -> (n,).
    actual = np.asarray(y_true).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape[0] != predicted.shape[0]:
        raise ValueError(
            f"y_true has {actual.shape[0]} labels but y_pred has "
            f"{predicted.shape[0]}"
        )
    return pd.crosstab(
        pd.Series(actual, name="Actual"),
        pd.Series(predicted, name="Predicted"),
    )

In [48]:
import pandas as pd

## Aggregation of Metrics

In [None]:
# Evaluate both LDA models on their held-out test splits.
our_lda_score = our_lda.score(ox_test, oy_test)
our_predictions = our_lda.predict(ox_test)
# Labels are reshaped to the prediction array's shape before tabulating.
our_lda_confusion = get_confusion_matrix(
    oy_test.reshape(our_predictions.shape),
    our_predictions)
their_lda_score = their_lda.score(tx_test, ty_test)
their_predictions = their_lda.predict(tx_test)
# Reshape against *their* predictions: the two test splits are not guaranteed
# to have the same length, so `our_predictions.shape` is the wrong target.
their_lda_confusion = get_confusion_matrix(
    ty_test.reshape(their_predictions.shape),
    their_predictions)
print('================================')
print('* LDA Cross Validation Metrics *')
print('================================')
print(f'Accuracy w/ our Features: {our_lda_score}')
print(f'Accuracy w/ their Features: {their_lda_score}')
print(f'Confusion Matrix w/ our Features:\n{our_lda_confusion}')
print(f'Confusion Matrix w/ their Features:\n{their_lda_confusion}')

In [50]:
# Evaluate both SVM models on their held-out test splits.
our_svm_score = our_svm.score(ox_test, oy_test)
our_predictions = our_svm.predict(ox_test)
# Labels are reshaped to the prediction array's shape before tabulating.
our_svm_confusion = get_confusion_matrix(
    oy_test.reshape(our_predictions.shape),
    our_predictions,
)

their_svm_score = their_svm.score(tx_test, ty_test)
their_predictions = their_svm.predict(tx_test)
their_svm_confusion = get_confusion_matrix(
    ty_test.reshape(their_predictions.shape),
    their_predictions,
)

# One print call, newline-separated: same text as printing line by line.
print(
    '================================',
    '* SVM Cross Validation Metrics *',
    '================================',
    f'Accuracy w/ our Features: {our_svm_score}',
    f'Accuracy w/ their Features: {their_svm_score}',
    f'Confusion Matrix w/ our Features:\n{our_svm_confusion}',
    f'Confusion Matrix w/ their Features:\n{their_svm_confusion}',
    sep='\n',
)

* SVM Cross Validation Metrics *
Accuracy w/ their Features: 0.7244848484848485
Confusion Matrix w/ their Features:
Predicted     0     1
Actual               
0          2460  1673
1           600  3517


In [51]:
# Evaluate both logistic-regression models on their held-out test splits.
our_log_score = our_log.score(ox_test, oy_test)
our_predictions = our_log.predict(ox_test)
# Labels are reshaped to the prediction array's shape before tabulating.
our_log_confusion = get_confusion_matrix(
    oy_test.reshape(our_predictions.shape),
    our_predictions)
their_log_score = their_log.score(tx_test, ty_test)
their_predictions = their_log.predict(tx_test)
# Store the result under the logistic-regression name (it used to overwrite
# `their_svm_confusion`, leaving `their_log_confusion` undefined below) and
# reshape against *their* predictions rather than ours.
their_log_confusion = get_confusion_matrix(
    ty_test.reshape(their_predictions.shape),
    their_predictions)
# Banner corrected: this cell reports logistic regression, not LDA.
print('=' * 48)
print('* Logistic Regression Cross Validation Metrics *')
print('=' * 48)
print(f'Accuracy w/ our Features: {our_log_score}')
print(f'Accuracy w/ their Features: {their_log_score}')
print(f'Confusion Matrix w/ our Features:\n{our_log_confusion}')
print(f'Confusion Matrix w/ their Features:\n{their_log_confusion}')

NameError: name 'ox_test' is not defined