In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /Users/wjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Data Preprocessing
path = "RottenTomatoes/DataSet/train.tsv"
df = pd.read_csv(path, sep="\t") 

# remove punctuation
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
# remove numbers
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')
# make it all lower case
df['Phrase'] = df['Phrase'].str.lower()
# remove non-asci characters
df.Phrase.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Phrase'] = df['Phrase'].str.split()

df['Tokenized_text'] = df['Phrase'].apply(word_tokenize) 

df['Sentiment']=df['Sentiment'].astype(int) #convert the star_rating column to int
df['NNLabels'] = df['Sentiment'].div(4)

df= df[df['Sentiment']!=2]

#df['label']=np.where(df['Sentiment']>=4,1,0) #1-Positve,0-Negative
# convert to NumPy Array
train = df['Phrase'].to_numpy()


  df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
  df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')


In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train, Y_test = train_test_split(df['Phrase'], df['Sentiment'], test_size=0.2, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))

Train:  (61182,) (61182,) Test:  ((15296,), (15296,))


In [5]:
X_train,X_valid,Y_train, Y_valid = train_test_split(X_train,Y_train, test_size=0.5, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_valid.shape,Y_valid.shape))

Train:  (30591,) (30591,) Test:  ((30591,), (30591,))


In [6]:
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [7]:
countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
tf_x_train = tfidfvectorizer.fit_transform(X_train)
tf_x_valid = tfidfvectorizer.transform(X_valid)

# Linear SVC
- This is a model that draws hyperplanes to cluster the data. It is aware of the labels.

In [8]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

clf = LinearSVC(random_state=0)
clf.fit(tf_x_train,Y_train)
y_test_pred=clf.predict(tf_x_valid)

report=classification_report(Y_valid, y_test_pred,output_dict=True)
report

{'0': {'precision': 0.4579002079002079,
  'recall': 0.30944854232525465,
  'f1-score': 0.36931460909662545,
  'support': 2847},
 '1': {'precision': 0.690320874471086,
  'recall': 0.7175845322092916,
  'f1-score': 0.7036887271420227,
  'support': 10913},
 '3': {'precision': 0.6874407261888633,
  'recall': 0.7746564885496183,
  'f1-score': 0.7284473476419496,
  'support': 13100},
 '4': {'precision': 0.5048809058961343,
  'recall': 0.346555883141249,
  'f1-score': 0.41099809281627464,
  'support': 3731},
 'accuracy': 0.6587885325749403,
 'macro avg': {'precision': 0.5851356786140729,
  'recall': 0.5370613615563534,
  'f1-score': 0.5531121941742181,
  'support': 30591},
 'weighted avg': {'precision': 0.6448399126529842,
  'recall': 0.6587885325749403,
  'f1-score': 0.6474743522541285,
  'support': 30591}}

## Accuracy on Rotton Tomatoes:
- As we see, this has a nearly 65% accuracy on the subjectivity dataset. 

# Demo:

In [9]:
sentance = ["This is the absolute best thing ever"] # put your sentance here, it will predict the sentiment
sent = tfidfvectorizer.transform(sentance)
sent_prediction = clf.predict(sent)
print("The sentiment of the input is a " + sent_prediction[0].astype(str)+" out of 4")

The sentiment of the input is a 4 out of 4


# Cross-Testing

In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
path = "clean_IMDBdataset.csv"
df = pd.read_csv(path, sep=",")

[nltk_data] Downloading package punkt to /Users/wjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
df['Sentiment'] = df['Sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
X_train,X_test,Y_train, Y_test = train_test_split(df['Phrase'], df['Sentiment'], test_size=0.2, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test/Valid: ",(X_test.shape,Y_test.shape))
X_test,X_valid,Y_test, Y_valid = train_test_split(X_test,Y_test, test_size=0.5, random_state=30)
print("Test: " ,X_test.shape,Y_test.shape,"Valid: ",(X_valid.shape,Y_valid.shape))
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
countvectorizer = CountVectorizer(analyzer= 'word', stop_words='english')
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
tf_x_train = tfidfvectorizer.fit_transform(X_train)
tf_x_valid = tfidfvectorizer.transform(X_valid)

Train:  (40000,) (40000,) Test/Valid:  ((10000,), (10000,))
Test:  (5000,) (5000,) Valid:  ((5000,), (5000,))


In [12]:
print(tf_x_valid.shape)

(5000, 185967)


In [13]:
y_test_pred=clf.predict(tf_x_valid[:,:14320])
y_test_pred



array([3, 1, 3, ..., 1, 3, 3])

In [14]:
y_test_pred[y_test_pred <= 2] = 0
y_test_pred[y_test_pred > 2] = 1



In [15]:
report=classification_report(Y_valid, y_test_pred,output_dict=True)
report

{'0': {'precision': 0.46086261980830673,
  'recall': 0.22988047808764941,
  'f1-score': 0.3067517278043594,
  'support': 2510},
 '1': {'precision': 0.4842582710779082,
  'recall': 0.7289156626506024,
  'f1-score': 0.5819172811798653,
  'support': 2490},
 'accuracy': 0.4784,
 'macro avg': {'precision': 0.4725604454431075,
  'recall': 0.4793980703691259,
  'f1-score': 0.4443345044921123,
  'support': 5000},
 'weighted avg': {'precision': 0.4725136541405683,
  'recall': 0.4784,
  'f1-score': 0.44378417338536136,
  'support': 5000}}

# Results
- I was suprised that cross testing did not generate similar results, but I chalk it up to the differences in the datasets -- the rotten tomatoes included many more words and terms, where as the IMBD dataset was long phrases. I also had to remove a lot of features from the IMBD dataset to make it compadible with the original dataset. 
- I believe that training on both datasets would potentially generate better results. 

# Performance on IMBD dataset:

In [16]:
clf2 = LinearSVC(random_state=0)
clf2.fit(tf_x_train,Y_train)
y_test_pred2=clf2.predict(tf_x_valid)

report=classification_report(Y_valid, y_test_pred2,output_dict=True)
report

{'0': {'precision': 0.9147254575707154,
  'recall': 0.8760956175298805,
  'f1-score': 0.894993894993895,
  'support': 2510},
 '1': {'precision': 0.8802003081664098,
  'recall': 0.9176706827309237,
  'f1-score': 0.8985450255603618,
  'support': 2490},
 'accuracy': 0.8968,
 'macro avg': {'precision': 0.8974628828685627,
  'recall': 0.8968831501304021,
  'f1-score': 0.8967694602771283,
  'support': 5000},
 'weighted avg': {'precision': 0.8975319331673712,
  'recall': 0.8968,
  'f1-score': 0.8967623580159955,
  'support': 5000}}

In [17]:
clf1 = LinearSVC(random_state=0)
clf1.fit(tf_x_train[:,:14320],Y_train)
y_test_pred1=clf1.predict(tf_x_valid[:,:14320])

report=classification_report(Y_valid, y_test_pred1,output_dict=True)
report

{'0': {'precision': 0.7578558225508318,
  'recall': 0.6533864541832669,
  'f1-score': 0.7017543859649122,
  'support': 2510},
 '1': {'precision': 0.6932299012693935,
  'recall': 0.7895582329317269,
  'f1-score': 0.7382651145324821,
  'support': 2490},
 'accuracy': 0.7212,
 'macro avg': {'precision': 0.7255428619101127,
  'recall': 0.7214723435574969,
  'f1-score': 0.7200097502486972,
  'support': 5000},
 'weighted avg': {'precision': 0.7256721137526756,
  'recall': 0.7212,
  'f1-score': 0.719936728791562,
  'support': 5000}}

# Analysis:
- as we can see here, this is really good at polarity predicitions. That makes sense, as it only has to draw one "hyperplane" and there is likley a strong line that can be drawn.

# Cross-Testing on Rotten Tomatoes:

In [18]:
# Data Preprocessing
path = "RottenTomatoes/DataSet/train.tsv"
df = pd.read_csv(path, sep="\t") 

# remove punctuation
df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
# remove numbers
df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')
# make it all lower case
df['Phrase'] = df['Phrase'].str.lower()
# remove non-asci characters
df.Phrase.replace({r'[^\x00-\x7F]+':''}, regex=True, inplace=True)
#df['Phrase'] = df['Phrase'].str.split()

df['Tokenized_text'] = df['Phrase'].apply(word_tokenize) 

df['Sentiment']=df['Sentiment'].astype(int) #convert the star_rating column to int
df['NNLabels'] = df['Sentiment'].div(4)

df= df[df['Sentiment']!=2]

#df['label']=np.where(df['Sentiment']>=4,1,0) #1-Positve,0-Negative
# convert to NumPy Array
train = df['Phrase'].to_numpy()


  df['Phrase'] = df['Phrase'].str.replace(r'[^\w\s]+', '')
  df['Phrase'] = df['Phrase'].str.replace(r'\d+', '')


In [19]:

df['Sentiment'] = df['Sentiment'].apply(lambda x: 0 if x < 2 else 1)
X_train,X_test,Y_train, Y_test = train_test_split(df['Phrase'], df['Sentiment'], test_size=0.2, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_test.shape,Y_test.shape))
X_train,X_valid,Y_train, Y_valid = train_test_split(X_train,Y_train, test_size=0.5, random_state=30)
print("Train: " ,X_train.shape,Y_train.shape,"Test: ",(X_valid.shape,Y_valid.shape))
tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
tf_x_train = tfidfvectorizer.fit_transform(X_train)
tf_x_valid = tfidfvectorizer.transform(X_valid)

Train:  (61182,) (61182,) Test:  ((15296,), (15296,))
Train:  (30591,) (30591,) Test:  ((30591,), (30591,))


In [20]:
preds = clf1.predict(tf_x_valid)
report=classification_report(Y_valid, preds,output_dict=True)
report

{'0': {'precision': 0.4570247933884298,
  'recall': 0.28132267441860465,
  'f1-score': 0.34826810616284304,
  'support': 13760},
 '1': {'precision': 0.5529587270014918,
  'recall': 0.7267542035529677,
  'f1-score': 0.6280550421031013,
  'support': 16831},
 'accuracy': 0.5263966526102448,
 'macro avg': {'precision': 0.5049917601949608,
  'recall': 0.5040384389857862,
  'f1-score': 0.4881615741329721,
  'support': 30591},
 'weighted avg': {'precision': 0.5098071161840705,
  'recall': 0.5263966526102448,
  'f1-score': 0.5022053399509012,
  'support': 30591}}

# Analysis:
- Interestingly, the rotten tomatoes dataset cross-test isn't great. I imagine this has to do with the rotten tomatoes dataset not being as "cut and dry" as the IMDB dataset, that is it has a lot more neutral reviews in it. Even removing all the most "neutral" (#2s) that still leaves 1's and 3's which are not super negative or super positive.