In [39]:
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import re

In [40]:
import pandas as pd
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/wjones/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/wjones/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [41]:
path = "RottenTomatoes/DataSet/train.tsv"
df = pd.read_csv(path, sep="\t") 

In [42]:
def clean_sentences(df):
    
    reviews = []
    for sent in tqdm(df['Phrase']):
        # removing non-alphabetical characters 
        text = re.sub("[^a-zA-Z]"," ",sent)
        
        # Now tokenizing the sentence : 
        words = word_tokenize(text.lower())
        
        #removing stop words :
        new_words = [ ele for ele in words if ele.lower() not in stopwords.words('english') ]
        
        # Lemmatizing each word to its lemma
        lem = WordNetLemmatizer()
        lem_words = [lem.lemmatize(i) for i in new_words]
        
        #finally
        reviews.append(lem_words)
        
    return(reviews)

In [43]:
train_sentences = clean_sentences(df)
#test_sentences = clean_sentences()

print(len(train_sentences))
#print(len(test_sentences))

100%|██████████| 156060/156060 [01:08<00:00, 2286.52it/s]

156060





In [44]:
from keras.utils import to_categorical

y_target = to_categorical(df['Sentiment'].values)


In [45]:
from sklearn.model_selection import train_test_split


In [46]:
X_train,X_val,y_train,y_val = train_test_split(df['Phrase'], df['Sentiment'],test_size = 0.2,stratify = y_target)

In [47]:
from tqdm import tqdm
unique_words = set()
len_max = -1

for sent in tqdm(X_train):
    unique_words.update(sent)
    if(len_max < len(sent)):
        len_max = len(sent)

print('Words in vocab : ' , len(list(unique_words)))
print('Max_length : ' , len_max)

100%|██████████| 124848/124848 [00:00<00:00, 1443744.51it/s]

Words in vocab :  80
Max_length :  283





In [48]:
vocab_size = len(list(unique_words))
embedding_dim = 300
max_length = len_max
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'

In [49]:
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

In [50]:
tokenizer = Tokenizer(num_words = vocab_size,
                      # filters = '#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      oov_token = oov_tok,
                      # lower = True,
                      char_level = False)

tokenizer.fit_on_texts(list(X_train))

# Training
X_train = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(X_train,
                        maxlen = max_length,
                        padding = padding_type,
                        truncating = trunc_type)

# Validation
X_val = tokenizer.texts_to_sequences(X_val)
X_val = pad_sequences(X_val,
                      maxlen = max_length,
                      padding = padding_type,
                      truncating = trunc_type)

# Testing
X_test = tokenizer.texts_to_sequences(df['Phrase'])
X_test = pad_sequences(X_test,
                       maxlen = max_length,
                       padding = padding_type,
                       truncating = trunc_type)

100%|██████████| 61182/61182 [00:00<00:00, 1019040.94it/s]

Words in vocab :  27
Max_length :  265





  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [51]:
from sklearn.cluster import KMeans


In [61]:
kmeans = KMeans(init="k-means++", n_clusters=4, random_state=0)
kmeans.fit(X_train)
preds = kmeans.predict(X_val)

In [62]:
from sklearn.metrics import classification_report
report=classification_report(y_val, preds,output_dict=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [63]:
report

{'0': {'precision': 0.05037593984962406,
  'recall': 0.09476661951909476,
  'f1-score': 0.06578301423662249,
  'support': 1414},
 '1': {'precision': 0.24616482340349624,
  'recall': 0.2529789184234647,
  'f1-score': 0.24952535937076212,
  'support': 5455},
 '2': {'precision': 0.4882758620689655,
  'recall': 0.08896707715506409,
  'f1-score': 0.15051020408163265,
  'support': 15916},
 '3': {'precision': 0.19949117030829094,
  'recall': 0.6071970847251746,
  'f1-score': 0.300315410033043,
  'support': 6586},
 '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1841},
 'accuracy': 0.2219979495066,
 'macro avg': {'precision': 0.19686155912607534,
  'recall': 0.20878193996455963,
  'f1-score': 0.15322679754441207,
  'support': 31212},
 'weighted avg': {'precision': 0.3363869075661124,
  'recall': 0.2219979495066,
  'f1-score': 0.1867094616217794,
  'support': 31212}}

In [65]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)
clf.fit(X_train,y_train)



LinearSVC(random_state=0)

In [66]:
y_test_pred=clf.predict(X_val)


In [67]:
report=classification_report(y_val, y_test_pred,output_dict=True)

In [68]:
report

{'0': {'precision': 0.10344827586206896,
  'recall': 0.006364922206506365,
  'f1-score': 0.011992005329780148,
  'support': 1414},
 '1': {'precision': 0.23849453978159127,
  'recall': 0.22419798350137488,
  'f1-score': 0.23112538977605593,
  'support': 5455},
 '2': {'precision': 0.5510287560845911,
  'recall': 0.8463810002513195,
  'f1-score': 0.6674925055124743,
  'support': 15916},
 '3': {'precision': 0.26106696935300794,
  'recall': 0.034922563012450655,
  'f1-score': 0.061604392661041917,
  'support': 6586},
 '4': {'precision': 0.08819133034379671,
  'recall': 0.03204780010863661,
  'f1-score': 0.04701195219123506,
  'support': 1841},
 'accuracy': 0.4803280789439959,
 'macro avg': {'precision': 0.2484459742850112,
  'recall': 0.2287828538160576,
  'f1-score': 0.20384524909411747,
  'support': 31212},
 'weighted avg': {'precision': 0.38764528251126934,
  'recall': 0.4803280789439959,
  'f1-score': 0.3970854782952365,
  'support': 31212}}